Merge branch 'master' of github.com:apple/foundationdb into refactor-fdbcli-2

Chaoguang Lin 2021-06-04 22:57:07 +00:00
commit 11eed5bc71
136 changed files with 4469 additions and 1039 deletions

.gitignore

@ -7,7 +7,7 @@ bindings/java/foundationdb-client*.jar
bindings/java/foundationdb-tests*.jar
bindings/java/fdb-java-*-sources.jar
packaging/msi/FDBInstaller.msi
builds/
# Generated source, build, and packaging files
*.g.cpp
*.g.h


@ -263,13 +263,15 @@ TEST_CASE("fdb_future_set_callback") {
&context));
fdb_error_t err = wait_future(f1);
context.event.wait(); // Wait until callback is called
if (err) {
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
}
context.event.wait();
break;
}
}
@ -515,10 +517,10 @@ TEST_CASE("write system key") {
fdb::Transaction tr(db);
std::string syskey("\xff\x02");
fdb_check(tr.set_option(FDB_TR_OPTION_ACCESS_SYSTEM_KEYS, nullptr, 0));
tr.set(syskey, "bar");
while (1) {
fdb_check(tr.set_option(FDB_TR_OPTION_ACCESS_SYSTEM_KEYS, nullptr, 0));
tr.set(syskey, "bar");
fdb::EmptyFuture f1 = tr.commit();
fdb_error_t err = wait_future(f1);
@ -949,16 +951,25 @@ TEST_CASE("fdb_transaction_clear") {
}
TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_ADD") {
insert_data(db, create_data({ { "foo", "a" } }));
insert_data(db, create_data({ { "foo", "\x00" } }));
fdb::Transaction tr(db);
int8_t param = 1;
int potentialCommitCount = 0;
while (1) {
tr.atomic_op(key("foo"), (const uint8_t*)&param, sizeof(param), FDB_MUTATION_TYPE_ADD);
if (potentialCommitCount + 1 == 256) {
// Trying to commit again might overflow the one unsigned byte we're looking at
break;
}
++potentialCommitCount;
fdb::EmptyFuture f1 = tr.commit();
fdb_error_t err = wait_future(f1);
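// An error matching the RETRYABLE_NOT_COMMITTED predicate guarantees this
// commit did not happen, so the tentative count is rolled back below. Other
// errors (e.g. commit_unknown_result) may or may not have committed, which is
// why potentialCommitCount is only an upper bound on successful commits.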
if (err) {
if (fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE_NOT_COMMITTED, err)) {
--potentialCommitCount;
}
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
@ -969,7 +980,8 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_ADD") {
auto value = get_value(key("foo"), /* snapshot */ false, {});
REQUIRE(value.has_value());
CHECK(value->size() == 1);
CHECK(value->data()[0] == 'b'); // incrementing 'a' results in 'b'
CHECK(uint8_t(value->data()[0]) > 0);
CHECK(uint8_t(value->data()[0]) <= potentialCommitCount);
}
TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_BIT_AND") {
@ -1139,14 +1151,19 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_BIT_XOR") {
fdb::Transaction tr(db);
char param[] = { 'a', 'd' };
int potentialCommitCount = 0;
while (1) {
tr.atomic_op(key("foo"), (const uint8_t*)"b", 1, FDB_MUTATION_TYPE_BIT_XOR);
tr.atomic_op(key("bar"), (const uint8_t*)param, 2, FDB_MUTATION_TYPE_BIT_XOR);
tr.atomic_op(key("baz"), (const uint8_t*)"d", 1, FDB_MUTATION_TYPE_BIT_XOR);
++potentialCommitCount;
fdb::EmptyFuture f1 = tr.commit();
fdb_error_t err = wait_future(f1);
if (err) {
if (fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE_NOT_COMMITTED, err)) {
--potentialCommitCount;
}
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
@ -1154,6 +1171,11 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_BIT_XOR") {
break;
}
if (potentialCommitCount != 1) {
MESSAGE("Transaction may not have committed exactly once. Suppressing assertions");
return;
}
auto value = get_value(key("foo"), /* snapshot */ false, {});
REQUIRE(value.has_value());
CHECK(value->size() == 1);
@ -1204,13 +1226,18 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_APPEND_IF_FITS") {
insert_data(db, create_data({ { "foo", "f" } }));
fdb::Transaction tr(db);
int potentialCommitCount = 0;
while (1) {
tr.atomic_op(key("foo"), (const uint8_t*)"db", 2, FDB_MUTATION_TYPE_APPEND_IF_FITS);
tr.atomic_op(key("bar"), (const uint8_t*)"foundation", 10, FDB_MUTATION_TYPE_APPEND_IF_FITS);
++potentialCommitCount;
fdb::EmptyFuture f1 = tr.commit();
fdb_error_t err = wait_future(f1);
if (err) {
if (fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE_NOT_COMMITTED, err)) {
--potentialCommitCount;
}
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
@ -1218,13 +1245,18 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_APPEND_IF_FITS") {
break;
}
auto value = get_value(key("foo"), /* snapshot */ false, {});
REQUIRE(value.has_value());
CHECK(value->compare("fdb") == 0);
auto value_foo = get_value(key("foo"), /* snapshot */ false, {});
REQUIRE(value_foo.has_value());
value = get_value(key("bar"), /* snapshot */ false, {});
REQUIRE(value.has_value());
CHECK(value->compare("foundation") == 0);
auto value_bar = get_value(key("bar"), /* snapshot */ false, {});
REQUIRE(value_bar.has_value());
if (potentialCommitCount != 1) {
MESSAGE("Transaction may not have committed exactly once. Suppressing assertions");
} else {
CHECK(value_foo.value() == "fdb");
CHECK(value_bar.value() == "foundation");
}
}
TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_MAX") {
@ -1576,7 +1608,7 @@ TEST_CASE("fdb_transaction_watch max watches") {
fdb_check(f1.set_callback(
+[](FDBFuture* f, void* param) {
fdb_error_t err = fdb_future_get_error(f);
if (err != 1101) { // operation_cancelled
if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) {
CHECK(err == 1032); // too_many_watches
}
auto* event = static_cast<std::shared_ptr<FdbEvent>*>(param);
@ -1587,7 +1619,7 @@ TEST_CASE("fdb_transaction_watch max watches") {
fdb_check(f2.set_callback(
+[](FDBFuture* f, void* param) {
fdb_error_t err = fdb_future_get_error(f);
if (err != 1101) { // operation_cancelled
if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) {
CHECK(err == 1032); // too_many_watches
}
auto* event = static_cast<std::shared_ptr<FdbEvent>*>(param);
@ -1598,7 +1630,7 @@ TEST_CASE("fdb_transaction_watch max watches") {
fdb_check(f3.set_callback(
+[](FDBFuture* f, void* param) {
fdb_error_t err = fdb_future_get_error(f);
if (err != 1101) { // operation_cancelled
if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) {
CHECK(err == 1032); // too_many_watches
}
auto* event = static_cast<std::shared_ptr<FdbEvent>*>(param);
@ -1609,7 +1641,7 @@ TEST_CASE("fdb_transaction_watch max watches") {
fdb_check(f4.set_callback(
+[](FDBFuture* f, void* param) {
fdb_error_t err = fdb_future_get_error(f);
if (err != 1101) { // operation_cancelled
if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) {
CHECK(err == 1032); // too_many_watches
}
auto* event = static_cast<std::shared_ptr<FdbEvent>*>(param);
@ -1671,7 +1703,7 @@ TEST_CASE("fdb_transaction_cancel") {
// ... until the transaction has been reset.
tr.reset();
fdb::ValueFuture f2 = tr.get("foo", /* snapshot */ false);
fdb_check(wait_future(f2));
CHECK(wait_future(f2) != 1025); // transaction_cancelled
}
TEST_CASE("fdb_transaction_add_conflict_range") {
@ -2146,22 +2178,29 @@ TEST_CASE("monitor_network_busyness") {
}
int main(int argc, char** argv) {
if (argc != 3 && argc != 4) {
if (argc < 3) {
std::cout << "Unit tests for the FoundationDB C API.\n"
<< "Usage: fdb_c_unit_tests /path/to/cluster_file key_prefix [externalClient]" << std::endl;
<< "Usage: fdb_c_unit_tests /path/to/cluster_file key_prefix [externalClient] [doctest args]"
<< std::endl;
return 1;
}
fdb_check(fdb_select_api_version(710));
if (argc == 4) {
if (argc >= 4) {
std::string externalClientLibrary = argv[3];
fdb_check(fdb_network_set_option(
FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT, reinterpret_cast<const uint8_t*>(""), 0));
fdb_check(fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_LIBRARY,
reinterpret_cast<const uint8_t*>(externalClientLibrary.c_str()),
externalClientLibrary.size()));
if (externalClientLibrary.substr(0, 2) != "--") {
fdb_check(fdb_network_set_option(
FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT, reinterpret_cast<const uint8_t*>(""), 0));
fdb_check(fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_LIBRARY,
reinterpret_cast<const uint8_t*>(externalClientLibrary.c_str()),
externalClientLibrary.size()));
}
}
/* fdb_check(fdb_network_set_option( */
/* FDBNetworkOption::FDB_NET_OPTION_CLIENT_BUGGIFY_ENABLE, reinterpret_cast<const uint8_t*>(""), 0)); */
doctest::Context context;
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
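// For illustration (paths and doctest filter below are hypothetical): argv[3]
// is treated as an external client library path unless it begins with "--",
// and trailing arguments are forwarded to doctest via applyCommandLine, e.g.
//   fdb_c_unit_tests ./fdb.cluster prefix ./libfdb_c.so --test-case='fdb_transaction*'
//   fdb_c_unit_tests ./fdb.cluster prefix --test-case='fdb_transaction*'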


@ -74,3 +74,12 @@ add_custom_command(OUTPUT ${package_file}
add_custom_target(python_package DEPENDS ${package_file})
add_dependencies(python_package python_binding)
add_dependencies(packages python_package)
if (NOT WIN32 AND NOT OPEN_FOR_IDE)
add_fdbclient_test(
NAME fdbcli_tests
COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py
${CMAKE_BINARY_DIR}/bin/fdbcli
@CLUSTER_FILE@
)
endif()


@ -0,0 +1,93 @@
#!/usr/bin/env python3

import sys
import subprocess
import logging
import functools


def enable_logging(level=logging.ERROR):
    """Enable logging in the function with the specified logging level

    Args:
        level (logging.<level>, optional): logging level for the decorated function. Defaults to logging.ERROR.
    """
    def func_decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # initialize logger
            logger = logging.getLogger(func.__name__)
            logger.setLevel(level)
            # set logging format
            handler = logging.StreamHandler()
            handler_format = logging.Formatter(
                '[%(asctime)s] - %(filename)s:%(lineno)d - %(levelname)s - %(name)s - %(message)s')
            handler.setFormatter(handler_format)
            handler.setLevel(level)
            logger.addHandler(handler)
            # pass the logger to the decorated function
            result = func(logger, *args, **kwargs)
            return result
        return wrapper
    return func_decorator


def run_fdbcli_command(*args):
    """Run the fdbcli statement: fdbcli --exec '<arg1> <arg2> ... <argN>'.

    Returns:
        string: Console output from fdbcli
    """
    commands = command_template + ["{}".format(' '.join(args))]
    return subprocess.run(commands, stdout=subprocess.PIPE).stdout.decode('utf-8').strip()


@enable_logging()
def advanceversion(logger):
    # get current read version
    version1 = int(run_fdbcli_command('getversion'))
    logger.debug("Read version: {}".format(version1))
    # advance version to a much larger value compared to the current version
    version2 = version1 * 10000
    logger.debug("Advanced to version: " + str(version2))
    run_fdbcli_command('advanceversion', str(version2))
    # after running the advanceversion command,
    # check the read version is advanced to the specified value
    version3 = int(run_fdbcli_command('getversion'))
    logger.debug("Read version: {}".format(version3))
    assert version3 >= version2
    # advance version to a smaller value compared to the current version
    # this should be a no-op
    run_fdbcli_command('advanceversion', str(version1))
    # get the current version to make sure the version did not decrease
    version4 = int(run_fdbcli_command('getversion'))
    logger.debug("Read version: {}".format(version4))
    assert version4 >= version3


@enable_logging()
def maintenance(logger):
    # expected fdbcli output when running 'maintenance' while there's no ongoing maintenance
    no_maintenance_output = 'No ongoing maintenance.'
    output1 = run_fdbcli_command('maintenance')
    assert output1 == no_maintenance_output
    # set maintenance on a fake zone id for 10 seconds
    run_fdbcli_command('maintenance', 'on', 'fake_zone_id', '10')
    # show current maintenance status
    output2 = run_fdbcli_command('maintenance')
    logger.debug("Maintenance status: " + output2)
    items = output2.split(' ')
    # make sure this specific zone id is under maintenance
    assert 'fake_zone_id' in items
    logger.debug("Remaining time (seconds): " + items[-2])
    assert 0 < int(items[-2]) < 10
    # turn off maintenance
    run_fdbcli_command('maintenance', 'off')
    # check maintenance status
    output3 = run_fdbcli_command('maintenance')
    assert output3 == no_maintenance_output


if __name__ == '__main__':
    # fdbcli_tests.py <path_to_fdbcli_binary> <path_to_fdb_cluster_file>
    assert len(sys.argv) == 3, "Please pass arguments: <path_to_fdbcli_binary> <path_to_fdb_cluster_file>"
    # shell command template
    command_template = [sys.argv[1], '-C', sys.argv[2], '--exec']
    # tests for fdbcli commands
    # assertions will fail if fdbcli does not work as expected
    advanceversion()
    maintenance()


@ -717,7 +717,7 @@ namespace SummarizeTest
delegate IEnumerable<Magnesium.Event> parseDelegate(System.IO.Stream stream, string file,
bool keepOriginalElement = false, double startTime = -1, double endTime = Double.MaxValue,
double samplingFactor = 1.0);
double samplingFactor = 1.0, Action<string> nonFatalErrorMessage = null);
static int Summarize(string[] traceFiles, string summaryFileName,
string errorFileName, bool? killed, List<string> outputErrors, int? exitCode, long? peakMemory,
@ -750,12 +750,14 @@ namespace SummarizeTest
{
try
{
// Use Action to set this because IEnumerables with yield can't have an out variable
string nonFatalParseError = null;
parseDelegate parse;
if (traceFileName.EndsWith(".json"))
parse = Magnesium.JsonParser.Parse;
else
parse = Magnesium.XmlParser.Parse;
foreach (var ev in parse(traceFile, traceFileName))
foreach (var ev in parse(traceFile, traceFileName, nonFatalErrorMessage: (x) => { nonFatalParseError = x; }))
{
Magnesium.Severity newSeverity;
if (severityMap.TryGetValue(new KeyValuePair<string, Magnesium.Severity>(ev.Type, ev.Severity), out newSeverity))
@ -876,6 +878,11 @@ namespace SummarizeTest
if (ev.Type == "StderrSeverity")
stderrSeverity = int.Parse(ev.Details.NewSeverity);
}
if (nonFatalParseError != null) {
xout.Add(new XElement("NonFatalParseError",
new XAttribute("Severity", (int)Magnesium.Severity.SevWarnAlways),
new XAttribute("ErrorMessage", nonFatalParseError)));
}
}
catch (Exception e)


@ -1,4 +1,4 @@
/*
* JsonParser.cs
*
* This source file is part of the FoundationDB open source project
@ -34,9 +34,10 @@ namespace Magnesium
{
static Random r = new Random();
// dummy parameter nonFatalErrorMessage to match the XML parser's signature
public static IEnumerable<Event> Parse(System.IO.Stream stream, string file,
bool keepOriginalElement = false, double startTime = -1, double endTime = Double.MaxValue,
double samplingFactor = 1.0)
double samplingFactor = 1.0, Action<string> nonFatalErrorMessage = null)
{
using (var reader = new System.IO.StreamReader(stream))
{


@ -33,14 +33,29 @@ namespace Magnesium
public static IEnumerable<Event> Parse(System.IO.Stream stream, string file,
bool keepOriginalElement = false, double startTime = -1, double endTime = Double.MaxValue,
double samplingFactor = 1.0)
double samplingFactor = 1.0, Action<string> nonFatalErrorMessage = null)
{
using (var reader = XmlReader.Create(stream))
{
reader.ReadToDescendant("Trace");
reader.Read();
foreach (var xev in StreamElements(reader))
// foreach (var xev in StreamElements(reader))
// need to be able to catch and save non-fatal exceptions in StreamElements, so use explicit iterator instead of foreach
var iter = StreamElements(reader).GetEnumerator();
while (true)
{
try {
if (!iter.MoveNext()) {
break;
}
} catch (Exception e) {
if (nonFatalErrorMessage != null) {
nonFatalErrorMessage(e.Message);
}
break;
}
var xev = iter.Current;
Event ev = null;
try
{
@ -165,28 +180,20 @@ namespace Magnesium
}
}
// throws exceptions if xml is invalid
private static IEnumerable<XElement> StreamElements(this XmlReader reader)
{
while (!reader.EOF)
{
if (reader.NodeType == XmlNodeType.Element)
{
XElement node = null;
try
{
node = XElement.ReadFrom(reader) as XElement;
}
catch (Exception) { break; }
XElement node = XElement.ReadFrom(reader) as XElement;
if (node != null)
yield return node;
}
else
{
try
{
reader.Read();
}
catch (Exception) { break; }
}
}
}


@ -49,7 +49,7 @@ master_doc = 'index'
# General information about the project.
project = u'FoundationDB'
copyright = u'2013-2018 Apple, Inc and the FoundationDB project authors'
copyright = u'2013-2021 Apple, Inc and the FoundationDB project authors'
# Load the version information from 'versions.target'
import xml.etree.ElementTree as ET


@ -971,7 +971,7 @@ For example, you can change a process type or update coordinators by manipulatin
#. ``\xff\xff/configuration/process/class_type/<address> := <class_type>`` Read/write. Reading keys in the range will retrieve processes' class types. Setting keys in the range will update processes' class types. The process matching ``<address>`` will be assigned to the given class type if the commit is successful. The valid class types are ``storage``, ``transaction``, ``resolution``, etc. A full list of class types can be found via the ``fdbcli`` command ``help setclass``. Clearing keys in the range is forbidden. Instead, you can set the type to ``default``, which will clear the assigned class type if one exists. For more details, see the help text of the ``fdbcli`` command ``setclass``.
#. ``\xff\xff/configuration/process/class_source/<address> := <class_source>`` Read-only. Reading keys in the range will retrieve processes' class source. The class source is one of ``command_line``, ``configure_auto``, ``set_class`` and ``invalid``, indicating the source that the process's class type comes from.
#. ``\xff\xff/configuration/coordinators/processes := <ip:port>,<ip:port>,...,<ip:port>`` Read/write. A single key, if read, will return a comma delimited string of coordinators's network addresses. Thus to provide a new set of coordinators, set the key with a correctly formatted string of the new coordinators' network addresses. As there's always the need to have coordinators, clearing the key is forbidden and a transaction will fail with the ``special_keys_api_failure`` error if the clear is committed. For more details, see the help text of the ``fdbcli`` command ``coordinators``.
#. ``\xff\xff/configuration/coordinators/processes := <ip:port>,<ip:port>,...,<ip:port>`` Read/write. A single key, if read, will return a comma delimited string of coordinators' network addresses. Thus to provide a new set of coordinators, set the key with a correctly formatted string of the new coordinators' network addresses. As there's always the need to have coordinators, clearing the key is forbidden and a transaction will fail with the ``special_keys_api_failure`` error if the clear is committed. For more details, see the help text of the ``fdbcli`` command ``coordinators``.
#. ``\xff\xff/configuration/coordinators/cluster_description := <new_description>`` Read/write. A single key, if read, will return the cluster description. Thus modifying the key will update the cluster description. The new description needs to match ``[A-Za-z0-9_]+``, otherwise, the ``special_keys_api_failure`` error will be thrown. In addition, clearing the key is meaningless and thus forbidden. For more details, see the help text of the ``fdbcli`` command ``coordinators``.
The ``<address>`` here is the network address of the corresponding process. Thus the general form is ``ip:port``.
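
A minimal sketch using the Python bindings (assuming a client that exposes the ``special_key_space_enable_writes`` transaction option; the address and class type are example values)::

    import fdb

    fdb.api_version(700)
    db = fdb.open()

    @fdb.transactional
    def set_process_class(tr, address, class_type):
        # Writes to the \xff\xff keyspace must be explicitly enabled.
        tr.options.set_special_key_space_enable_writes()
        tr[b'\xff\xff/configuration/process/class_type/' + address] = class_type

    set_process_class(db, b'127.0.0.1:4500', b'storage')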


@ -121,6 +121,16 @@
"counter":0,
"roughness":0.0
},
"fetched_versions":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"fetches_from_logs":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"grv_latency_statistics":{ // GRV Latency metrics are grouped according to priority (currently batch or default).
"default":{
"count":0,
@ -604,6 +614,10 @@
"data_distribution_disabled_for_rebalance":true,
"data_distribution_disabled":true,
"active_primary_dc":"pv",
"bounce_impact":{
"can_clean_bounce":true,
"reason":""
},
"configuration":{
"log_anti_quorum":0,
"log_replicas":2,
@ -668,6 +682,16 @@
"ssd-rocksdb-experimental",
"memory"
]},
"tss_count":1,
"tss_storage_engine":{
"$enum":[
"ssd",
"ssd-1",
"ssd-2",
"ssd-redwood-experimental",
"ssd-rocksdb-experimental",
"memory"
]},
"coordinators_count":1,
"excluded_servers":[
{
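
As an illustrative sketch of consuming the fields added above (the cluster file path is an example; fdbcli's "status json" emits a document following this schema):

    import json
    import subprocess

    # Fetch machine-readable status through fdbcli.
    raw = subprocess.run(['fdbcli', '-C', 'fdb.cluster', '--exec', 'status json'],
                         stdout=subprocess.PIPE).stdout
    status = json.loads(raw)
    # bounce_impact reports whether bouncing the cluster would be clean, and if not, why.
    bounce = status['cluster'].get('bounce_impact', {})
    print(bounce.get('can_clean_bounce'), bounce.get('reason'))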


@ -3,16 +3,29 @@ Release Notes
#############
6.3.14
======
* Fixed the ``fdbbackup start`` command, which automatically configures the database with backup workers, so that it only does so when using partitioned logs. `(PR #4863) <https://github.com/apple/foundationdb/pull/4863>`_
* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) <https://github.com/apple/foundationdb/pull/4770>`_
* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) <https://github.com/apple/foundationdb/pull/4770>`_
* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) <https://github.com/apple/foundationdb/pull/4770>`_
* Added the ``bypass_unreadable`` transaction option which allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. `(PR #4774) <https://github.com/apple/foundationdb/pull/4774>`_
* Fix several packaging issues. The osx package should now install successfully, and the structure of the RPM and DEB packages should match that of 6.2. `(PR #4810) <https://github.com/apple/foundationdb/pull/4810>`_
* Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. `(PR #4824) <https://github.com/apple/foundationdb/pull/4824>`_
6.3.13
======
* Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4736) <https://github.com/apple/foundationdb/pull/4736>`_
* The multi-version client now requires at most two client connections with version 6.2 or larger, regardless of how many external clients are configured. Clients older than 6.2 will continue to create an additional connection each. `(PR #4667) <https://github.com/apple/foundationdb/pull/4667>`_
* Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. `(PR #4824) <https://github.com/apple/foundationdb/pull/4824>`_
6.3.12
======
* Change the default for ``--knob_tls_server_handshake_threads`` to 64. The previous default was 1000. This avoids starting 1000 threads by default, but may adversely affect recovery time for large clusters using TLS. Users with large TLS clusters should consider explicitly setting this knob in their foundationdb.conf file. `(PR #4421) <https://github.com/apple/foundationdb/pull/4421>`_
* Fix accounting error that could cause commits to incorrectly fail with ``proxy_memory_limit_exceeded``. `(PR #4526) <https://github.com/apple/foundationdb/pull/4526>`_
* As an optimization, partial restore using target key ranges now filters backup log data prior to loading it into the database. `(PR #4554) <https://github.com/apple/foundationdb/pull/4554>`_
* Fix fault tolerance calculation when there are no tLogs in LogSet. `(PR #4454) <https://github.com/apple/foundationdb/pull/4454>`_
* Change client's ``iteration_progression`` size defaults from 256 to 4096 bytes for better performance. `(PR #4416) <https://github.com/apple/foundationdb/pull/4416>`_
* Add the ability to instrument java driver actions, such as ``FDBTransaction`` and ``RangeQuery``. `(PR #4385) <https://github.com/apple/foundationdb/pull/4385>`_
6.3.11
======


@ -31,7 +31,9 @@ Fixes
Status
------
* Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4735) <https://github.com/apple/foundationdb/pull/4735>`_
* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) <https://github.com/apple/foundationdb/pull/4770>`_
* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) <https://github.com/apple/foundationdb/pull/4770>`_
* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) <https://github.com/apple/foundationdb/pull/4770>`_
Bindings
--------


@ -496,11 +496,15 @@ void initHelp() {
helpMap["configure"] = CommandHelp(
"configure [new] "
"<single|double|triple|three_data_hall|three_datacenter|ssd|memory|memory-radixtree-beta|proxies=<PROXIES>|"
"commit_proxies=<COMMIT_PROXIES>|grv_proxies=<GRV_PROXIES>|logs=<LOGS>|resolvers=<RESOLVERS>>*",
"commit_proxies=<COMMIT_PROXIES>|grv_proxies=<GRV_PROXIES>|logs=<LOGS>|resolvers=<RESOLVERS>>*|"
"perpetual_storage_wiggle=<WIGGLE_SPEED>",
"change the database configuration",
"The `new' option, if present, initializes a new database with the given configuration rather than changing "
"the configuration of an existing one. When used, both a redundancy mode and a storage engine must be "
"specified.\n\nRedundancy mode:\n single - one copy of the data. Not fault tolerant.\n double - two copies "
"specified.\n\ntss: when enabled, configures the testing storage server for the cluster instead. "
"When used with new to set up tss for the first time, it requires both a count and a storage engine. "
"To disable the testing storage server, run \"configure tss count=0\".\n\n"
"Redundancy mode:\n single - one copy of the data. Not fault tolerant.\n double - two copies "
"of data (survive one failure).\n triple - three copies of data (survive two failures).\n three_data_hall - "
"See the Admin Guide.\n three_datacenter - See the Admin Guide.\n\nStorage engine:\n ssd - B-Tree storage "
"engine optimized for solid state disks.\n memory - Durable in-memory storage engine for small "
@ -517,8 +521,11 @@ void initHelp() {
"1, or set to -1 which restores the number of GRV proxies to the default value.\n\nlogs=<LOGS>: Sets the "
"desired number of log servers in the cluster. Must be at least 1, or set to -1 which restores the number of "
"logs to the default value.\n\nresolvers=<RESOLVERS>: Sets the desired number of resolvers in the cluster. "
"Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\nSee the "
"FoundationDB Administration Guide for more information.");
"Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\n"
"perpetual_storage_wiggle=<WIGGLE_SPEED>: Sets the wiggle speed, i.e., the number of processes that the Data "
"Distributor should wiggle at a time. Currently, only 0 and 1 are supported; the value 0 disables the "
"perpetual storage wiggle.\n\n"
"See the FoundationDB Administration Guide for more information.");
helpMap["fileconfigure"] = CommandHelp(
"fileconfigure [new] <FILENAME>",
"change the database configuration from a file",
@ -1101,6 +1108,17 @@ void printStatus(StatusObjectReader statusObj,
if (statusObjConfig.get("log_routers", intVal))
outputString += format("\n Desired Log Routers - %d", intVal);
if (statusObjConfig.get("tss_count", intVal) && intVal > 0) {
int activeTss = 0;
if (statusObjCluster.has("active_tss_count")) {
statusObjCluster.get("active_tss_count", activeTss);
}
outputString += format("\n TSS - %d/%d", activeTss, intVal);
if (statusObjConfig.get("tss_storage_engine", strVal))
outputString += format("\n TSS Storage Engine - %s", strVal.c_str());
}
outputString += "\n Usable Regions - ";
if (statusObjConfig.get("usable_regions", intVal)) {
outputString += std::to_string(intVal);
@ -2743,6 +2761,7 @@ void configureGenerator(const char* text, const char* line, std::vector<std::str
"grv_proxies=",
"logs=",
"resolvers=",
"perpetual_storage_wiggle=",
nullptr };
arrayGenerator(text, line, opts, lc);
}
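// Illustrative fdbcli invocations of the new options (count and engine are
// example values; see the configure help text above):
//   configure tss count=1 ssd     (a new TSS setup needs a count and a storage engine)
//   configure tss count=0         (disable the testing storage server)
//   configure perpetual_storage_wiggle=1
//   configure perpetual_storage_wiggle=0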


@ -404,8 +404,14 @@ ACTOR Future<Void> readCommitted(Database cx,
state RangeResult values = wait(tr.getRange(begin, end, limits));
// When this buggify line is enabled, if there is more than one result, use half of the results
// Copy the data instead of messing with the results directly to avoid TSS issues.
if (values.size() > 1 && BUGGIFY) {
values.resize(values.arena(), values.size() / 2);
RangeResult copy;
// only copy first half of values into copy
for (int i = 0; i < values.size() / 2; i++) {
copy.push_back_deep(copy.arena(), values[i]);
}
values = copy;
values.more = true;
// Half of the time wait for this tr to expire so that the next read is at a different version
if (deterministicRandom()->random01() < 0.5)
@ -469,9 +475,15 @@ ACTOR Future<Void> readCommitted(Database cx,
state RangeResult rangevalue = wait(tr.getRange(nextKey, end, limits));
// When this buggify line is enabled, if there are more than 1 result then use half of the results
// When this buggify line is enabled, if there is more than one result, use half of the results.
// Copy the data instead of messing with the results directly to avoid TSS issues.
if (rangevalue.size() > 1 && BUGGIFY) {
rangevalue.resize(rangevalue.arena(), rangevalue.size() / 2);
RangeResult copy;
// only copy first half of rangevalue into copy
for (int i = 0; i < rangevalue.size() / 2; i++) {
copy.push_back_deep(copy.arena(), rangevalue[i]);
}
rangevalue = copy;
rangevalue.more = true;
// Half of the time wait for this tr to expire so that the next read is at a different version
if (deterministicRandom()->random01() < 0.5)


@ -68,6 +68,7 @@ set(FDBCLIENT_SRCS
Status.h
StatusClient.actor.cpp
StatusClient.h
StorageServerInterface.cpp
StorageServerInterface.h
Subspace.cpp
Subspace.h


@ -288,9 +288,12 @@ struct GetKeyServerLocationsReply {
Arena arena;
std::vector<std::pair<KeyRangeRef, vector<StorageServerInterface>>> results;
// if any storage servers in results have a TSS pair, that mapping is in here
std::vector<std::pair<UID, StorageServerInterface>> resultsTssMapping;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, results, arena);
serializer(ar, results, resultsTssMapping, arena);
}
};


@ -33,12 +33,15 @@ const int MAX_CLUSTER_FILE_BYTES = 60000;
constexpr UID WLTOKEN_CLIENTLEADERREG_GETLEADER(-1, 2);
constexpr UID WLTOKEN_CLIENTLEADERREG_OPENDATABASE(-1, 3);
// the value of this endpoint should be stable and not change.
constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 10);
constexpr UID WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE(-1, 11);
// The coordinator interface as exposed to clients
// well known endpoints published to the client.
struct ClientLeaderRegInterface {
RequestStream<struct GetLeaderRequest> getLeader;
RequestStream<struct OpenDatabaseCoordRequest> openDatabase;
RequestStream<struct CheckDescriptorMutableRequest> checkDescriptorMutable;
ClientLeaderRegInterface() {}
ClientLeaderRegInterface(NetworkAddress remote);
@ -236,4 +239,28 @@ struct ProtocolInfoRequest {
}
};
// Returns true if the cluster descriptor may be modified.
struct CheckDescriptorMutableReply {
constexpr static FileIdentifier file_identifier = 7784299;
CheckDescriptorMutableReply() = default;
explicit CheckDescriptorMutableReply(bool isMutable) : isMutable(isMutable) {}
bool isMutable;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, isMutable);
}
};
// Allows client to check if allowed to change the cluster descriptor.
struct CheckDescriptorMutableRequest {
constexpr static FileIdentifier file_identifier = 214729;
ReplyPromise<CheckDescriptorMutableReply> reply;
CheckDescriptorMutableRequest() {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, reply);
}
};
#endif


@ -31,7 +31,8 @@ void DatabaseConfiguration::resetInternal() {
commitProxyCount = grvProxyCount = resolverCount = desiredTLogCount = tLogWriteAntiQuorum = tLogReplicationFactor =
storageTeamSize = desiredLogRouterCount = -1;
tLogVersion = TLogVersion::DEFAULT;
tLogDataStoreType = storageServerStoreType = KeyValueStoreType::END;
tLogDataStoreType = storageServerStoreType = testingStorageServerStoreType = KeyValueStoreType::END;
desiredTSSCount = 0;
tLogSpillType = TLogSpillType::DEFAULT;
autoCommitProxyCount = CLIENT_KNOBS->DEFAULT_AUTO_COMMIT_PROXIES;
autoGrvProxyCount = CLIENT_KNOBS->DEFAULT_AUTO_GRV_PROXIES;
@ -43,6 +44,7 @@ void DatabaseConfiguration::resetInternal() {
remoteDesiredTLogCount = -1;
remoteTLogReplicationFactor = repopulateRegionAntiQuorum = 0;
backupWorkerEnabled = false;
perpetualStorageWiggleSpeed = 0;
}
void parse(int* i, ValueRef const& v) {
@ -194,9 +196,9 @@ bool DatabaseConfiguration::isValid() const {
getDesiredRemoteLogs() >= 1 && remoteTLogReplicationFactor >= 0 && repopulateRegionAntiQuorum >= 0 &&
repopulateRegionAntiQuorum <= 1 && usableRegions >= 1 && usableRegions <= 2 && regions.size() <= 2 &&
(usableRegions == 1 || regions.size() == 2) && (regions.size() == 0 || regions[0].priority >= 0) &&
(regions.size() == 0 ||
tLogPolicy->info() !=
"dcid^2 x zoneid^2 x 1"))) { // We cannot specify regions with three_datacenter replication
(regions.size() == 0 || tLogPolicy->info() != "dcid^2 x zoneid^2 x 1") &&
// We cannot specify regions with three_datacenter replication
(perpetualStorageWiggleSpeed == 0 || perpetualStorageWiggleSpeed == 1))) {
return false;
}
std::set<Key> dcIds;
@ -298,6 +300,25 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const {
result["storage_engine"] = "custom";
}
if (desiredTSSCount > 0) {
result["tss_count"] = desiredTSSCount;
if (testingStorageServerStoreType == KeyValueStoreType::SSD_BTREE_V1) {
result["tss_storage_engine"] = "ssd-1";
} else if (testingStorageServerStoreType == KeyValueStoreType::SSD_BTREE_V2) {
result["tss_storage_engine"] = "ssd-2";
} else if (testingStorageServerStoreType == KeyValueStoreType::SSD_REDWOOD_V1) {
result["tss_storage_engine"] = "ssd-redwood-experimental";
} else if (testingStorageServerStoreType == KeyValueStoreType::SSD_ROCKSDB_V1) {
result["tss_storage_engine"] = "ssd-rocksdb-experimental";
} else if (testingStorageServerStoreType == KeyValueStoreType::MEMORY_RADIXTREE) {
result["tss_storage_engine"] = "memory-radixtree-beta";
} else if (testingStorageServerStoreType == KeyValueStoreType::MEMORY) {
result["tss_storage_engine"] = "memory-2";
} else {
result["tss_storage_engine"] = "custom";
}
}
result["log_spill"] = (int)tLogSpillType;
if (remoteTLogReplicationFactor == 1) {
@ -352,7 +373,7 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const {
}
result["backup_worker_enabled"] = (int32_t)backupWorkerEnabled;
result["perpetual_storage_wiggle"] = perpetualStorageWiggleSpeed;
return result;
}
@ -448,6 +469,8 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) {
}
} else if (ck == LiteralStringRef("storage_replicas")) {
parse(&storageTeamSize, value);
} else if (ck == LiteralStringRef("tss_count")) {
parse(&desiredTSSCount, value);
} else if (ck == LiteralStringRef("log_version")) {
parse((&type), value);
type = std::max((int)TLogVersion::MIN_RECRUITABLE, type);
@ -470,6 +493,9 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) {
} else if (ck == LiteralStringRef("storage_engine")) {
parse((&type), value);
storageServerStoreType = (KeyValueStoreType::StoreType)type;
} else if (ck == LiteralStringRef("tss_storage_engine")) {
parse((&type), value);
testingStorageServerStoreType = (KeyValueStoreType::StoreType)type;
} else if (ck == LiteralStringRef("auto_commit_proxies")) {
parse(&autoCommitProxyCount, value);
} else if (ck == LiteralStringRef("auto_grv_proxies")) {
@ -499,6 +525,8 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) {
parse(&repopulateRegionAntiQuorum, value);
} else if (ck == LiteralStringRef("regions")) {
parse(&regions, value);
} else if (ck == LiteralStringRef("perpetual_storage_wiggle")) {
parse(&perpetualStorageWiggleSpeed, value);
} else {
return false;
}


@ -225,6 +225,10 @@ struct DatabaseConfiguration {
int32_t storageTeamSize;
KeyValueStoreType storageServerStoreType;
// Testing StorageServers
int32_t desiredTSSCount;
KeyValueStoreType testingStorageServerStoreType;
// Remote TLogs
int32_t desiredLogRouterCount;
int32_t remoteDesiredTLogCount;
@ -239,6 +243,9 @@ struct DatabaseConfiguration {
int32_t repopulateRegionAntiQuorum;
std::vector<RegionInfo> regions;
// Perpetual Storage Setting
int32_t perpetualStorageWiggleSpeed;
// Excluded servers (no state should be here)
bool isExcludedServer(NetworkAddressList) const;
std::set<AddressExclusion> getExcludedServers() const;


@ -273,6 +273,9 @@ public:
Reference<AsyncVar<Reference<ClusterConnectionFile>>> connectionFile;
AsyncTrigger proxiesChangeTrigger;
Future<Void> monitorProxiesInfoChange;
Future<Void> monitorTssInfoChange;
Future<Void> tssMismatchHandler;
PromiseStream<UID> tssMismatchStream;
Reference<CommitProxyInfo> commitProxies;
Reference<GrvProxyInfo> grvProxies;
bool proxyProvisional; // Provisional commit proxy and grv proxy are used at the same time.
@ -320,6 +323,11 @@ public:
std::map<UID, StorageServerInfo*> server_interf;
// map from ssid -> tss interface
std::unordered_map<UID, StorageServerInterface> tssMapping;
// map from tssid -> metrics for that tss pair
std::unordered_map<UID, Reference<TSSMetrics>> tssMetrics;
UID dbId;
bool internal; // Only contexts created through the C client and fdbcli are non-internal
@ -419,6 +427,14 @@ public:
static bool debugUseTags;
static const std::vector<std::string> debugTransactionTagChoices;
std::unordered_map<KeyRef, Reference<WatchMetadata>> watchMap;
// Adds or updates the specified (SS, TSS) pair in the TSS mapping (if not already present).
// Requests to the storage server will be duplicated to the TSS.
void addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi);
// Removes the storage server and its TSS pair from the TSS mapping (if present).
// Requests to the storage server will no longer be duplicated to its pair TSS.
void removeTssMapping(StorageServerInterface const& ssi);
};
#endif


@ -2705,13 +2705,17 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase {
wait(checkTaskVersion(cx, task, StartFullBackupTaskFunc::name, StartFullBackupTaskFunc::version));
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
state BackupConfig config(task);
state Future<Optional<bool>> partitionedLog;
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
Version startVersion = wait(tr->getReadVersion());
partitionedLog = config.partitionedLogEnabled().get(tr);
state Future<Version> startVersionFuture = tr->getReadVersion();
wait(success(partitionedLog) && success(startVersionFuture));
Params.beginVersion().set(task, startVersion);
Params.beginVersion().set(task, startVersionFuture.get());
break;
} catch (Error& e) {
wait(tr->onError(e));
@ -2721,14 +2725,15 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase {
// Check if backup worker is enabled
DatabaseConfiguration dbConfig = wait(getDatabaseConfiguration(cx));
state bool backupWorkerEnabled = dbConfig.backupWorkerEnabled;
if (!backupWorkerEnabled) {
if (!backupWorkerEnabled && partitionedLog.get().present() && partitionedLog.get().get()) {
// Change the configuration only when the backup is set to use partitioned logs
// and the flag was not set before.
wait(success(changeConfig(cx, "backup_worker_enabled:=1", true)));
backupWorkerEnabled = true;
}
// Set the "backupStartedKey" and wait for all backup worker started
tr->reset();
state BackupConfig config(task);
loop {
state Future<Void> watchFuture;
try {
@ -2738,7 +2743,7 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase {
state Future<Optional<Value>> started = tr->get(backupStartedKey);
state Future<Optional<Value>> taskStarted = tr->get(config.allWorkerStarted().key);
state Future<Optional<bool>> partitionedLog = config.partitionedLogEnabled().get(tr);
partitionedLog = config.partitionedLogEnabled().get(tr);
wait(success(started) && success(taskStarted) && success(partitionedLog));
if (!partitionedLog.get().present() || !partitionedLog.get().get()) {


@ -34,16 +34,7 @@ const KeyRef fdbClientInfoTxnSizeLimit = LiteralStringRef("config/fdb_client_inf
const KeyRef transactionTagSampleRate = LiteralStringRef("config/transaction_tag_sample_rate");
const KeyRef transactionTagSampleCost = LiteralStringRef("config/transaction_tag_sample_cost");
GlobalConfig::GlobalConfig() : lastUpdate(0) {}
void GlobalConfig::create(DatabaseContext* cx, Reference<AsyncVar<ClientDBInfo>> dbInfo) {
if (g_network->global(INetwork::enGlobalConfig) == nullptr) {
auto config = new GlobalConfig{};
config->cx = Database(cx);
g_network->setGlobal(INetwork::enGlobalConfig, config);
config->_updater = updater(config, dbInfo);
}
}
GlobalConfig::GlobalConfig(Database& cx) : cx(cx), lastUpdate(0) {}
GlobalConfig& GlobalConfig::globalConfig() {
void* res = g_network->global(INetwork::enGlobalConfig);
@ -77,6 +68,14 @@ Future<Void> GlobalConfig::onInitialized() {
return initialized.getFuture();
}
Future<Void> GlobalConfig::onChange() {
return configChanged.onTrigger();
}
void GlobalConfig::trigger(KeyRef key, std::function<void(std::optional<std::any>)> fn) {
callbacks.emplace(key, std::move(fn));
}
void GlobalConfig::insert(KeyRef key, ValueRef value) {
data.erase(key);
@ -89,6 +88,8 @@ void GlobalConfig::insert(KeyRef key, ValueRef value) {
any = StringRef(arena, t.getString(0).contents());
} else if (t.getType(0) == Tuple::ElementType::INT) {
any = t.getInt(0);
} else if (t.getType(0) == Tuple::ElementType::BOOL) {
any = t.getBool(0);
} else if (t.getType(0) == Tuple::ElementType::FLOAT) {
any = t.getFloat(0);
} else if (t.getType(0) == Tuple::ElementType::DOUBLE) {
@ -97,19 +98,26 @@ void GlobalConfig::insert(KeyRef key, ValueRef value) {
ASSERT(false);
}
data[stableKey] = makeReference<ConfigValue>(std::move(arena), std::move(any));
if (callbacks.find(stableKey) != callbacks.end()) {
callbacks[stableKey](data[stableKey]->value);
}
} catch (Error& e) {
TraceEvent("GlobalConfigTupleParseError").detail("What", e.what());
TraceEvent(SevWarn, "GlobalConfigTupleParseError").detail("What", e.what());
}
}
void GlobalConfig::erase(KeyRef key) {
data.erase(key);
void GlobalConfig::erase(Key key) {
erase(KeyRangeRef(key, keyAfter(key)));
}
void GlobalConfig::erase(KeyRangeRef range) {
auto it = data.begin();
while (it != data.end()) {
if (range.contains(it->first)) {
if (callbacks.find(it->first) != callbacks.end()) {
callbacks[it->first](std::nullopt);
}
it = data.erase(it);
} else {
++it;
@ -134,36 +142,39 @@ ACTOR Future<Void> GlobalConfig::migrate(GlobalConfig* self) {
state Optional<Value> sampleRate = wait(tr->get(Key("\xff\x02/fdbClientInfo/client_txn_sample_rate/"_sr)));
state Optional<Value> sizeLimit = wait(tr->get(Key("\xff\x02/fdbClientInfo/client_txn_size_limit/"_sr)));
loop {
try {
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
// The value doesn't matter too much, as long as the key is set.
tr->set(migratedKey.contents(), "1"_sr);
if (sampleRate.present()) {
const double sampleRateDbl =
BinaryReader::fromStringRef<double>(sampleRate.get().contents(), Unversioned());
Tuple rate = Tuple().appendDouble(sampleRateDbl);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate), rate.pack());
}
if (sizeLimit.present()) {
const int64_t sizeLimitInt =
BinaryReader::fromStringRef<int64_t>(sizeLimit.get().contents(), Unversioned());
Tuple size = Tuple().append(sizeLimitInt);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSizeLimit), size.pack());
}
wait(tr->commit());
return Void();
} catch (Error& e) {
throw;
try {
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
// The value doesn't matter too much, as long as the key is set.
tr->set(migratedKey.contents(), "1"_sr);
if (sampleRate.present()) {
const double sampleRateDbl =
BinaryReader::fromStringRef<double>(sampleRate.get().contents(), Unversioned());
Tuple rate = Tuple().appendDouble(sampleRateDbl);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate), rate.pack());
}
if (sizeLimit.present()) {
const int64_t sizeLimitInt =
BinaryReader::fromStringRef<int64_t>(sizeLimit.get().contents(), Unversioned());
Tuple size = Tuple().append(sizeLimitInt);
tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSizeLimit), size.pack());
}
wait(tr->commit());
} catch (Error& e) {
// If multiple fdbserver processes are started at once, they will all
// attempt this migration at the same time, sometimes resulting in
// aborts due to conflicts. Purposefully avoid retrying, making this
// migration best-effort.
TraceEvent(SevInfo, "GlobalConfigMigrationError").detail("What", e.what());
}
return Void();
}
// Updates local copy of global configuration by reading the entire key-range
// from storage.
ACTOR Future<Void> GlobalConfig::refresh(GlobalConfig* self) {
self->data.clear();
self->erase(KeyRangeRef(""_sr, "\xff"_sr));
Transaction tr(self->cx);
RangeResult result = wait(tr.getRange(globalConfigDataKeys, CLIENT_KNOBS->TOO_MANY));
@ -176,7 +187,8 @@ ACTOR Future<Void> GlobalConfig::refresh(GlobalConfig* self) {
// Applies updates to the local copy of the global configuration when this
// process receives an updated history.
ACTOR Future<Void> GlobalConfig::updater(GlobalConfig* self, Reference<AsyncVar<ClientDBInfo>> dbInfo) {
ACTOR Future<Void> GlobalConfig::updater(GlobalConfig* self, const ClientDBInfo* dbInfo) {
wait(self->cx->onConnected());
wait(self->migrate(self));
wait(self->refresh(self));
@ -184,9 +196,9 @@ ACTOR Future<Void> GlobalConfig::updater(GlobalConfig* self, Reference<AsyncVar<
loop {
try {
wait(dbInfo->onChange());
wait(self->dbInfoChanged.onTrigger());
auto& history = dbInfo->get().history;
auto& history = dbInfo->history;
if (history.size() == 0) {
continue;
}
@ -196,8 +208,8 @@ ACTOR Future<Void> GlobalConfig::updater(GlobalConfig* self, Reference<AsyncVar<
// history updates or the protocol version changed, so it
// must re-read the entire configuration range.
wait(self->refresh(self));
if (dbInfo->get().history.size() > 0) {
self->lastUpdate = dbInfo->get().history.back().version;
if (dbInfo->history.size() > 0) {
self->lastUpdate = dbInfo->history.back().version;
}
} else {
// Apply history in order, from lowest version to highest
@ -222,6 +234,8 @@ ACTOR Future<Void> GlobalConfig::updater(GlobalConfig* self, Reference<AsyncVar<
self->lastUpdate = vh.version;
}
}
self->configChanged.trigger();
} catch (Error& e) {
throw;
}


@ -62,10 +62,28 @@ struct ConfigValue : ReferenceCounted<ConfigValue> {
class GlobalConfig : NonCopyable {
public:
// Creates a GlobalConfig singleton, accessed by calling GlobalConfig().
// This function should only be called once by each process (however, it is
// idempotent and calling it multiple times will have no effect).
static void create(DatabaseContext* cx, Reference<AsyncVar<ClientDBInfo>> dbInfo);
// Creates a GlobalConfig singleton, accessed by calling
// GlobalConfig::globalConfig(). This function requires a database object
// to allow global configuration to run transactions on the database, and
// an AsyncVar object to watch for changes on. The ClientDBInfo pointer
// should point to a ClientDBInfo object which will contain the updated
// global configuration history when the given AsyncVar changes. This
// function should be called whenever the database object changes, in order
// to allow global configuration to run transactions on the latest
// database.
template <class T>
static void create(Database& cx, Reference<AsyncVar<T>> db, const ClientDBInfo* dbInfo) {
if (g_network->global(INetwork::enGlobalConfig) == nullptr) {
auto config = new GlobalConfig{ cx };
g_network->setGlobal(INetwork::enGlobalConfig, config);
config->_updater = updater(config, dbInfo);
// Bind changes in `db` to the `dbInfoChanged` AsyncTrigger.
forward(db, std::addressof(config->dbInfoChanged));
} else {
GlobalConfig* config = reinterpret_cast<GlobalConfig*>(g_network->global(INetwork::enGlobalConfig));
config->cx = cx;
}
}
// Returns a reference to the global GlobalConfig object. Clients should
// call this function whenever they need to read a value out of the global
@ -114,8 +132,18 @@ public:
// been created and is ready.
Future<Void> onInitialized();
// Triggers the returned future when any key-value pair in the global
// configuration changes.
Future<Void> onChange();
// Calls \ref fn when the value associated with \ref key is changed. \ref
// fn is passed the updated value for the key, or an empty optional if the
// key has been cleared. If the value is an allocated object, its memory
// remains in the control of the global configuration.
void trigger(KeyRef key, std::function<void(std::optional<std::any>)> fn);
private:
GlobalConfig();
GlobalConfig(Database& cx);
// The functions below only affect the local copy of the global
// configuration keyspace! To insert or remove values across all nodes you
@ -127,20 +155,23 @@ private:
void insert(KeyRef key, ValueRef value);
// Removes the given key (and associated value) from the local copy of the
// global configuration keyspace.
void erase(KeyRef key);
void erase(Key key);
// Removes the given key range (and associated values) from the local copy
// of the global configuration keyspace.
void erase(KeyRangeRef range);
ACTOR static Future<Void> migrate(GlobalConfig* self);
ACTOR static Future<Void> refresh(GlobalConfig* self);
ACTOR static Future<Void> updater(GlobalConfig* self, Reference<AsyncVar<ClientDBInfo>> dbInfo);
ACTOR static Future<Void> updater(GlobalConfig* self, const ClientDBInfo* dbInfo);
Database cx;
AsyncTrigger dbInfoChanged;
Future<Void> _updater;
Promise<Void> initialized;
AsyncTrigger configChanged;
std::unordered_map<StringRef, Reference<ConfigValue>> data;
Version lastUpdate;
std::unordered_map<KeyRef, std::function<void(std::optional<std::any>)>> callbacks;
};
#endif


@ -60,6 +60,13 @@ std::map<std::string, std::string> configForToken(std::string const& mode) {
return out;
}
if (mode == "tss") {
// Set temporary marker in config map to mark that this is a tss configuration and not a normal storage/log
// configuration. A bit of a hack but reuses the parsing code nicely.
out[p + "istss"] = "1";
return out;
}
if (mode == "locked") {
// Setting this key is interpreted as an instruction to use the normal version-stamp-based mechanism for locking
// the database.
@ -119,7 +126,7 @@ std::map<std::string, std::string> configForToken(std::string const& mode) {
if ((key == "logs" || key == "commit_proxies" || key == "grv_proxies" || key == "resolvers" ||
key == "remote_logs" || key == "log_routers" || key == "usable_regions" ||
key == "repopulate_anti_quorum") &&
key == "repopulate_anti_quorum" || key == "count") &&
isInteger(value)) {
out[p + key] = value;
}
@ -134,6 +141,14 @@ std::map<std::string, std::string> configForToken(std::string const& mode) {
BinaryWriter::toValue(regionObj, IncludeVersion(ProtocolVersion::withRegionConfiguration())).toString();
}
if (key == "perpetual_storage_wiggle" && isInteger(value)) {
int ppWiggle = atoi(value.c_str());
if (ppWiggle >= 2 || ppWiggle < 0) {
printf("Error: Only 0 and 1 are valid values of perpetual_storage_wiggle at present.\n");
return out;
}
out[p + key] = value;
}
return out;
}
@ -326,6 +341,35 @@ ConfigurationResult buildConfiguration(std::vector<StringRef> const& modeTokens,
serializeReplicationPolicy(policyWriter, logPolicy);
outConf[p + "log_replication_policy"] = policyWriter.toValue().toString();
}
if (outConf.count(p + "istss")) {
// redo config parameters to be tss config instead of normal config
// save param values from parsing as a normal config
bool isNew = outConf.count(p + "initialized");
Optional<std::string> count;
Optional<std::string> storageEngine;
if (outConf.count(p + "count")) {
count = Optional<std::string>(outConf[p + "count"]);
}
if (outConf.count(p + "storage_engine")) {
storageEngine = Optional<std::string>(outConf[p + "storage_engine"]);
}
// A new tss setup must have count + storage engine. An adjustment must have at least one.
if ((isNew && (!count.present() || !storageEngine.present())) ||
(!isNew && !count.present() && !storageEngine.present())) {
return ConfigurationResult::INCOMPLETE_CONFIGURATION;
}
// clear map and only reset tss parameters
outConf.clear();
if (count.present()) {
outConf[p + "tss_count"] = count.get();
}
if (storageEngine.present()) {
outConf[p + "tss_storage_engine"] = storageEngine.get();
}
}
return ConfigurationResult::SUCCESS;
}
@ -1105,6 +1149,7 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
vector<Future<Optional<LeaderInfo>>> leaderServers;
ClientCoordinators coord(Reference<ClusterConnectionFile>(new ClusterConnectionFile(conn)));
leaderServers.reserve(coord.clientLeaderServers.size());
for (int i = 0; i < coord.clientLeaderServers.size(); i++)
leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader,
@ -1188,14 +1233,20 @@ ACTOR Future<CoordinatorsResult> changeQuorum(Database cx, Reference<IQuorumChan
TEST(old.clusterKeyName() != conn.clusterKeyName()); // Quorum change with new name
TEST(old.clusterKeyName() == conn.clusterKeyName()); // Quorum change with unchanged name
vector<Future<Optional<LeaderInfo>>> leaderServers;
ClientCoordinators coord(Reference<ClusterConnectionFile>(new ClusterConnectionFile(conn)));
state vector<Future<Optional<LeaderInfo>>> leaderServers;
state ClientCoordinators coord(Reference<ClusterConnectionFile>(new ClusterConnectionFile(conn)));
// check if allowed to modify the cluster descriptor
if (!change->getDesiredClusterKeyName().empty()) {
CheckDescriptorMutableReply mutabilityReply =
wait(coord.clientLeaderServers[0].checkDescriptorMutable.getReply(CheckDescriptorMutableRequest()));
if (!mutabilityReply.isMutable)
return CoordinatorsResult::BAD_DATABASE_STATE;
}
leaderServers.reserve(coord.clientLeaderServers.size());
for (int i = 0; i < coord.clientLeaderServers.size(); i++)
leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader,
GetLeaderRequest(coord.clusterKey, UID()),
TaskPriority::CoordinationReply));
choose {
when(wait(waitForAll(leaderServers))) {}
when(wait(delay(5.0))) { return CoordinatorsResult::COORDINATOR_UNREACHABLE; }


@ -380,11 +380,14 @@ ClientCoordinators::ClientCoordinators(Key clusterKey, std::vector<NetworkAddres
ClientLeaderRegInterface::ClientLeaderRegInterface(NetworkAddress remote)
: getLeader(Endpoint({ remote }, WLTOKEN_CLIENTLEADERREG_GETLEADER)),
openDatabase(Endpoint({ remote }, WLTOKEN_CLIENTLEADERREG_OPENDATABASE)) {}
openDatabase(Endpoint({ remote }, WLTOKEN_CLIENTLEADERREG_OPENDATABASE)),
checkDescriptorMutable(Endpoint({ remote }, WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE)) {}
ClientLeaderRegInterface::ClientLeaderRegInterface(INetwork* local) {
getLeader.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskPriority::Coordination);
openDatabase.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_OPENDATABASE, TaskPriority::Coordination);
checkDescriptorMutable.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE,
TaskPriority::Coordination);
}
// Nominee is the worker among all workers that are considered as leader by a coordinator
@ -496,7 +499,8 @@ ACTOR Future<MonitorLeaderInfo> monitorLeaderOneGeneration(Reference<ClusterConn
if (leader.get().first.forward) {
TraceEvent("MonitorLeaderForwarding")
.detail("NewConnStr", leader.get().first.serializedInfo.toString())
.detail("OldConnStr", info.intermediateConnFile->getConnectionString().toString()).trackLatest("MonitorLeaderForwarding");
.detail("OldConnStr", info.intermediateConnFile->getConnectionString().toString())
.trackLatest("MonitorLeaderForwarding");
info.intermediateConnFile = makeReference<ClusterConnectionFile>(
connFile->getFilename(), ClusterConnectionString(leader.get().first.serializedInfo.toString()));
return info;


@ -38,6 +38,7 @@
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/GlobalConfig.actor.h"
#include "fdbclient/JsonBuilder.h"
#include "fdbclient/KeyBackedTypes.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/ManagementAPI.actor.h"
@ -121,6 +122,52 @@ NetworkOptions::NetworkOptions()
static const Key CLIENT_LATENCY_INFO_PREFIX = LiteralStringRef("client_latency/");
static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = LiteralStringRef("client_latency_counter/");
void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi) {
auto result = tssMapping.find(ssi.id());
// Update tss endpoint mapping if ss isn't in mapping, or the interface it mapped to changed
if (result == tssMapping.end() ||
result->second.getValue.getEndpoint().token.first() != tssi.getValue.getEndpoint().token.first()) {
Reference<TSSMetrics> metrics;
if (result == tssMapping.end()) {
// new TSS pairing
metrics = makeReference<TSSMetrics>();
tssMetrics[tssi.id()] = metrics;
tssMapping[ssi.id()] = tssi;
} else {
if (result->second.id() == tssi.id()) {
metrics = tssMetrics[tssi.id()];
} else {
TEST(true); // SS now maps to new TSS! This will probably never happen in practice
tssMetrics.erase(result->second.id());
metrics = makeReference<TSSMetrics>();
tssMetrics[tssi.id()] = metrics;
}
result->second = tssi;
}
queueModel.updateTssEndpoint(ssi.getValue.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.getValue.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.getKey.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.getKey.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.getKeyValues.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.getKeyValues.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.watchValue.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.watchValue.getEndpoint(), metrics));
}
}
void DatabaseContext::removeTssMapping(StorageServerInterface const& ssi) {
auto result = tssMapping.find(ssi.id());
if (result != tssMapping.end()) {
tssMetrics.erase(ssi.id());
tssMapping.erase(result);
queueModel.removeTssEndpoint(ssi.getValue.getEndpoint().token.first());
queueModel.removeTssEndpoint(ssi.getKey.getEndpoint().token.first());
queueModel.removeTssEndpoint(ssi.getKeyValues.getEndpoint().token.first());
queueModel.removeTssEndpoint(ssi.watchValue.getEndpoint().token.first());
}
}
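// Editorial note (hedged): the four streams registered and cleared above -- getValue, getKey,
// getKeyValues, and watchValue -- are exactly the request types given TSS_doCompare
// specializations in StorageServerInterface.cpp later in this commit, so only data reads are
// duplicated to the TSS pair.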
Reference<StorageServerInfo> StorageServerInfo::getInterface(DatabaseContext* cx,
StorageServerInterface const& ssi,
LocalityData const& locality) {
@ -133,6 +180,7 @@ Reference<StorageServerInfo> StorageServerInfo::getInterface(DatabaseContext* cx
// pointing to. This is technically correct, but is very unnatural. We may want to refactor load
// balance to take an AsyncVar<Reference<Interface>> so that it is notified when the interface
// changes.
it->second->interf = ssi;
} else {
it->second->notifyContextDestroyed();
@ -285,6 +333,13 @@ void delref(DatabaseContext* ptr) {
ptr->delref();
}
void traceTSSErrors(const char* name, UID tssId, const std::unordered_map<int, uint64_t>& errorsByCode) {
TraceEvent ev(name, tssId);
for (auto& it : errorsByCode) {
ev.detail("E" + std::to_string(it.first), it.second);
}
}
ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
state double lastLogged = 0;
loop {
@ -327,6 +382,62 @@ ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
cx->mutationsPerCommit.clear();
cx->bytesPerCommit.clear();
for (const auto& it : cx->tssMetrics) {
// TODO: we could skip this TSS if its request counter is zero, though that would complicate the
// elapsed calculation
if (it.second->mismatches.getIntervalDelta()) {
cx->tssMismatchStream.send(it.first);
}
// do error histograms as separate event
if (it.second->ssErrorsByCode.size()) {
traceTSSErrors("TSS_SSErrors", it.first, it.second->ssErrorsByCode);
}
if (it.second->tssErrorsByCode.size()) {
traceTSSErrors("TSS_TSSErrors", it.first, it.second->tssErrorsByCode);
}
TraceEvent tssEv("TSSClientMetrics", cx->dbId);
tssEv.detail("TSSID", it.first)
.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged)
.detail("Internal", cx->internal);
it.second->cc.logToTraceEvent(tssEv);
tssEv.detail("MeanSSGetValueLatency", it.second->SSgetValueLatency.mean())
.detail("MedianSSGetValueLatency", it.second->SSgetValueLatency.median())
.detail("SSGetValueLatency90", it.second->SSgetValueLatency.percentile(0.90))
.detail("SSGetValueLatency99", it.second->SSgetValueLatency.percentile(0.99));
tssEv.detail("MeanTSSGetValueLatency", it.second->TSSgetValueLatency.mean())
.detail("MedianTSSGetValueLatency", it.second->TSSgetValueLatency.median())
.detail("TSSGetValueLatency90", it.second->TSSgetValueLatency.percentile(0.90))
.detail("TSSGetValueLatency99", it.second->TSSgetValueLatency.percentile(0.99));
tssEv.detail("MeanSSGetKeyLatency", it.second->SSgetKeyLatency.mean())
.detail("MedianSSGetKeyLatency", it.second->SSgetKeyLatency.median())
.detail("SSGetKeyLatency90", it.second->SSgetKeyLatency.percentile(0.90))
.detail("SSGetKeyLatency99", it.second->SSgetKeyLatency.percentile(0.99));
tssEv.detail("MeanTSSGetKeyLatency", it.second->TSSgetKeyLatency.mean())
.detail("MedianTSSGetKeyLatency", it.second->TSSgetKeyLatency.median())
.detail("TSSGetKeyLatency90", it.second->TSSgetKeyLatency.percentile(0.90))
.detail("TSSGetKeyLatency99", it.second->TSSgetKeyLatency.percentile(0.99));
tssEv.detail("MeanSSGetKeyValuesLatency", it.second->SSgetKeyLatency.mean())
.detail("MedianSSGetKeyValuesLatency", it.second->SSgetKeyLatency.median())
.detail("SSGetKeyValuesLatency90", it.second->SSgetKeyLatency.percentile(0.90))
.detail("SSGetKeyValuesLatency99", it.second->SSgetKeyLatency.percentile(0.99));
tssEv.detail("MeanTSSGetKeyValuesLatency", it.second->TSSgetKeyValuesLatency.mean())
.detail("MedianTSSGetKeyValuesLatency", it.second->TSSgetKeyValuesLatency.median())
.detail("TSSGetKeyValuesLatency90", it.second->TSSgetKeyValuesLatency.percentile(0.90))
.detail("TSSGetKeyValuesLatency99", it.second->TSSgetKeyValuesLatency.percentile(0.99));
it.second->clear();
}
lastLogged = now();
}
}
@ -711,6 +822,59 @@ ACTOR Future<Void> monitorCacheList(DatabaseContext* self) {
}
}
ACTOR static Future<Void> handleTssMismatches(DatabaseContext* cx) {
state Reference<ReadYourWritesTransaction> tr;
state KeyBackedMap<UID, UID> tssMapDB = KeyBackedMap<UID, UID>(tssMappingKeys.begin);
loop {
state UID tssID = waitNext(cx->tssMismatchStream.getFuture());
// find ss pair id so we can remove it from the mapping
state UID tssPairID;
bool found = false;
for (const auto& it : cx->tssMapping) {
if (it.second.id() == tssID) {
tssPairID = it.first;
found = true;
break;
}
}
if (found) {
TraceEvent(SevWarnAlways, "TSS_KillMismatch").detail("TSSID", tssID.toString());
TEST(true); // killing TSS because it got mismatch
// TODO we could write something to the system keyspace and then have DD listen to that keyspace and then DD
// do exactly this, so why not just cut out the middle man (or the middle system keys, as it were)
tr = makeReference<ReadYourWritesTransaction>(Database(Reference<DatabaseContext>::addRef(cx)));
state int tries = 0;
loop {
try {
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->clear(serverTagKeyFor(tssID));
tssMapDB.erase(tr, tssPairID);
wait(tr->commit());
break;
} catch (Error& e) {
wait(tr->onError(e));
}
tries++;
if (tries > 10) {
// Give up on trying to kill the tss, it'll get another mismatch or a human will investigate
// eventually
TraceEvent("TSS_KillMismatchGaveUp").detail("TSSID", tssID.toString());
break;
}
}
// clear out txn so that the extra DatabaseContext ref gets decref'd and we can free cx
tr = makeReference<ReadYourWritesTransaction>();
} else {
TEST(true); // Not killing TSS with mismatch because it's already gone
}
}
}
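// Hedged sketch (editorial, not part of this commit): the retry pattern above, reduced to its
// KeyBackedMap essentials. Only the function name eraseTssPairExample is invented; every call
// here appears in handleTssMismatches above.
ACTOR static Future<Void> eraseTssPairExample(Database db, UID tssPairID) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
state KeyBackedMap<UID, UID> tssMapDB = KeyBackedMap<UID, UID>(tssMappingKeys.begin);
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
// erase() packs tssPairID under tssMappingKeys.begin and clears that key in the transaction
tssMapDB.erase(tr, tssPairID);
wait(tr->commit());
return Void();
} catch (Error& e) {
wait(tr->onError(e));
}
}
}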
ACTOR static Future<HealthMetrics> getHealthMetricsActor(DatabaseContext* cx, bool detailed) {
if (now() - cx->healthMetricsLastUpdated < CLIENT_KNOBS->AGGREGATE_HEALTH_METRICS_MAX_STALENESS) {
if (detailed) {
@ -957,9 +1121,8 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
getValueSubmitted.init(LiteralStringRef("NativeAPI.GetValueSubmitted"));
getValueCompleted.init(LiteralStringRef("NativeAPI.GetValueCompleted"));
GlobalConfig::create(this, clientInfo);
monitorProxiesInfoChange = monitorProxiesChange(clientInfo, &proxiesChangeTrigger);
tssMismatchHandler = handleTssMismatches(this);
clientStatusUpdater.actor = clientStatusUpdateActor(this);
cacheListMonitor = monitorCacheList(this);
@ -1051,14 +1214,16 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
SpecialKeySpace::IMPLTYPE::READWRITE,
std::make_unique<ClientProfilingImpl>(
KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE,
SpecialKeySpace::MODULE::MANAGEMENT,
SpecialKeySpace::IMPLTYPE::READWRITE,
std::make_unique<MaintenanceImpl>(
KeyRangeRef(LiteralStringRef("maintenance/"), LiteralStringRef("maintenance0"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE,
SpecialKeySpace::MODULE::MANAGEMENT,
SpecialKeySpace::IMPLTYPE::READWRITE,
std::make_unique<DataDistributionImpl>(
KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
@ -1199,6 +1364,8 @@ Database DatabaseContext::create(Reference<AsyncVar<ClientDBInfo>> clientInfo,
DatabaseContext::~DatabaseContext() {
cacheListMonitor.cancel();
monitorProxiesInfoChange.cancel();
monitorTssInfoChange.cancel();
tssMismatchHandler.cancel();
for (auto it = server_interf.begin(); it != server_interf.end(); it = server_interf.erase(it))
it->second->notifyContextDestroyed();
ASSERT_ABORT(server_interf.empty());
@ -1553,7 +1720,9 @@ Database Database::createDatabase(Reference<ClusterConnectionFile> connFile,
/*switchable*/ true);
}
return Database(db);
auto database = Database(db);
GlobalConfig::create(database, clientInfo, std::addressof(clientInfo->get()));
return database;
}
Database Database::createDatabase(std::string connFileName,
@ -2015,6 +2184,29 @@ ACTOR Future<Optional<vector<StorageServerInterface>>> transactionalGetServerInt
return serverInterfaces;
}
void updateTssMappings(Database cx, const GetKeyServerLocationsReply& reply) {
// Since a ss -> tss mapping is included in resultsTssMapping iff that SS is in results and has a tss pair,
// all SS in results that do not have a mapping present must not have a tss pair.
std::unordered_map<UID, const StorageServerInterface*> ssiById;
for (const auto& [_, shard] : reply.results) {
for (auto& ssi : shard) {
ssiById[ssi.id()] = &ssi;
}
}
for (const auto& mapping : reply.resultsTssMapping) {
auto ssi = ssiById.find(mapping.first);
ASSERT(ssi != ssiById.end());
cx->addTssMapping(*ssi->second, mapping.second);
ssiById.erase(mapping.first);
}
// if SS didn't have a mapping above, it's still in the ssiById map, so remove its tss mapping
for (const auto& it : ssiById) {
cx->removeTssMapping(*it.second);
}
}
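// Worked example (editorial; ss1/ss2/tss1 are illustrative names): suppose reply.results covers
// one shard served by { ss1, ss2 } and reply.resultsTssMapping contains only (ss1.id(), tss1).
// Then addTssMapping(ss1, tss1) runs and ss1 is erased from ssiById; ss2 remains in ssiById, so
// the loop above calls removeTssMapping(ss2), clearing any stale pairing it may have had.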
// If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key).
// Otherwise returns the shard containing key
ACTOR Future<pair<KeyRange, Reference<LocationInfo>>> getKeyLocation_internal(Database cx,
@ -2047,6 +2239,7 @@ ACTOR Future<pair<KeyRange, Reference<LocationInfo>>> getKeyLocation_internal(Da
ASSERT(rep.results.size() == 1);
auto locationInfo = cx->setCachedLocation(rep.results[0].first, rep.results[0].second);
updateTssMappings(cx, rep);
return std::make_pair(KeyRange(rep.results[0].first, rep.arena), locationInfo);
}
}
@ -2110,6 +2303,7 @@ ACTOR Future<vector<pair<KeyRange, Reference<LocationInfo>>>> getKeyRangeLocatio
cx->setCachedLocation(rep.results[shard].first, rep.results[shard].second));
wait(yield());
}
updateTssMappings(cx, rep);
return results;
}
@ -2235,7 +2429,7 @@ ACTOR Future<Optional<Value>> getValue(Future<Version> version,
state GetValueReply reply;
try {
if (CLIENT_BUGGIFY) {
if (CLIENT_BUGGIFY_WITH_PROB(.01)) {
throw deterministicRandom()->randomChoice(
std::vector<Error>{ transaction_too_old(), future_version() });
}
@ -2345,6 +2539,11 @@ ACTOR Future<Key> getKey(Database cx, KeySelector k, Future<Version> version, Tr
"NativeAPI.getKey.Before"); //.detail("StartKey",
// k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual);
++cx->transactionPhysicalReads;
GetKeyRequest req(
span.context, k, version.get(), cx->sampleReadTags() ? tags : Optional<TagSet>(), getKeyID);
req.arena.dependsOn(k.arena());
state GetKeyReply reply;
try {
choose {
@ -2353,11 +2552,7 @@ ACTOR Future<Key> getKey(Database cx, KeySelector k, Future<Version> version, Tr
wait(loadBalance(cx.getPtr(),
ssi.second,
&StorageServerInterface::getKey,
GetKeyRequest(span.context,
k,
version.get(),
cx->sampleReadTags() ? tags : Optional<TagSet>(),
getKeyID),
req,
TaskPriority::DefaultPromiseEndpoint,
false,
cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) {
@ -2718,6 +2913,9 @@ ACTOR Future<RangeResult> getExactRange(Database cx,
req.end = firstGreaterOrEqual(range.end);
req.spanContext = span.context;
// keep shard's arena around in case of async tss comparison
req.arena.dependsOn(locations[shard].first.arena());
transformRangeLimits(limits, reverse, req);
ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse);
@ -3034,6 +3232,9 @@ ACTOR Future<RangeResult> getRange(Database cx,
req.isFetchKeys = (info.taskID == TaskPriority::FetchKeys);
req.version = readVersion;
// In case of async tss comparison, also make req arena depend on begin, end, and/or shard's arena depending
// on which is used
bool dependOnShard = false;
if (reverse && (begin - 1).isDefinitelyLess(shard.begin) &&
(!begin.isFirstGreaterOrEqual() ||
begin.getKey() != shard.begin)) { // In this case we would be setting modifiedSelectors to true, but
@ -3041,14 +3242,23 @@ ACTOR Future<RangeResult> getRange(Database cx,
req.begin = firstGreaterOrEqual(shard.begin);
modifiedSelectors = true;
} else
req.arena.dependsOn(shard.arena());
dependOnShard = true;
} else {
req.begin = begin;
req.arena.dependsOn(begin.arena());
}
if (!reverse && end.isDefinitelyGreater(shard.end)) {
req.end = firstGreaterOrEqual(shard.end);
modifiedSelectors = true;
} else
if (!dependOnShard) {
req.arena.dependsOn(shard.arena());
}
} else {
req.end = end;
req.arena.dependsOn(end.arena());
}
transformRangeLimits(limits, reverse, req);
ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse);
@ -3078,7 +3288,7 @@ ACTOR Future<RangeResult> getRange(Database cx,
++cx->transactionPhysicalReads;
state GetKeyValuesReply rep;
try {
if (CLIENT_BUGGIFY) {
if (CLIENT_BUGGIFY_WITH_PROB(.01)) {
throw deterministicRandom()->randomChoice(
std::vector<Error>{ transaction_too_old(), future_version() });
}
@ -3133,10 +3343,17 @@ ACTOR Future<RangeResult> getRange(Database cx,
output.readThroughEnd = readThroughEnd;
if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows)) {
// Copy instead of resizing because the TSS may be using output's arena for comparison. This only
// happens in simulation so it's fine
RangeResult copy;
int newSize =
deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size());
for (int i = 0; i < newSize; i++) {
copy.push_back_deep(copy.arena(), output[i]);
}
output = copy;
output.more = true;
output.resize(
output.arena(),
deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size()));
getRangeFinished(cx,
trLogInfo,
startTime,


@ -144,6 +144,16 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"counter":0,
"roughness":0.0
},
"fetched_versions":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"fetches_from_logs":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"grv_latency_statistics":{
"default":{
"count":0,
@ -421,6 +431,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"seconds" : 1.0,
"versions" : 1000000
},
"active_tss_count":0,
"degraded_processes":0,
"database_available":true,
"database_lock_state": {
@ -648,6 +659,10 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"data_distribution_disabled_for_rebalance":true,
"data_distribution_disabled":true,
"active_primary_dc":"pv",
"bounce_impact":{
"can_clean_bounce":true,
"reason":""
},
"configuration":{
"log_anti_quorum":0,
"log_replicas":2,
@ -715,6 +730,19 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"memory-2",
"memory-radixtree-beta"
]},
"tss_count":1,
"tss_storage_engine":{
"$enum":[
"ssd",
"ssd-1",
"ssd-2",
"ssd-redwood-experimental",
"ssd-rocksdb-experimental",
"memory",
"memory-1",
"memory-2",
"memory-radixtree-beta"
]},
"coordinators_count":1,
"excluded_servers":[
{
@ -727,7 +755,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"auto_logs":3,
"commit_proxies":5,
"grv_proxies":1,
"backup_worker_enabled":1
"backup_worker_enabled":1,
"perpetual_storage_wiggle":0
},
"data":{
"least_operating_space_bytes_log_server":0,
@ -787,7 +816,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
}
}
],
"least_operating_space_bytes_storage_server":0
"least_operating_space_bytes_storage_server":0,
"max_machine_failures_without_losing_data":0
},
"machines":{
"$map":{


@ -1384,6 +1384,9 @@ Future<RangeResult> GlobalConfigImpl::getRange(ReadYourWritesTransaction* ryw, K
} else if (config->value.type() == typeid(int64_t)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<int64_t>(config->value))));
} else if (config->value.type() == typeid(bool)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<bool>(config->value))));
} else if (config->value.type() == typeid(float)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<float>(config->value))));
@ -2058,9 +2061,20 @@ Future<Optional<std::string>> DataDistributionImpl::commit(ReadYourWritesTransac
try {
int mode = boost::lexical_cast<int>(iter->value().second.get().toString());
Value modeVal = BinaryWriter::toValue(mode, Unversioned());
if (mode == 0 || mode == 1)
if (mode == 0 || mode == 1) {
// Whenever configuration changes or DD related system keyspace is changed,
// actor must grab the moveKeysLockOwnerKey and update moveKeysLockWriteKey.
// This prevents concurrent write to the same system keyspace.
// When the owner of the DD related system keyspace changes, DD will reboot
BinaryWriter wrMyOwner(Unversioned());
wrMyOwner << dataDistributionModeLock;
ryw->getTransaction().set(moveKeysLockOwnerKey, wrMyOwner.toValue());
BinaryWriter wrLastWrite(Unversioned());
wrLastWrite << deterministicRandom()->randomUniqueID();
ryw->getTransaction().set(moveKeysLockWriteKey, wrLastWrite.toValue());
// set mode
ryw->getTransaction().set(dataDistributionModeKey, modeVal);
else
} else
msg = ManagementAPIError::toJsonString(false,
"datadistribution",
"Please set the value of the data_distribution/mode to "


@ -0,0 +1,385 @@
/*
* StorageServerInterface.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/StorageServerInterface.h"
#include "flow/crc32c.h" // for crc32c_append, to checksum values in tss trace events
// Includes template specializations for all tss operations on storage server types.
// New StorageServerInterface reply types must be added here or it won't compile.
// If the value's size plus the hex of its checksum is shorter than the value itself, record that
// instead of the actual value. The break-even point is 12 characters.
std::string traceChecksumValue(ValueRef s) {
return s.size() > 12 ? format("(%d)%08x", s.size(), crc32c_append(0, s.begin(), s.size())) : s.toString();
}
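// Worked example (editorial): a 12-byte value such as "ABCDEFGHIJKL" is logged verbatim, while a
// 13-byte value is logged as "(13)" plus 8 hex digits -- exactly 12 characters, hence the
// break-even point of 12 noted above.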
template <>
bool TSS_doCompare(const GetValueRequest& req,
const GetValueReply& src,
const GetValueReply& tss,
Severity traceSeverity,
UID tssId) {
if (src.value.present() != tss.value.present() || (src.value.present() && src.value.get() != tss.value.get())) {
TraceEvent(traceSeverity, "TSSMismatchGetValue")
.suppressFor(1.0)
.detail("TSSID", tssId)
.detail("Key", req.key.printable())
.detail("Version", req.version)
.detail("SSReply", src.value.present() ? traceChecksumValue(src.value.get()) : "missing")
.detail("TSSReply", tss.value.present() ? traceChecksumValue(tss.value.get()) : "missing");
return false;
}
return true;
}
template <>
bool TSS_doCompare(const GetKeyRequest& req,
const GetKeyReply& src,
const GetKeyReply& tss,
Severity traceSeverity,
UID tssId) {
// This process is a bit complicated. Since the tss and ss can return different results if neighboring shards to
req.sel.key are currently being moved, we validate that the results are the same IF the returned key selectors
// are final. Otherwise, we only mark the request as a mismatch if the difference between the two returned key
// selectors could ONLY be because of different results from the storage engines. We can afford to only partially
// check key selectors that start in a TSS shard and end in a non-TSS shard because the other read queries and the
// consistency check will eventually catch a misbehaving storage engine.
bool matches = true;
if (src.sel.orEqual == tss.sel.orEqual && src.sel.offset == tss.sel.offset) {
// full matching case
if (src.sel.offset == 0 && src.sel.orEqual) {
// found exact key, should be identical
matches = src.sel.getKey() == tss.sel.getKey();
}
// if the query doesn't return the final key, there is an edge case where the ss and tss have different shard
// boundaries, so they pass different shard boundary keys back for the same offset
} else if (src.sel.getKey() == tss.sel.getKey()) {
// There is one case with a positive offset where the shard boundary the incomplete query stopped at is the next
// key in the shard that the complete query returned. This is not possible with a negative offset because the
// shard boundary is exclusive backwards
if (src.sel.offset == 0 && src.sel.orEqual && tss.sel.offset == 1 && !tss.sel.orEqual) {
// case where ss was complete and tss was incomplete
} else if (tss.sel.offset == 0 && tss.sel.orEqual && src.sel.offset == 1 && !src.sel.orEqual) {
// case where tss was complete and ss was incomplete
} else {
matches = false;
}
} else {
// ss/tss returned different keys, and different offsets and/or orEqual
// here we just validate that ordering of the keys matches the ordering of the offsets
bool tssKeyLarger = src.sel.getKey() < tss.sel.getKey();
// the only case offsets are equal and orEqual aren't equal is the case with a negative offset,
// where one response has <=0 with the actual result and the other has <0 with the shard upper boundary.
// So whichever one has the actual result should have the lower key.
bool tssOffsetLarger = (src.sel.offset == tss.sel.offset) ? tss.sel.orEqual : src.sel.offset < tss.sel.offset;
matches = tssKeyLarger != tssOffsetLarger;
}
if (!matches) {
TraceEvent(traceSeverity, "TSSMismatchGetKey")
.suppressFor(1.0)
.detail("TSSID", tssId)
.detail("KeySelector",
format("%s%s:%d", req.sel.orEqual ? "=" : "", req.sel.getKey().printable().c_str(), req.sel.offset))
.detail("Version", req.version)
.detail("SSReply",
format("%s%s:%d", src.sel.orEqual ? "=" : "", src.sel.getKey().printable().c_str(), src.sel.offset))
.detail(
"TSSReply",
format("%s%s:%d", tss.sel.orEqual ? "=" : "", tss.sel.getKey().printable().c_str(), tss.sel.offset));
}
return matches;
}
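// Worked example (editorial): with req.sel = "a"/+1, an SS reply of ("b", orEqual, offset 0)
// (a complete query that found "b") and a TSS reply of ("b", offset 1, not orEqual) (an
// incomplete query that stopped at the same shard-boundary key) take the
// src.sel.getKey() == tss.sel.getKey() branch above and are treated as a match.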
template <>
bool TSS_doCompare(const GetKeyValuesRequest& req,
const GetKeyValuesReply& src,
const GetKeyValuesReply& tss,
Severity traceSeverity,
UID tssId) {
if (src.more != tss.more || src.data != tss.data) {
std::string ssResultsString = format("(%d)%s:\n", src.data.size(), src.more ? "+" : "");
for (auto& it : src.data) {
ssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value);
}
std::string tssResultsString = format("(%d)%s:\n", tss.data.size(), tss.more ? "+" : "");
for (auto& it : tss.data) {
tssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value);
}
TraceEvent(traceSeverity, "TSSMismatchGetKeyValues")
.suppressFor(1.0)
.detail("TSSID", tssId)
.detail(
"Begin",
format(
"%s%s:%d", req.begin.orEqual ? "=" : "", req.begin.getKey().printable().c_str(), req.begin.offset))
.detail("End",
format("%s%s:%d", req.end.orEqual ? "=" : "", req.end.getKey().printable().c_str(), req.end.offset))
.detail("Version", req.version)
.detail("Limit", req.limit)
.detail("LimitBytes", req.limitBytes)
.detail("SSReply", ssResultsString)
.detail("TSSReply", tssResultsString);
return false;
}
return true;
}
template <>
bool TSS_doCompare(const WatchValueRequest& req,
const WatchValueReply& src,
const WatchValueReply& tss,
Severity traceSeverity,
UID tssId) {
// We duplicate watches just for load; no need to validate replies.
return true;
}
// no-op template specializations for metrics replies
template <>
bool TSS_doCompare(const WaitMetricsRequest& req,
const StorageMetrics& src,
const StorageMetrics& tss,
Severity traceSeverity,
UID tssId) {
return true;
}
template <>
bool TSS_doCompare(const SplitMetricsRequest& req,
const SplitMetricsReply& src,
const SplitMetricsReply& tss,
Severity traceSeverity,
UID tssId) {
return true;
}
template <>
bool TSS_doCompare(const ReadHotSubRangeRequest& req,
const ReadHotSubRangeReply& src,
const ReadHotSubRangeReply& tss,
Severity traceSeverity,
UID tssId) {
return true;
}
template <>
bool TSS_doCompare(const SplitRangeRequest& req,
const SplitRangeReply& src,
const SplitRangeReply& tss,
Severity traceSeverity,
UID tssId) {
return true;
}
// only record metrics for data reads
template <>
void TSSMetrics::recordLatency(const GetValueRequest& req, double ssLatency, double tssLatency) {
SSgetValueLatency.addSample(ssLatency);
TSSgetValueLatency.addSample(tssLatency);
}
template <>
void TSSMetrics::recordLatency(const GetKeyRequest& req, double ssLatency, double tssLatency) {
SSgetKeyLatency.addSample(ssLatency);
TSSgetKeyLatency.addSample(tssLatency);
}
template <>
void TSSMetrics::recordLatency(const GetKeyValuesRequest& req, double ssLatency, double tssLatency) {
SSgetKeyValuesLatency.addSample(ssLatency);
TSSgetKeyValuesLatency.addSample(tssLatency);
}
template <>
void TSSMetrics::recordLatency(const WatchValueRequest& req, double ssLatency, double tssLatency) {}
template <>
void TSSMetrics::recordLatency(const WaitMetricsRequest& req, double ssLatency, double tssLatency) {}
template <>
void TSSMetrics::recordLatency(const SplitMetricsRequest& req, double ssLatency, double tssLatency) {}
template <>
void TSSMetrics::recordLatency(const ReadHotSubRangeRequest& req, double ssLatency, double tssLatency) {}
template <>
void TSSMetrics::recordLatency(const SplitRangeRequest& req, double ssLatency, double tssLatency) {}
// -------------------
TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") {
printf("testing tss comparisons\n");
// use std::string to avoid compiler warnings that StringRef(char*) is deprecated
std::string s_a = "a";
std::string s_b = "b";
std::string s_c = "c";
std::string s_d = "d";
std::string s_e = "e";
// test getValue
GetValueRequest gvReq;
gvReq.key = StringRef(s_a);
gvReq.version = 5;
UID tssId;
GetValueReply gvReplyMissing;
GetValueReply gvReplyA(Optional<Value>(StringRef(s_a)), false);
GetValueReply gvReplyB(Optional<Value>(StringRef(s_b)), false);
ASSERT(TSS_doCompare(gvReq, gvReplyMissing, gvReplyMissing, SevInfo, tssId));
ASSERT(TSS_doCompare(gvReq, gvReplyA, gvReplyA, SevInfo, tssId));
ASSERT(TSS_doCompare(gvReq, gvReplyB, gvReplyB, SevInfo, tssId));
ASSERT(!TSS_doCompare(gvReq, gvReplyMissing, gvReplyA, SevInfo, tssId));
ASSERT(!TSS_doCompare(gvReq, gvReplyA, gvReplyB, SevInfo, tssId));
// test GetKeyValues
Arena a; // for all of the refs. ASAN complains if this isn't done. Could also make them all standalone, I guess
GetKeyValuesRequest gkvReq;
gkvReq.begin = firstGreaterOrEqual(StringRef(a, s_a));
gkvReq.end = firstGreaterOrEqual(StringRef(a, s_b));
gkvReq.version = 5;
gkvReq.limit = 100;
gkvReq.limitBytes = 1000;
GetKeyValuesReply gkvReplyEmpty;
GetKeyValuesReply gkvReplyOne;
KeyValueRef v;
v.key = StringRef(a, s_a);
v.value = StringRef(a, s_b);
gkvReplyOne.data.push_back_deep(gkvReplyOne.arena, v);
GetKeyValuesReply gkvReplyOneMore;
gkvReplyOneMore.data.push_back_deep(gkvReplyOneMore.arena, v);
gkvReplyOneMore.more = true;
ASSERT(TSS_doCompare(gkvReq, gkvReplyEmpty, gkvReplyEmpty, SevInfo, tssId));
ASSERT(TSS_doCompare(gkvReq, gkvReplyOne, gkvReplyOne, SevInfo, tssId));
ASSERT(TSS_doCompare(gkvReq, gkvReplyOneMore, gkvReplyOneMore, SevInfo, tssId));
ASSERT(!TSS_doCompare(gkvReq, gkvReplyEmpty, gkvReplyOne, SevInfo, tssId));
ASSERT(!TSS_doCompare(gkvReq, gkvReplyOne, gkvReplyOneMore, SevInfo, tssId));
// test GetKey
GetKeyRequest gkReq;
gkReq.sel = KeySelectorRef(StringRef(a, s_a), false, 1);
gkReq.version = 5;
GetKeyReply gkReplyA(KeySelectorRef(StringRef(a, s_a), false, 20), false);
GetKeyReply gkReplyB(KeySelectorRef(StringRef(a, s_b), false, 10), false);
GetKeyReply gkReplyC(KeySelectorRef(StringRef(a, s_c), true, 0), false);
GetKeyReply gkReplyD(KeySelectorRef(StringRef(a, s_d), false, -10), false);
GetKeyReply gkReplyE(KeySelectorRef(StringRef(a, s_e), false, -20), false);
// identical cases
ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyA, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyB, gkReplyB, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyC, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyD, gkReplyD, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyE, gkReplyE, SevInfo, tssId));
// relative offset cases
ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyB, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyB, gkReplyA, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyC, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyA, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyB, gkReplyC, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyB, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyD, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyD, gkReplyC, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyE, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyE, gkReplyC, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyD, gkReplyE, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyE, gkReplyD, SevInfo, tssId));
// test same offset/orEqual wrong key
ASSERT(!TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), true, 0), false),
SevInfo,
tssId));
// this could be from different shard boundaries, so don't say it's a mismatch
ASSERT(TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 10), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 10), false),
SevInfo,
tssId));
// test offsets and key difference don't match
ASSERT(!TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 0), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 10), false),
SevInfo,
tssId));
ASSERT(!TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, -10), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 0), false),
SevInfo,
tssId));
// test key is next over in one shard, one found it and other didn't
// positive
// one that didn't find is +1
ASSERT(TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 1), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), true, 0), false),
SevInfo,
tssId));
ASSERT(!TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 1), false),
SevInfo,
tssId));
// negative will have zero offset but not equal set
ASSERT(TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 0), false),
SevInfo,
tssId));
ASSERT(!TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 0), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), true, 0), false),
SevInfo,
tssId));
// test shard boundary key returned by incomplete query is the same as the key found by the other (only possible in
// positive direction)
ASSERT(TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 1), false),
SevInfo,
tssId));
// explicitly test the checksum function
std::string s12 = "ABCDEFGHIJKL";
std::string s13 = "ABCDEFGHIJKLO";
std::string checksumStart13 = "(13)";
ASSERT(s_a == traceChecksumValue(StringRef(s_a)));
ASSERT(s12 == traceChecksumValue(StringRef(s12)));
ASSERT(checksumStart13 == traceChecksumValue(StringRef(s13)).substr(0, 4));
return Void();
}


@ -29,7 +29,9 @@
#include "fdbrpc/LoadBalance.actor.h"
#include "fdbrpc/Stats.h"
#include "fdbrpc/TimedRequest.h"
#include "fdbrpc/TSSComparison.h"
#include "fdbclient/TagThrottle.h"
#include "flow/UnitTest.h"
// Dead code, removed in the next protocol version
struct VersionReply {
@ -54,6 +56,7 @@ struct StorageServerInterface {
LocalityData locality;
UID uniqueID;
Optional<UID> tssPairID;
RequestStream<struct GetValueRequest> getValue;
RequestStream<struct GetKeyRequest> getKey;
@ -80,6 +83,7 @@ struct StorageServerInterface {
NetworkAddress stableAddress() const { return getValue.getEndpoint().getStableAddress(); }
Optional<NetworkAddress> secondaryAddress() const { return getValue.getEndpoint().addresses.secondaryAddress; }
UID id() const { return uniqueID; }
bool isTss() const { return tssPairID.present(); }
std::string toString() const { return id().shortString(); }
template <class Ar>
void serialize(Ar& ar) {
@ -88,7 +92,11 @@ struct StorageServerInterface {
// considered
if (ar.protocolVersion().hasSmallEndpoints()) {
serializer(ar, uniqueID, locality, getValue);
if (ar.protocolVersion().hasTSS()) {
serializer(ar, uniqueID, locality, getValue, tssPairID);
} else {
serializer(ar, uniqueID, locality, getValue);
}
if (Ar::isDeserializing) {
getKey = RequestStream<struct GetKeyRequest>(getValue.getEndpoint().getAdjustedEndpoint(1));
getKeyValues = RequestStream<struct GetKeyValuesRequest>(getValue.getEndpoint().getAdjustedEndpoint(2));
@ -127,8 +135,9 @@ struct StorageServerInterface {
waitFailure,
getQueuingMetrics,
getKeyValueStoreType);
if (ar.protocolVersion().hasWatches())
if (ar.protocolVersion().hasWatches()) {
serializer(ar, watchValue);
}
}
}
bool operator==(StorageServerInterface const& s) const { return uniqueID == s.uniqueID; }


@ -25,6 +25,7 @@
#include "flow/Arena.h"
#include "flow/TDMetric.actor.h"
#include "flow/serialize.h"
#include "flow/UnitTest.h"
const KeyRef systemKeysPrefix = LiteralStringRef("\xff");
const KeyRangeRef normalKeys(KeyRef(), systemKeysPrefix);
@ -345,7 +346,10 @@ uint16_t cacheChangeKeyDecodeIndex(const KeyRef& key) {
return idx;
}
const KeyRangeRef tssMappingKeys(LiteralStringRef("\xff/tss/"), LiteralStringRef("\xff/tss0"));
const KeyRangeRef serverTagKeys(LiteralStringRef("\xff/serverTag/"), LiteralStringRef("\xff/serverTag0"));
const KeyRef serverTagPrefix = serverTagKeys.begin;
const KeyRangeRef serverTagConflictKeys(LiteralStringRef("\xff/serverTagConflict/"),
LiteralStringRef("\xff/serverTagConflict0"));
@ -532,6 +536,7 @@ const Key serverListKeyFor(UID serverID) {
return wr.toValue();
}
// TODO use flatbuffers depending on version
const Value serverListValue(StorageServerInterface const& server) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withServerListValue()));
wr << server;
@ -550,6 +555,17 @@ StorageServerInterface decodeServerListValue(ValueRef const& value) {
return s;
}
const Value serverListValueFB(StorageServerInterface const& server) {
return ObjectWriter::toValue(server, IncludeVersion());
}
StorageServerInterface decodeServerListValueFB(ValueRef const& value) {
StorageServerInterface s;
ObjectReader reader(value.begin(), IncludeVersion());
reader.deserialize(s);
return s;
}
// processClassKeys.contains(k) iff k.startsWith( processClassKeys.begin ) because '/'+1 == '0'
const KeyRangeRef processClassKeys(LiteralStringRef("\xff/processClass/"), LiteralStringRef("\xff/processClass0"));
const KeyRef processClassPrefix = processClassKeys.begin;
@ -594,6 +610,9 @@ ProcessClass decodeProcessClassValue(ValueRef const& value) {
const KeyRangeRef configKeys(LiteralStringRef("\xff/conf/"), LiteralStringRef("\xff/conf0"));
const KeyRef configKeysPrefix = configKeys.begin;
const KeyRef perpetualStorageWiggleKey(LiteralStringRef("\xff/conf/perpetual_storage_wiggle"));
const KeyRef wigglingStorageServerKey(LiteralStringRef("\xff/storageWigglePID"));
const KeyRef triggerDDTeamInfoPrintKey(LiteralStringRef("\xff/triggerDDTeamInfoPrint"));
const KeyRangeRef excludedServersKeys(LiteralStringRef("\xff/conf/excluded/"), LiteralStringRef("\xff/conf/excluded0"));
@ -633,15 +652,17 @@ std::string encodeFailedServersKey(AddressExclusion const& addr) {
// const KeyRangeRef globalConfigKeys( LiteralStringRef("\xff/globalConfig/"), LiteralStringRef("\xff/globalConfig0") );
// const KeyRef globalConfigPrefix = globalConfigKeys.begin;
const KeyRangeRef globalConfigDataKeys( LiteralStringRef("\xff/globalConfig/k/"), LiteralStringRef("\xff/globalConfig/k0") );
const KeyRangeRef globalConfigDataKeys(LiteralStringRef("\xff/globalConfig/k/"),
LiteralStringRef("\xff/globalConfig/k0"));
const KeyRef globalConfigKeysPrefix = globalConfigDataKeys.begin;
const KeyRangeRef globalConfigHistoryKeys( LiteralStringRef("\xff/globalConfig/h/"), LiteralStringRef("\xff/globalConfig/h0") );
const KeyRangeRef globalConfigHistoryKeys(LiteralStringRef("\xff/globalConfig/h/"),
LiteralStringRef("\xff/globalConfig/h0"));
const KeyRef globalConfigHistoryPrefix = globalConfigHistoryKeys.begin;
const KeyRef globalConfigVersionKey = LiteralStringRef("\xff/globalConfig/v");
const KeyRangeRef workerListKeys( LiteralStringRef("\xff/worker/"), LiteralStringRef("\xff/worker0") );
const KeyRangeRef workerListKeys(LiteralStringRef("\xff/worker/"), LiteralStringRef("\xff/worker0"));
const KeyRef workerListPrefix = workerListKeys.begin;
const Key workerListKeyFor(StringRef processID) {
@ -1082,3 +1103,60 @@ const KeyRangeRef testOnlyTxnStateStorePrefixRange(LiteralStringRef("\xff/TESTON
const KeyRef writeRecoveryKey = LiteralStringRef("\xff/writeRecovery");
const ValueRef writeRecoveryKeyTrue = LiteralStringRef("1");
const KeyRef snapshotEndVersionKey = LiteralStringRef("\xff/snapshotEndVersion");
// for tests
void testSSISerdes(StorageServerInterface const& ssi, bool useFB) {
printf("ssi=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n",
ssi.id().toString().c_str(),
ssi.locality.toString().c_str(),
ssi.isTss() ? "true" : "false",
ssi.isTss() ? ssi.tssPairID.get().toString().c_str() : "",
ssi.address().toString().c_str(),
ssi.getValue.getEndpoint().token.toString().c_str());
StorageServerInterface ssi2 =
(useFB) ? decodeServerListValueFB(serverListValueFB(ssi)) : decodeServerListValue(serverListValue(ssi));
printf("ssi2=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n",
ssi2.id().toString().c_str(),
ssi2.locality.toString().c_str(),
ssi2.isTss() ? "true" : "false",
ssi2.isTss() ? ssi2.tssPairID.get().toString().c_str() : "",
ssi2.address().toString().c_str(),
ssi2.getValue.getEndpoint().token.toString().c_str());
ASSERT(ssi.id() == ssi2.id());
ASSERT(ssi.locality == ssi2.locality);
ASSERT(ssi.isTss() == ssi2.isTss());
if (ssi.isTss()) {
ASSERT(ssi.tssPairID.get() == ssi2.tssPairID.get());
}
ASSERT(ssi.address() == ssi2.address());
ASSERT(ssi.getValue.getEndpoint().token == ssi2.getValue.getEndpoint().token);
}
// unit test for serialization since tss stuff had bugs
TEST_CASE("/SystemData/SerDes/SSI") {
printf("testing ssi serdes\n");
LocalityData localityData(Optional<Standalone<StringRef>>(),
Standalone<StringRef>(deterministicRandom()->randomUniqueID().toString()),
Standalone<StringRef>(deterministicRandom()->randomUniqueID().toString()),
Optional<Standalone<StringRef>>());
// non-tss
StorageServerInterface ssi;
ssi.uniqueID = UID(0x1234123412341234, 0x5678567856785678);
ssi.locality = localityData;
ssi.initEndpoints();
testSSISerdes(ssi, false);
testSSISerdes(ssi, true);
ssi.tssPairID = UID(0x2345234523452345, 0x1238123812381238);
testSSISerdes(ssi, false);
testSSISerdes(ssi, true);
printf("ssi serdes test complete\n");
return Void();
}


@ -115,6 +115,9 @@ extern const KeyRef cacheChangePrefix;
const Key cacheChangeKeyFor(uint16_t idx);
uint16_t cacheChangeKeyDecodeIndex(const KeyRef& key);
// "\xff/tss/[[serverId]]" := "[[tssId]]"
extern const KeyRangeRef tssMappingKeys;
// "\xff/serverTag/[[serverID]]" = "[[Tag]]"
// Provides the Tag for the given serverID. Used to access a
// storage server's corresponding TLog in order to apply mutations.
@ -196,6 +199,8 @@ UID decodeProcessClassKeyOld(KeyRef const& key);
extern const KeyRangeRef configKeys;
extern const KeyRef configKeysPrefix;
extern const KeyRef perpetualStorageWiggleKey;
extern const KeyRef wigglingStorageServerKey;
// Change the value of this key to anything and that will trigger detailed data distribution team info log.
extern const KeyRef triggerDDTeamInfoPrintKey;


@ -71,6 +71,8 @@ Tuple::Tuple(StringRef const& str, bool exclude_incomplete) {
i += sizeof(float) + 1;
} else if (data[i] == 0x21) {
i += sizeof(double) + 1;
} else if (data[i] == 0x26 || data[i] == 0x27) {
i += 1;
} else if (data[i] == '\x00') {
i += 1;
} else {
@ -144,6 +146,16 @@ Tuple& Tuple::append(int64_t value) {
return *this;
}
Tuple& Tuple::appendBool(bool value) {
offsets.push_back(data.size());
if (value) {
data.push_back(data.arena(), 0x27);
} else {
data.push_back(data.arena(), 0x26);
}
return *this;
}
Tuple& Tuple::appendFloat(float value) {
offsets.push_back(data.size());
float swap = bigEndianFloat(value);
@ -192,6 +204,8 @@ Tuple::ElementType Tuple::getType(size_t index) const {
return ElementType::FLOAT;
} else if (code == 0x21) {
return ElementType::DOUBLE;
} else if (code == 0x26 || code == 0x27) {
return ElementType::BOOL;
} else {
throw invalid_tuple_data_type();
}
@ -287,6 +301,21 @@ int64_t Tuple::getInt(size_t index, bool allow_incomplete) const {
}
// TODO: Combine with bindings/flow/Tuple.*. This code is copied from there.
bool Tuple::getBool(size_t index) const {
if (index >= offsets.size()) {
throw invalid_tuple_index();
}
ASSERT_LT(offsets[index], data.size());
uint8_t code = data[offsets[index]];
if (code == 0x26) {
return false;
} else if (code == 0x27) {
return true;
} else {
throw invalid_tuple_data_type();
}
}
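// Hedged usage sketch (editorial): round-trips the new bool element type using only the APIs
// added in this commit (appendBool, getType, getBool).
static void tupleBoolExample() {
Tuple t;
t.appendBool(true).appendBool(false); // encoded as 0x27 and 0x26 respectively
ASSERT(t.getType(0) == Tuple::BOOL && t.getType(1) == Tuple::BOOL);
ASSERT(t.getBool(0) == true);
ASSERT(t.getBool(1) == false);
}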
float Tuple::getFloat(size_t index) const {
if (index >= offsets.size()) {
throw invalid_tuple_index();


@ -40,6 +40,7 @@ struct Tuple {
Tuple& append(int64_t);
// There are some ambiguous append calls in fdbclient, so to make it easier
// to add append for floats and doubles, name them differently for now.
Tuple& appendBool(bool);
Tuple& appendFloat(float);
Tuple& appendDouble(double);
Tuple& appendNull();
@ -51,7 +52,7 @@ struct Tuple {
return append(t);
}
enum ElementType { NULL_TYPE, INT, BYTES, UTF8, FLOAT, DOUBLE };
enum ElementType { NULL_TYPE, INT, BYTES, UTF8, BOOL, FLOAT, DOUBLE };
// this is number of elements, not length of data
size_t size() const { return offsets.size(); }
@ -59,6 +60,7 @@ struct Tuple {
ElementType getType(size_t index) const;
Standalone<StringRef> getString(size_t index) const;
int64_t getInt(size_t index, bool allow_incomplete = false) const;
bool getBool(size_t index) const;
float getFloat(size_t index) const;
double getDouble(size_t index) const;


@ -46,7 +46,8 @@ EvictablePage::~EvictablePage() {
}
}
std::map<std::string, OpenFileInfo> AsyncFileCached::openFiles;
// A map of filename to the file handle for all opened cached files
std::map<std::string, UnsafeWeakFutureReference<IAsyncFile>> AsyncFileCached::openFiles;
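// Editorial note (assumption inferred from the usages in this commit): the weak reference means
// this map by itself does not keep an IAsyncFile alive; callers obtain a strong reference via
// get(), or peek with getPtrIfReady(), and stale entries are erased when the file is destroyed.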
void AsyncFileCached::remove_page(AFCPage* page) {
pages.erase(page->pageOffset);


@ -132,39 +132,32 @@ struct EvictablePageCache : ReferenceCounted<EvictablePageCache> {
const CacheEvictionType cacheEvictionType;
};
struct OpenFileInfo : NonCopyable {
IAsyncFile* f;
Future<Reference<IAsyncFile>> opened; // Only valid until the file is fully opened
OpenFileInfo() : f(0) {}
OpenFileInfo(OpenFileInfo&& r) noexcept : f(r.f), opened(std::move(r.opened)) { r.f = 0; }
Future<Reference<IAsyncFile>> get() {
if (f)
return Reference<IAsyncFile>::addRef(f);
else
return opened;
}
};
struct AFCPage;
class AsyncFileCached final : public IAsyncFile, public ReferenceCounted<AsyncFileCached> {
friend struct AFCPage;
public:
// Opens a file that uses the FDB in-memory page cache
static Future<Reference<IAsyncFile>> open(std::string filename, int flags, int mode) {
//TraceEvent("AsyncFileCachedOpen").detail("Filename", filename);
if (openFiles.find(filename) == openFiles.end()) {
auto itr = openFiles.find(filename);
if (itr == openFiles.end()) {
auto f = open_impl(filename, flags, mode);
if (f.isReady() && f.isError())
return f;
if (!f.isReady())
openFiles[filename].opened = f;
else
return f.get();
auto result = openFiles.try_emplace(filename, f);
// This should be inserting a new entry
ASSERT(result.second);
itr = result.first;
// We return here instead of falling through to the outer scope so that we don't delete all references to
// the underlying file before returning
return itr->second.get();
}
return openFiles[filename].get();
return itr->second.get();
}
Future<int> read(void* data, int length, int64_t offset) override {
@ -263,7 +256,9 @@ public:
~AsyncFileCached() override;
private:
static std::map<std::string, OpenFileInfo> openFiles;
// A map of filename to the file handle for all opened cached files
static std::map<std::string, UnsafeWeakFutureReference<IAsyncFile>> openFiles;
std::string filename;
Reference<IAsyncFile> uncached;
int64_t length;
@ -330,6 +325,7 @@ private:
static Future<Reference<IAsyncFile>> open_impl(std::string filename, int flags, int mode);
// Opens a file that uses the FDB in-memory page cache
ACTOR static Future<Reference<IAsyncFile>> open_impl(std::string filename,
int flags,
int mode,
@ -345,10 +341,7 @@ private:
TraceEvent("AFCUnderlyingOpenEnd").detail("Filename", filename);
int64_t l = wait(f->size());
TraceEvent("AFCUnderlyingSize").detail("Filename", filename).detail("Size", l);
auto& of = openFiles[filename];
of.f = new AsyncFileCached(f, filename, l, pageCache);
of.opened = Future<Reference<IAsyncFile>>();
return Reference<IAsyncFile>(of.f);
return new AsyncFileCached(f, filename, l, pageCache);
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)
openFiles.erase(filename);


@ -130,6 +130,9 @@ public:
UID id;
std::string filename;
// For files that use atomic write and create, they are initially created with an extra suffix
std::string initialFilename;
// An approximation of the size of the file; .size() should be used instead of this variable in most cases
mutable int64_t approximateSize;
@ -182,11 +185,13 @@ private:
reponses; // cannot call getResult on this actor collection, since the actors will be on different processes
AsyncFileNonDurable(const std::string& filename,
const std::string& initialFilename,
Reference<IAsyncFile> file,
Reference<DiskParameters> diskParameters,
NetworkAddress openedAddress,
bool aio)
: openedAddress(openedAddress), pendingModifications(uint64_t(-1)), approximateSize(0), reponses(false),
: filename(filename), initialFilename(initialFilename), file(file), diskParameters(diskParameters),
openedAddress(openedAddress), pendingModifications(uint64_t(-1)), approximateSize(0), reponses(false),
aio(aio) {
// This is only designed to work in simulation
@ -194,9 +199,6 @@ private:
this->id = deterministicRandom()->randomUniqueID();
//TraceEvent("AsyncFileNonDurable_Create", id).detail("Filename", filename);
this->file = file;
this->filename = filename;
this->diskParameters = diskParameters;
maxWriteDelay = FLOW_KNOBS->NON_DURABLE_MAX_WRITE_DELAY;
hasBeenSynced = false;
@ -236,10 +238,11 @@ public:
//TraceEvent("AsyncFileNonDurableOpenWaitOnDelete2").detail("Filename", filename);
if (shutdown.isReady())
throw io_error().asInjectedFault();
wait(g_simulator.onProcess(currentProcess, currentTaskID));
}
state Reference<AsyncFileNonDurable> nonDurableFile(
new AsyncFileNonDurable(filename, file, diskParameters, currentProcess->address, aio));
new AsyncFileNonDurable(filename, actualFilename, file, diskParameters, currentProcess->address, aio));
// Causes the approximateSize member to be set
state Future<int64_t> sizeFuture = nonDurableFile->size();
@ -269,13 +272,38 @@ public:
}
void addref() override { ReferenceCounted<AsyncFileNonDurable>::addref(); }
void delref() override {
if (delref_no_destroy()) {
ASSERT(filesBeingDeleted.count(filename) == 0);
//TraceEvent("AsyncFileNonDurable_StartDelete", id).detail("Filename", filename);
Future<Void> deleteFuture = deleteFile(this);
if (!deleteFuture.isReady())
filesBeingDeleted[filename] = deleteFuture;
if (filesBeingDeleted.count(filename) == 0) {
//TraceEvent("AsyncFileNonDurable_StartDelete", id).detail("Filename", filename);
Future<Void> deleteFuture = deleteFile(this);
if (!deleteFuture.isReady())
filesBeingDeleted[filename] = deleteFuture;
}
removeOpenFile(filename, this);
if (initialFilename != filename) {
removeOpenFile(initialFilename, this);
}
}
}
// Removes a file from the openFiles map
static void removeOpenFile(std::string filename, AsyncFileNonDurable* file) {
auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles;
auto iter = openFiles.find(filename);
// Various actions (e.g. simulated delete) can remove a file from openFiles prematurely, so it may already
// be gone. Renamed files (from atomic write and create) will also be present under only one of the two
// names.
if (iter != openFiles.end()) {
// even if the filename exists, it doesn't mean that it references the same file. It could be that the
// file was renamed and later a file with the same name was opened.
if (iter->second.getPtrIfReady().orDefault(nullptr) == file) {
openFiles.erase(iter);
}
}
}
@ -832,11 +860,9 @@ private:
//TraceEvent("AsyncFileNonDurable_FinishDelete", self->id).detail("Filename", self->filename);
delete self;
wait(g_simulator.onProcess(currentProcess, currentTaskID));
return Void();
} catch (Error& e) {
state Error err = e;
wait(g_simulator.onProcess(currentProcess, currentTaskID));
throw err;
}
}


@ -29,7 +29,8 @@ set(FDBRPC_SRCS
sim2.actor.cpp
sim_validation.cpp
TimedRequest.h
TraceFileIO.cpp)
TraceFileIO.cpp
TSSComparison.h)
set(COMPILE_EIO OFF)


@ -51,6 +51,8 @@ constexpr UID WLTOKEN_PING_PACKET(-1, 1);
constexpr int PACKET_LEN_WIDTH = sizeof(uint32_t);
const uint64_t TOKEN_STREAM_FLAG = 1;
const int WLTOKEN_COUNTS = 12; // number of wellKnownEndpoints
class EndpointMap : NonCopyable {
public:
// Reserve space for this many wellKnownEndpoints
@ -96,6 +98,7 @@ void EndpointMap::realloc() {
void EndpointMap::insertWellKnown(NetworkMessageReceiver* r, const Endpoint::Token& token, TaskPriority priority) {
int index = token.second();
ASSERT(index < WLTOKEN_COUNTS);
ASSERT(data[index].receiver == nullptr);
data[index].receiver = r;
data[index].token() =
@ -334,7 +337,7 @@ ACTOR Future<Void> pingLatencyLogger(TransportData* self) {
}
TransportData::TransportData(uint64_t transportId)
: endpoints(/*wellKnownTokenCount*/ 11), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints),
: endpoints(WLTOKEN_COUNTS), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints),
warnAlwaysForLargePacket(true), lastIncompatibleMessage(0), transportId(transportId),
numIncompatibleConnections(0) {
degraded = makeReference<AsyncVar<bool>>(false);
@ -1215,7 +1218,7 @@ ACTOR static Future<Void> connectionReader(TransportData* transport,
}
compatible = false;
if (!protocolVersion.hasInexpensiveMultiVersionClient()) {
if(peer) {
if (peer) {
peer->protocolVersion->set(protocolVersion);
}


@ -36,6 +36,8 @@
#include "fdbrpc/Locality.h"
#include "fdbrpc/QueueModel.h"
#include "fdbrpc/MultiInterface.h"
#include "fdbrpc/simulator.h" // for checking tss simulation mode
#include "fdbrpc/TSSComparison.h"
#include "flow/actorcompiler.h" // This must be the last #include.
using std::vector;
@ -75,6 +77,97 @@ struct LoadBalancedReply {
Optional<LoadBalancedReply> getLoadBalancedReply(const LoadBalancedReply* reply);
Optional<LoadBalancedReply> getLoadBalancedReply(const void*);
ACTOR template <class Req, class Resp>
Future<Void> tssComparison(Req req,
Future<ErrorOr<Resp>> fSource,
Future<ErrorOr<Resp>> fTss,
TSSEndpointData tssData) {
state double startTime = now();
state Future<Optional<ErrorOr<Resp>>> fTssWithTimeout = timeout(fTss, FLOW_KNOBS->LOAD_BALANCE_TSS_TIMEOUT);
state int finished = 0;
state double srcEndTime;
state double tssEndTime;
loop {
choose {
when(state ErrorOr<Resp> src = wait(fSource)) {
srcEndTime = now();
fSource = Never();
finished++;
if (finished == 2) {
break;
}
}
when(state Optional<ErrorOr<Resp>> tss = wait(fTssWithTimeout)) {
tssEndTime = now();
fTssWithTimeout = Never();
finished++;
if (finished == 2) {
break;
}
}
}
}
// we want to record ss/tss errors to metrics
int srcErrorCode = error_code_success;
int tssErrorCode = error_code_success;
++tssData.metrics->requests;
if (src.isError()) {
srcErrorCode = src.getError().code();
tssData.metrics->ssError(srcErrorCode);
}
if (!tss.present()) {
++tssData.metrics->tssTimeouts;
} else if (tss.get().isError()) {
tssErrorCode = tss.get().getError().code();
tssData.metrics->tssError(tssErrorCode);
}
if (!src.isError() && tss.present() && !tss.get().isError()) {
Optional<LoadBalancedReply> srcLB = getLoadBalancedReply(&src.get());
Optional<LoadBalancedReply> tssLB = getLoadBalancedReply(&tss.get().get());
ASSERT(srcLB.present() ==
tssLB.present()); // getLoadBalancedReply returned different responses for same templated type
// if Resp is a LoadBalancedReply, only compare if both replies are non-error
if (!srcLB.present() || (!srcLB.get().error.present() && !tssLB.get().error.present())) {
// only record latency difference if both requests actually succeeded, so that we're comparing apples to
// apples
tssData.metrics->recordLatency(req, srcEndTime - startTime, tssEndTime - startTime);
// expect mismatches in drop mutations mode.
Severity traceSeverity =
(g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations)
? SevWarnAlways
: SevError;
if (!TSS_doCompare(req, src.get(), tss.get().get(), traceSeverity, tssData.tssId)) {
TEST(true); // TSS Mismatch
++tssData.metrics->mismatches;
}
} else if (tssLB.present() && tssLB.get().error.present()) {
tssErrorCode = tssLB.get().error.get().code();
tssData.metrics->tssError(tssErrorCode);
} else if (srcLB.present() && srcLB.get().error.present()) {
srcErrorCode = srcLB.get().error.get().code();
tssData.metrics->ssError(srcErrorCode);
}
}
if (srcErrorCode != error_code_success && tssErrorCode != error_code_success && srcErrorCode != tssErrorCode) {
// if ss and tss both got different errors, record them
TraceEvent("TSSErrorMismatch")
.suppressFor(1.0)
.detail("TSSID", tssData.tssId)
.detail("SSError", srcErrorCode)
.detail("TSSError", tssErrorCode);
}
return Void();
}
// Stores state for a request made by the load balancer
template <class Request>
struct RequestData : NonCopyable {
@ -91,11 +184,30 @@ struct RequestData : NonCopyable {
// This is true once setupRequest is called, even though at that point the response is Never().
bool isValid() { return response.isValid(); }
static void maybeDuplicateTSSRequest(RequestStream<Request> const* stream,
Request& request,
QueueModel* model,
Future<Reply> ssResponse) {
if (model) {
// Send parallel request to TSS pair, if it exists
Optional<TSSEndpointData> tssData = model->getTssData(stream->getEndpoint().token.first());
if (tssData.present()) {
TEST(true); // duplicating request to TSS
resetReply(request);
// FIXME: optimize to avoid creating new netNotifiedQueue for each message
RequestStream<Request> tssRequestStream(tssData.get().endpoint);
Future<ErrorOr<REPLY_TYPE(Request)>> fTssResult = tssRequestStream.tryGetReply(request);
model->addActor.send(tssComparison(request, ssResponse, fTssResult, tssData.get()));
}
}
}
// Initializes the request state and starts it, possibly after a backoff delay
void startRequest(double backoff,
bool triedAllOptions,
RequestStream<Request> const* stream,
Request const& request,
Request& request,
QueueModel* model) {
modelHolder = Reference<ModelHolder>();
requestStarted = false;
@ -105,12 +217,15 @@ struct RequestData : NonCopyable {
delay(backoff), [this, stream, &request, model](Void _) {
requestStarted = true;
modelHolder = Reference<ModelHolder>(new ModelHolder(model, stream->getEndpoint().token.first()));
return stream->tryGetReply(request);
Future<Reply> resp = stream->tryGetReply(request);
maybeDuplicateTSSRequest(stream, request, model, resp);
return resp;
});
} else {
requestStarted = true;
modelHolder = Reference<ModelHolder>(new ModelHolder(model, stream->getEndpoint().token.first()));
response = stream->tryGetReply(request);
maybeDuplicateTSSRequest(stream, request, model, response);
}
requestProcessed = false;


@ -60,6 +60,20 @@ double QueueModel::addRequest(uint64_t id) {
return d.penalty;
}
void QueueModel::updateTssEndpoint(uint64_t endpointId, const TSSEndpointData& tssData) {
auto& d = data[endpointId];
d.tssData = tssData;
}
void QueueModel::removeTssEndpoint(uint64_t endpointId) {
auto& d = data[endpointId];
d.tssData = Optional<TSSEndpointData>();
}
Optional<TSSEndpointData> QueueModel::getTssData(uint64_t id) {
return data[id].tssData;
}
Optional<LoadBalancedReply> getLoadBalancedReply(const LoadBalancedReply* reply) {
return *reply;
}
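// To illustrate how these pieces fit together: a minimal sketch (not code from this change) of
// how a client that has just learned of an ss-to-tss mapping might register it, so that
// maybeDuplicateTSSRequest in LoadBalance.actor.h starts mirroring requests to the pair.
// ssEndpoint, tssEndpoint, tssId, and metrics are placeholder names.
void registerTssPairSketch(QueueModel* model,
                           const Endpoint& ssEndpoint,
                           const Endpoint& tssEndpoint,
                           UID tssId,
                           Reference<TSSMetrics> metrics) {
	// Key the mapping by the SS endpoint's token, the same id maybeDuplicateTSSRequest looks up.
	model->updateTssEndpoint(ssEndpoint.token.first(), TSSEndpointData(tssId, tssEndpoint, metrics));
}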


@ -26,6 +26,17 @@
#include "fdbrpc/Smoother.h"
#include "flow/Knobs.h"
#include "flow/ActorCollection.h"
#include "fdbrpc/TSSComparison.h" // For TSS Metrics
#include "fdbrpc/FlowTransport.h" // For Endpoint
struct TSSEndpointData {
UID tssId;
Endpoint endpoint;
Reference<TSSMetrics> metrics;
TSSEndpointData(UID tssId, Endpoint endpoint, Reference<TSSMetrics> metrics)
: tssId(tssId), endpoint(endpoint), metrics(metrics) {}
};
// The data structure used for the client-side load balancing algorithm to
// decide which storage server to read data from. Conceptually, it tracks the
@ -59,6 +70,10 @@ struct QueueData {
// hasn't returned a valid result, increase above `futureVersionBackoff`
// to increase the future backoff amount.
double increaseBackoffTime;
// a bit of a hack to store this here, but it's the only centralized place for per-endpoint tracking
Optional<TSSEndpointData> tssData;
QueueData()
: latency(0.001), penalty(1.0), smoothOutstanding(FLOW_KNOBS->QUEUE_MODEL_SMOOTHING_AMOUNT), failedUntil(0),
futureVersionBackoff(FLOW_KNOBS->FUTURE_VERSION_INITIAL_BACKOFF), increaseBackoffTime(0) {}
@ -89,13 +104,29 @@ public:
double secondBudget;
PromiseStream<Future<Void>> addActor;
Future<Void> laggingRequests; // requests for which a different recipient already answered
PromiseStream<Future<Void>> addTSSActor;
Future<Void> tssComparisons; // ongoing comparisons of SS and TSS responses
int laggingRequestCount;
int laggingTSSCompareCount;
// Updates this endpoint data to duplicate requests to the specified TSS endpoint
void updateTssEndpoint(uint64_t endpointId, const TSSEndpointData& endpointData);
// Removes the TSS mapping from this endpoint to stop duplicating requests to a TSS endpoint
void removeTssEndpoint(uint64_t endpointId);
// Retrieves the data for this endpoint's pair TSS endpoint, if present
Optional<TSSEndpointData> getTssData(uint64_t endpointId);
QueueModel() : secondMultiplier(1.0), secondBudget(0), laggingRequestCount(0) {
laggingRequests = actorCollection(addActor.getFuture(), &laggingRequestCount);
tssComparisons = actorCollection(addTSSActor.getFuture(), &laggingTSSCompareCount);
}
~QueueModel() { laggingRequests.cancel(); }
~QueueModel() {
laggingRequests.cancel();
tssComparisons.cancel();
}
private:
std::unordered_map<uint64_t, QueueData> data;
@ -121,4 +152,4 @@ private:
};
*/
#endif
#endif

fdbrpc/TSSComparison.h (new file)

@ -0,0 +1,89 @@
/*
* TSSComparison.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
 * This header declares the tss comparison function that LoadBalance.actor.h needs to call,
 * but that StorageServerInterface.h must implement on the types defined in SSI.h.
 */
#ifndef FDBRPC_TSS_COMPARISON_H
#define FDBRPC_TSS_COMPARISON_H
#include "fdbrpc/ContinuousSample.h"
#include "fdbrpc/Stats.h"
// refcounted + noncopyable because both DatabaseContext and individual endpoints share ownership
struct TSSMetrics : ReferenceCounted<TSSMetrics>, NonCopyable {
CounterCollection cc;
Counter requests;
Counter ssErrors;
Counter tssErrors;
Counter tssTimeouts;
Counter mismatches;
// We could probably just ignore getKey as it's seldom used?
ContinuousSample<double> SSgetValueLatency;
ContinuousSample<double> SSgetKeyLatency;
ContinuousSample<double> SSgetKeyValuesLatency;
ContinuousSample<double> TSSgetValueLatency;
ContinuousSample<double> TSSgetKeyLatency;
ContinuousSample<double> TSSgetKeyValuesLatency;
std::unordered_map<int, uint64_t> ssErrorsByCode;
std::unordered_map<int, uint64_t> tssErrorsByCode;
void ssError(int code) {
++ssErrors;
ssErrorsByCode[code]++;
}
void tssError(int code) {
++tssErrors;
tssErrorsByCode[code]++;
}
template <class Req>
void recordLatency(const Req& req, double ssLatency, double tssLatency);
void clear() {
SSgetValueLatency.clear();
SSgetKeyLatency.clear();
SSgetKeyValuesLatency.clear();
TSSgetValueLatency.clear();
TSSgetKeyLatency.clear();
TSSgetKeyValuesLatency.clear();
tssErrorsByCode.clear();
ssErrorsByCode.clear();
}
TSSMetrics()
: cc("TSSClientMetrics"), requests("Requests", cc), ssErrors("SSErrors", cc), tssErrors("TSSErrors", cc),
tssTimeouts("TSSTimeouts", cc), mismatches("Mismatches", cc), SSgetValueLatency(1000), SSgetKeyLatency(1000),
SSgetKeyValuesLatency(1000), TSSgetValueLatency(1000), TSSgetKeyLatency(1000), TSSgetKeyValuesLatency(1000) {}
};
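// recordLatency is declared as a template above and specialized per request type elsewhere; as a
// sketch, a specialization for a hypothetical GetValueRequest could route the two measurements to
// the matching samples. GetValueRequest is assumed here; the real specializations live with the
// request types, not in this header.
template <>
inline void TSSMetrics::recordLatency(const GetValueRequest& req, double ssLatency, double tssLatency) {
	SSgetValueLatency.addSample(ssLatency);
	TSSgetValueLatency.addSample(tssLatency);
}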
// Part of the contract of this function is that, if there is a mismatch, the implementation must record a trace
// event with the specified severity and the tssId in the event.
template <class Req, class Rep>
bool TSS_doCompare(const Req& req, const Rep& src, const Rep& tss, Severity traceSeverity, UID tssId);
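// A sketch of the shape an implementation takes, using a placeholder reply type (SomeReplySketch
// is an assumption, not a type from this change); the important part is that a mismatch records a
// trace event with the given severity and tssId, per the contract above.
struct SomeReplySketch {
	Optional<Value> value;
};

template <class Req>
bool TSS_doCompareSketch(const Req& req,
                         const SomeReplySketch& src,
                         const SomeReplySketch& tss,
                         Severity traceSeverity,
                         UID tssId) {
	if (src.value == tss.value) {
		return true;
	}
	TraceEvent(traceSeverity, "TSSMismatchSketch").detail("TSSID", tssId);
	return false;
}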
#endif


@ -630,7 +630,14 @@ void showArena(ArenaBlock* a, ArenaBlock* parent) {
int o = a->nextBlockOffset;
while (o) {
ArenaBlockRef* r = (ArenaBlockRef*)((char*)a->getData() + o);
showArena(r->next, a);
// If alignedBuffer is valid then print its pointer and size, else recurse
if (r->aligned4kBufferSize != 0) {
printf("AlignedBuffer %p (<-%p) %u bytes\n", r->aligned4kBuffer, a, r->aligned4kBufferSize);
} else {
showArena(r->next, a);
}
o = r->nextBlockOffset;
}
}
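// For context, the fields touched above suggest roughly the following shape for ArenaBlockRef;
// this is a sketch inferred from the usage here, not the authoritative definition.
struct ArenaBlockRefSketch {
	union {
		ArenaBlock* next; // chained child block, used when aligned4kBufferSize == 0
		void* aligned4kBuffer; // externally allocated 4k-aligned buffer
	};
	uint32_t aligned4kBufferSize; // nonzero selects the aligned-buffer interpretation
	uint32_t nextBlockOffset; // offset of the next ArenaBlockRef within the parent block
};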


@ -536,7 +536,10 @@ public:
std::string getFilename() const override { return actualFilename; }
~SimpleFile() override { _close(h); }
~SimpleFile() override {
_close(h);
--openCount;
}
private:
int h;
@ -1015,8 +1018,8 @@ public:
// Get the size of all files we've created on the server and subtract them from the free space
for (auto file = proc->machine->openFiles.begin(); file != proc->machine->openFiles.end(); ++file) {
if (file->second.isReady()) {
totalFileSize += ((AsyncFileNonDurable*)file->second.get().getPtr())->approximateSize;
if (file->second.get().isReady()) {
totalFileSize += ((AsyncFileNonDurable*)file->second.get().get().getPtr())->approximateSize;
}
numFiles++;
}
@ -2440,7 +2443,7 @@ Future<Reference<class IAsyncFile>> Sim2FileSystem::open(const std::string& file
actualFilename = filename + ".part";
auto partFile = machineCache.find(actualFilename);
if (partFile != machineCache.end()) {
Future<Reference<IAsyncFile>> f = AsyncFileDetachable::open(partFile->second);
Future<Reference<IAsyncFile>> f = AsyncFileDetachable::open(partFile->second.get());
if (FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0)
f = map(f, [=](Reference<IAsyncFile> r) {
return Reference<IAsyncFile>(new AsyncFileWriteChecker(r));
@ -2448,19 +2451,26 @@ Future<Reference<class IAsyncFile>> Sim2FileSystem::open(const std::string& file
return f;
}
}
if (machineCache.find(actualFilename) == machineCache.end()) {
Future<Reference<IAsyncFile>> f;
auto itr = machineCache.find(actualFilename);
if (itr == machineCache.end()) {
// Simulated disk parameters are shared by the AsyncFileNonDurable and the underlying SimpleFile.
// This way, they can both keep up with the time to start the next operation
auto diskParameters =
makeReference<DiskParameters>(FLOW_KNOBS->SIM_DISK_IOPS, FLOW_KNOBS->SIM_DISK_BANDWIDTH);
machineCache[actualFilename] =
AsyncFileNonDurable::open(filename,
f = AsyncFileNonDurable::open(filename,
actualFilename,
SimpleFile::open(filename, flags, mode, diskParameters, false),
diskParameters,
(flags & IAsyncFile::OPEN_NO_AIO) == 0);
machineCache[actualFilename] = UnsafeWeakFutureReference<IAsyncFile>(f);
} else {
f = itr->second.get();
}
Future<Reference<IAsyncFile>> f = AsyncFileDetachable::open(machineCache[actualFilename]);
f = AsyncFileDetachable::open(f);
if (FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0)
f = map(f, [=](Reference<IAsyncFile> r) { return Reference<IAsyncFile>(new AsyncFileWriteChecker(r)); });
return f;
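// The machineCache change above follows a weak-reference pattern: the map no longer keeps files
// alive, it only lets a second open() of the same path reuse a file that is still referenced
// elsewhere. A sketch of the lookup-or-create flow, using only the operations visible here;
// openNew is a placeholder for however the underlying file actually gets opened.
Future<Reference<IAsyncFile>> openCachedSketch(std::map<std::string, UnsafeWeakFutureReference<IAsyncFile>>& cache,
                                               std::string const& name,
                                               std::function<Future<Reference<IAsyncFile>>()> openNew) {
	auto itr = cache.find(name);
	if (itr != cache.end()) {
		// Reuse the open file if a strong reference to it still exists somewhere.
		return itr->second.get();
	}
	Future<Reference<IAsyncFile>> f = openNew();
	// The cache holds only a weak reference, so it does not by itself keep the file alive.
	cache[name] = UnsafeWeakFutureReference<IAsyncFile>(f);
	return f;
}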


@ -41,7 +41,7 @@ public:
: desiredCoordinators(1), physicalDatacenters(1), processesPerMachine(0), listenersPerProcess(1),
isStopped(false), lastConnectionFailure(0), connectionFailuresDisableDuration(0), speedUpSimulation(false),
allSwapsDisabled(false), backupAgents(BackupAgentType::WaitForType), drAgents(BackupAgentType::WaitForType),
extraDB(nullptr), allowLogSetKills(true), usableRegions(1) {}
extraDB(nullptr), allowLogSetKills(true), usableRegions(1), tssMode(TSSMode::Disabled) {}
// Order matters!
enum KillType {
@ -55,6 +55,9 @@ public:
None
};
// Order matters! All modes >= 2 are fault injection modes.
enum TSSMode { Disabled, EnabledNormal, EnabledAddDelay, EnabledDropMutations };
enum class BackupAgentType { NoBackupAgents, WaitForType, BackupToFile, BackupToDB };
// Subclasses may subclass ProcessInfo as well
@ -188,10 +191,14 @@ public:
Promise<KillType> shutdownSignal;
};
// A set of data associated with a simulated machine
struct MachineInfo {
ProcessInfo* machineProcess;
std::vector<ProcessInfo*> processes;
std::map<std::string, Future<Reference<IAsyncFile>>> openFiles;
// A map from filename to file handle for all open files on a machine
std::map<std::string, UnsafeWeakFutureReference<IAsyncFile>> openFiles;
std::set<std::string> deletingFiles;
std::set<std::string> closingFiles;
Optional<Standalone<StringRef>> machineId;
@ -401,6 +408,7 @@ public:
int32_t satelliteTLogWriteAntiQuorumFallback;
std::vector<Optional<Standalone<StringRef>>> primarySatelliteDcIds;
std::vector<Optional<Standalone<StringRef>>> remoteSatelliteDcIds;
TSSMode tssMode;
// Used by workloads that perform reconfigurations
int testerCount;


@ -19,6 +19,7 @@
*/
#include "fdbclient/MutationList.h"
#include "fdbclient/KeyBackedTypes.h" // for key backed map codecs for tss mapping
#include "fdbclient/SystemData.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/Notified.h"
@ -64,10 +65,19 @@ void applyMetadataMutations(SpanID const& spanContext,
NotifiedVersion* commitVersion,
std::map<UID, Reference<StorageInfo>>* storageCache,
std::map<Tag, Version>* tag_popped,
std::unordered_map<UID, StorageServerInterface>* tssMapping,
bool initialCommit) {
// std::map<keyRef, vector<uint16_t>> cacheRangeInfo;
std::map<KeyRef, MutationRef> cachedRangeInfo;
// Testing Storage Server removal (clearing serverTagKey) needs to read the tss server list value to determine
// whether the server is a tss and to find its partner's tag for the private mutation. Since the removeStorageServer
// transaction clears both the storage list and the server tag, we have to enforce ordering: process the server
// tag first, and postpone the server list clear until the end.
// Similarly, the TSS mapping change key needs to read the server list at the end of the commit.
std::vector<KeyRangeRef> tssServerListToRemove;
std::vector<std::pair<UID, UID>> tssMappingToAdd;
for (auto const& m : mutations) {
//TraceEvent("MetadataMutation", dbgid).detail("M", m.toString());
if (toCommit) {
@ -95,12 +105,14 @@ void applyMetadataMutations(SpanID const& spanContext,
for (const auto& id : src) {
auto storageInfo = getStorageInfo(id, storageCache, txnStateStore);
ASSERT(!storageInfo->interf.isTss());
ASSERT(storageInfo->tag != invalidTag);
info.tags.push_back(storageInfo->tag);
info.src_info.push_back(storageInfo);
}
for (const auto& id : dest) {
auto storageInfo = getStorageInfo(id, storageCache, txnStateStore);
ASSERT(!storageInfo->interf.isTss());
ASSERT(storageInfo->tag != invalidTag);
info.tags.push_back(storageInfo->tag);
info.dest_info.push_back(storageInfo);
@ -113,6 +125,8 @@ void applyMetadataMutations(SpanID const& spanContext,
txnStateStore->set(KeyValueRef(m.param1, m.param2));
} else if (m.param1.startsWith(serverKeysPrefix)) {
if (toCommit) {
Tag tag = decodeServerTagValue(
txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get().get());
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivateMutation", dbgid)
@ -120,14 +134,9 @@ void applyMetadataMutations(SpanID const& spanContext,
.detail("Privatized", privatized.toString())
.detail("Server", serverKeysDecodeServer(m.param1))
.detail("TagKey", serverTagKeyFor(serverKeysDecodeServer(m.param1)))
.detail(
"Tag",
decodeServerTagValue(
txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get().get())
.toString());
.detail("Tag", tag.toString());
toCommit->addTag(decodeServerTagValue(
txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get().get()));
toCommit->addTag(tag);
toCommit->writeTypedMessage(privatized);
}
} else if (m.param1.startsWith(serverTagPrefix)) {
@ -235,6 +244,29 @@ void applyMetadataMutations(SpanID const& spanContext,
}
}
}
} else if (m.param1.startsWith(tssMappingKeys.begin)) {
if (!initialCommit) {
txnStateStore->set(KeyValueRef(m.param1, m.param2));
if (tssMapping) {
// This key is normally accessed via a key-backed map, so we have to use the same unpacking code here.
UID ssId = Codec<UID>::unpack(Tuple::unpack(m.param1.removePrefix(tssMappingKeys.begin)));
UID tssId = Codec<UID>::unpack(Tuple::unpack(m.param2));
tssMappingToAdd.push_back(std::pair(ssId, tssId));
// send a private mutation to the SS notifying it that it now has a TSS pair
if (toCommit) {
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
Optional<Value> tagV = txnStateStore->readValue(serverTagKeyFor(ssId)).get();
if (tagV.present()) {
toCommit->addTag(decodeServerTagValue(tagV.get()));
toCommit->writeTypedMessage(privatized);
}
}
}
}
} else if (m.param1 == databaseLockedKey || m.param1 == metadataVersionKey ||
m.param1 == mustContainSystemMutationsKey ||
m.param1.startsWith(applyMutationsBeginRange.begin) ||
@ -379,8 +411,20 @@ void applyMetadataMutations(SpanID const& spanContext,
}
}
if (serverListKeys.intersects(range)) {
if (!initialCommit)
txnStateStore->clear(range & serverListKeys);
if (!initialCommit) {
KeyRangeRef rangeToClear = range & serverListKeys;
if (rangeToClear.singleKeyRange()) {
UID id = decodeServerListKey(rangeToClear.begin);
Optional<Value> ssiV = txnStateStore->readValue(serverListKeyFor(id)).get();
if (ssiV.present() && decodeServerListValue(ssiV.get()).isTss()) {
tssServerListToRemove.push_back(rangeToClear);
} else {
txnStateStore->clear(rangeToClear);
}
} else {
txnStateStore->clear(rangeToClear);
}
}
}
if (tagLocalityListKeys.intersects(range)) {
if (!initialCommit)
@ -411,6 +455,32 @@ void applyMetadataMutations(SpanID const& spanContext,
toCommit->writeTypedMessage(privatized);
}
}
// Might be a tss removal, which doesn't store a tag there.
// The chained ifs are a little verbose, but they avoid unnecessary work.
if (toCommit && !initialCommit && !serverKeysCleared.size()) {
KeyRangeRef maybeTssRange = range & serverTagKeys;
if (maybeTssRange.singleKeyRange()) {
UID id = decodeServerTagKey(maybeTssRange.begin);
Optional<Value> ssiV = txnStateStore->readValue(serverListKeyFor(id)).get();
if (ssiV.present()) {
StorageServerInterface ssi = decodeServerListValue(ssiV.get());
if (ssi.isTss()) {
Optional<Value> tagV =
txnStateStore->readValue(serverTagKeyFor(ssi.tssPairID.get())).get();
if (tagV.present()) {
MutationRef privatized = m;
privatized.param1 = maybeTssRange.begin.withPrefix(systemKeys.begin, arena);
privatized.param2 =
keyAfter(maybeTssRange.begin, arena).withPrefix(systemKeys.begin, arena);
toCommit->addTag(decodeServerTagValue(tagV.get()));
toCommit->writeTypedMessage(privatized);
}
}
}
}
}
}
if (!initialCommit) {
KeyRangeRef clearRange = range & serverTagKeys;
@ -439,6 +509,19 @@ void applyMetadataMutations(SpanID const& spanContext,
if (!initialCommit)
txnStateStore->clear(range & serverTagHistoryKeys);
}
if (tssMappingKeys.intersects(range)) {
if (!initialCommit) {
KeyRangeRef rangeToClear = range & tssMappingKeys;
ASSERT(rangeToClear.singleKeyRange());
txnStateStore->clear(rangeToClear);
if (tssMapping) {
// This key is normally accessed via a key-backed map, so we have to use the same unpacking code here.
UID ssId =
Codec<UID>::unpack(Tuple::unpack(rangeToClear.begin.removePrefix(tssMappingKeys.begin)));
tssMapping->erase(ssId);
}
}
}
if (range.contains(coordinatorsKey)) {
if (!initialCommit)
txnStateStore->clear(singleKeyRange(coordinatorsKey));
@ -568,6 +651,17 @@ void applyMetadataMutations(SpanID const& spanContext,
}
}
for (KeyRangeRef& range : tssServerListToRemove) {
txnStateStore->clear(range);
}
for (auto& tssPair : tssMappingToAdd) {
// read tss server list from txn state store and add it to tss mapping
StorageServerInterface tssi =
decodeServerListValue(txnStateStore->readValue(serverListKeyFor(tssPair.second)).get().get());
(*tssMapping)[tssPair.first] = tssi;
}
// If we accumulated private mutations for cached key-ranges, we also need to
// tag them with the relevant storage servers. This is done to make the storage
// servers aware of the cached key-ranges
@ -666,6 +760,7 @@ void applyMetadataMutations(SpanID const& spanContext,
&proxyCommitData.committedVersion,
&proxyCommitData.storageCache,
&proxyCommitData.tag_popped,
&proxyCommitData.tssMapping,
initialCommit);
}
@ -695,5 +790,6 @@ void applyMetadataMutations(SpanID const& spanContext,
/* commitVersion= */ nullptr,
/* storageCache= */ nullptr,
/* tag_popped= */ nullptr,
/* tssMapping= */ nullptr,
/* initialCommit= */ false);
}


@ -103,6 +103,8 @@ set(FDBSERVER_SRCS
TesterInterface.actor.h
TLogInterface.h
TLogServer.actor.cpp
TSSMappingUtil.actor.h
TSSMappingUtil.actor.cpp
VersionedBTree.actor.cpp
VFSAsync.h
VFSAsync.cpp


@ -458,6 +458,38 @@ public:
}
}
// Log the reason why the worker is considered unavailable.
void logWorkerUnavailable(const Severity severity,
const UID& id,
const std::string& method,
const std::string& reason,
const WorkerDetails& details,
const ProcessClass::Fitness& fitness,
const std::set<Optional<Key>>& dcIds) {
// Construct the list of DCs where the TLog recruitment is happening. This is mainly for logging purposes.
std::string dcList;
for (const auto& dc : dcIds) {
if (!dcList.empty()) {
dcList += ',';
}
dcList += printable(dc);
}
// Logging every possible option is too much for every recruitment; logging only the options with GoodFit or
// BestFit should be manageable because there should only be around 30 tlog-class processes. Plus, recruitment
// happens only during initial database creation and recovery, so these trace events should be sparse.
if (fitness == ProcessClass::GoodFit || fitness == ProcessClass::BestFit ||
fitness == ProcessClass::NeverAssign) {
TraceEvent(severity, "GetTLogTeamWorkerUnavailable", id)
.detail("TLogRecruitMethod", method)
.detail("Reason", reason)
.detail("WorkerID", details.interf.id())
.detail("WorkerDC", details.interf.locality.dcId())
.detail("Address", details.interf.addresses().toString())
.detail("Fitness", fitness)
.detail("RecruitmentDcIds", dcList);
}
}
// A TLog recruitment method specialized for three_data_hall and three_datacenter configurations
// It attempts to evenly recruit processes from across data_halls or datacenters
std::vector<WorkerDetails> getWorkersForTlogsComplex(DatabaseConfiguration const& conf,
@ -478,11 +510,37 @@ public:
auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog);
if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) !=
exclusionWorkerIds.end() ||
!workerAvailable(worker_info, checkStable) ||
conf.isExcludedServer(worker_details.interf.addresses()) || fitness == ProcessClass::NeverAssign ||
(!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) ||
(!allowDegraded && worker_details.degraded)) {
exclusionWorkerIds.end()) {
logWorkerUnavailable(SevInfo, id, "complex", "Worker is excluded", worker_details, fitness, dcIds);
continue;
}
if (!workerAvailable(worker_info, checkStable)) {
logWorkerUnavailable(SevInfo, id, "complex", "Worker is not available", worker_details, fitness, dcIds);
continue;
}
if (conf.isExcludedServer(worker_details.interf.addresses())) {
logWorkerUnavailable(SevInfo,
id,
"complex",
"Worker server is excluded from the cluster",
worker_details,
fitness,
dcIds);
continue;
}
if (fitness == ProcessClass::NeverAssign) {
logWorkerUnavailable(
SevDebug, id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds);
continue;
}
if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) {
logWorkerUnavailable(
SevDebug, id, "complex", "Worker is not in the target DC", worker_details, fitness, dcIds);
continue;
}
if (!allowDegraded && worker_details.degraded) {
logWorkerUnavailable(
SevInfo, id, "complex", "Worker is degraded and not allowed", worker_details, fitness, dcIds);
continue;
}
@ -685,11 +743,34 @@ public:
for (const auto& [worker_process_id, worker_info] : id_worker) {
const auto& worker_details = worker_info.details;
auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog);
if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) !=
exclusionWorkerIds.end() ||
!workerAvailable(worker_info, checkStable) ||
conf.isExcludedServer(worker_details.interf.addresses()) || fitness == ProcessClass::NeverAssign ||
(!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0)) {
exclusionWorkerIds.end()) {
logWorkerUnavailable(SevInfo, id, "simple", "Worker is excluded", worker_details, fitness, dcIds);
continue;
}
if (!workerAvailable(worker_info, checkStable)) {
logWorkerUnavailable(SevInfo, id, "simple", "Worker is not available", worker_details, fitness, dcIds);
continue;
}
if (conf.isExcludedServer(worker_details.interf.addresses())) {
logWorkerUnavailable(SevInfo,
id,
"simple",
"Worker server is excluded from the cluster",
worker_details,
fitness,
dcIds);
continue;
}
if (fitness == ProcessClass::NeverAssign) {
logWorkerUnavailable(
SevDebug, id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds);
continue;
}
if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) {
logWorkerUnavailable(
SevDebug, id, "simple", "Worker is not in the target DC", worker_details, fitness, dcIds);
continue;
}
@ -794,11 +875,35 @@ public:
for (const auto& [worker_process_id, worker_info] : id_worker) {
const auto& worker_details = worker_info.details;
auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog);
if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) !=
exclusionWorkerIds.end() ||
!workerAvailable(worker_info, checkStable) ||
conf.isExcludedServer(worker_details.interf.addresses()) || fitness == ProcessClass::NeverAssign ||
(!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0)) {
exclusionWorkerIds.end()) {
logWorkerUnavailable(SevInfo, id, "deprecated", "Worker is excluded", worker_details, fitness, dcIds);
continue;
}
if (!workerAvailable(worker_info, checkStable)) {
logWorkerUnavailable(
SevInfo, id, "deprecated", "Worker is not available", worker_details, fitness, dcIds);
continue;
}
if (conf.isExcludedServer(worker_details.interf.addresses())) {
logWorkerUnavailable(SevInfo,
id,
"deprecated",
"Worker server is excluded from the cluster",
worker_details,
fitness,
dcIds);
continue;
}
if (fitness == ProcessClass::NeverAssign) {
logWorkerUnavailable(
SevDebug, id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds);
continue;
}
if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) {
logWorkerUnavailable(
SevDebug, id, "deprecated", "Worker is not in the target DC", worker_details, fitness, dcIds);
continue;
}
@ -3091,9 +3196,9 @@ ACTOR Future<Void> workerAvailabilityWatch(WorkerInterface worker,
cluster->masterProcessId = Optional<Key>();
}
TraceEvent("ClusterControllerWorkerFailed", cluster->id)
.detail("ProcessId", worker.locality.processId())
.detail("ProcessClass", failedWorkerInfo.details.processClass.toString())
.detail("Address", worker.address());
.detail("ProcessId", worker.locality.processId())
.detail("ProcessClass", failedWorkerInfo.details.processClass.toString())
.detail("Address", worker.address());
cluster->removedDBInfoEndpoints.insert(worker.updateServerDBInfo.getEndpoint());
cluster->id_worker.erase(worker.locality.processId());
cluster->updateWorkerList.set(worker.locality.processId(), Optional<ProcessData>());
@ -3277,6 +3382,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
if (db->clientInfo->get().commitProxies != req.commitProxies ||
db->clientInfo->get().grvProxies != req.grvProxies) {
isChanged = true;
// TODO why construct a new one and not just copy the old one and change proxies + id?
ClientDBInfo clientInfo;
clientInfo.id = deterministicRandom()->randomUniqueID();
clientInfo.commitProxies = req.commitProxies;
@ -3769,7 +3875,7 @@ ACTOR Future<Void> monitorGlobalConfig(ClusterControllerData::DBInfo* db) {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Optional<Value> globalConfigVersion = wait(tr.get(globalConfigVersionKey));
state ClientDBInfo clientInfo = db->clientInfo->get();
state ClientDBInfo clientInfo = db->serverInfo->get().client;
if (globalConfigVersion.present()) {
// Since the history keys end with versionstamps, they
@ -3827,6 +3933,14 @@ ACTOR Future<Void> monitorGlobalConfig(ClusterControllerData::DBInfo* db) {
}
clientInfo.id = deterministicRandom()->randomUniqueID();
// Update ServerDBInfo so fdbserver processes receive updated history.
ServerDBInfo serverInfo = db->serverInfo->get();
serverInfo.id = deterministicRandom()->randomUniqueID();
serverInfo.infoGeneration = ++db->dbInfoCount;
serverInfo.client = clientInfo;
db->serverInfo->set(serverInfo);
// Update ClientDBInfo so client processes receive updated history.
db->clientInfo->set(clientInfo);
}
@ -4306,6 +4420,7 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
self.addActor.send(handleForcedRecoveries(&self, interf));
self.addActor.send(monitorDataDistributor(&self));
self.addActor.send(monitorRatekeeper(&self));
// self.addActor.send(monitorTSSMapping(&self));
self.addActor.send(dbInfoUpdater(&self));
self.addActor.send(traceCounters("ClusterControllerMetrics",
self.id,


@ -1430,11 +1430,26 @@ ACTOR Future<Void> commitBatch(ProxyCommitData* self,
return Void();
}
// Add tss mapping data to the reply, if any of the included storage servers have a TSS pair
void maybeAddTssMapping(GetKeyServerLocationsReply& reply,
ProxyCommitData* commitData,
std::unordered_set<UID>& included,
UID ssId) {
if (!included.count(ssId)) {
auto mappingItr = commitData->tssMapping.find(ssId);
if (mappingItr != commitData->tssMapping.end()) {
included.insert(ssId);
reply.resultsTssMapping.push_back(*mappingItr);
}
}
}
ACTOR static Future<Void> doKeyServerLocationRequest(GetKeyServerLocationsRequest req, ProxyCommitData* commitData) {
// We can't respond to these requests until we have valid txnStateStore
wait(commitData->validState.getFuture());
wait(delay(0, TaskPriority::DefaultEndpoint));
std::unordered_set<UID> tssMappingsIncluded;
GetKeyServerLocationsReply rep;
if (!req.end.present()) {
auto r = req.reverse ? commitData->keyInfo.rangeContainingKeyBefore(req.begin)
@ -1443,6 +1458,7 @@ ACTOR static Future<Void> doKeyServerLocationRequest(GetKeyServerLocationsReques
ssis.reserve(r.value().src_info.size());
for (auto& it : r.value().src_info) {
ssis.push_back(it->interf);
maybeAddTssMapping(rep, commitData, tssMappingsIncluded, it->interf.id());
}
rep.results.push_back(std::make_pair(r.range(), ssis));
} else if (!req.reverse) {
@ -1454,6 +1470,7 @@ ACTOR static Future<Void> doKeyServerLocationRequest(GetKeyServerLocationsReques
ssis.reserve(r.value().src_info.size());
for (auto& it : r.value().src_info) {
ssis.push_back(it->interf);
maybeAddTssMapping(rep, commitData, tssMappingsIncluded, it->interf.id());
}
rep.results.push_back(std::make_pair(r.range(), ssis));
count++;
@ -1466,6 +1483,7 @@ ACTOR static Future<Void> doKeyServerLocationRequest(GetKeyServerLocationsReques
ssis.reserve(r.value().src_info.size());
for (auto& it : r.value().src_info) {
ssis.push_back(it->interf);
maybeAddTssMapping(rep, commitData, tssMappingsIncluded, it->interf.id());
}
rep.results.push_back(std::make_pair(r.range(), ssis));
if (r == commitData->keyInfo.ranges().begin()) {


@ -406,8 +406,8 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
// If the current leader's priority became worse, we still need to notify all clients because now one
// of them might be better than the leader. In addition, even though FitnessRemote is better than
// FitnessUnknown, we still need to notify clients so that monitorLeaderRemotely has a chance to
// switch from passively monitoring the leader to actively attempting to become the leader.
if (!currentNominee.present() || !nextNominee.present() ||
!currentNominee.get().equalInternalId(nextNominee.get()) ||
nextNominee.get() > currentNominee.get() ||
@ -545,15 +545,30 @@ struct LeaderRegisterCollection {
}
};
// extract the prefix descriptor from cluster id
StringRef getClusterDescriptor(Key key) {
StringRef str = key.contents();
return str.eat(":");
}
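// Worked example (cluster-key format "description:id" assumed from standard fdb.cluster files):
// eat(":") consumes through the first colon and returns everything before it, so
//   getClusterDescriptor(LiteralStringRef("myDescriptor:ab0123cd")) == LiteralStringRef("myDescriptor")
// and the handlers below compare that descriptor against the local file's clusterKeyName().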
// leaderServer multiplexes multiple leaderRegisters onto a single LeaderElectionRegInterface,
// creating and destroying them on demand.
ACTOR Future<Void> leaderServer(LeaderElectionRegInterface interf, OnDemandStore* pStore, UID id) {
ACTOR Future<Void> leaderServer(LeaderElectionRegInterface interf,
OnDemandStore* pStore,
UID id,
Reference<ClusterConnectionFile> ccf) {
state LeaderRegisterCollection regs(pStore);
state ActorCollection forwarders(false);
wait(LeaderRegisterCollection::init(&regs));
loop choose {
when(CheckDescriptorMutableRequest req = waitNext(interf.checkDescriptorMutable.getFuture())) {
// Note the response returns the value of a knob enforced by checking only one coordinator. It is not
// quorum based.
CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT);
req.reply.send(rep);
}
when(OpenDatabaseCoordRequest req = waitNext(interf.openDatabase.getFuture())) {
Optional<LeaderInfo> forward = regs.getForward(req.clusterKey);
if (forward.present()) {
@ -562,7 +577,18 @@ ACTOR Future<Void> leaderServer(LeaderElectionRegInterface interf, OnDemandStore
info.forward = forward.get().serializedInfo;
req.reply.send(CachedSerialization<ClientDBInfo>(info));
} else {
regs.getInterface(req.clusterKey, id).openDatabase.send(req);
StringRef clusterName = ccf->getConnectionString().clusterKeyName();
if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT &&
getClusterDescriptor(req.clusterKey).compare(clusterName)) {
TraceEvent(SevWarn, "CCFMismatch")
.detail("RequestType", "OpenDatabaseCoordRequest")
.detail("LocalCS", ccf->getConnectionString().toString())
.detail("IncomingClusterKey", req.clusterKey)
.detail("IncomingCoordinators", describeList(req.coordinators, req.coordinators.size()));
req.reply.sendError(wrong_connection_file());
} else {
regs.getInterface(req.clusterKey, id).openDatabase.send(req);
}
}
}
when(ElectionResultRequest req = waitNext(interf.electionResult.getFuture())) {
@ -570,38 +596,89 @@ ACTOR Future<Void> leaderServer(LeaderElectionRegInterface interf, OnDemandStore
if (forward.present()) {
req.reply.send(forward.get());
} else {
regs.getInterface(req.key, id).electionResult.send(req);
StringRef clusterName = ccf->getConnectionString().clusterKeyName();
if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) {
TraceEvent(SevWarn, "CCFMismatch")
.detail("RequestType", "ElectionResultRequest")
.detail("LocalCS", ccf->getConnectionString().toString())
.detail("IncomingClusterKey", req.key)
.detail("ClusterKey", ccf->getConnectionString().clusterKey())
.detail("IncomingCoordinators", describeList(req.coordinators, req.coordinators.size()));
req.reply.sendError(wrong_connection_file());
} else {
regs.getInterface(req.key, id).electionResult.send(req);
}
}
}
when(GetLeaderRequest req = waitNext(interf.getLeader.getFuture())) {
Optional<LeaderInfo> forward = regs.getForward(req.key);
if (forward.present())
req.reply.send(forward.get());
else
regs.getInterface(req.key, id).getLeader.send(req);
else {
StringRef clusterName = ccf->getConnectionString().clusterKeyName();
if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) {
TraceEvent(SevWarn, "CCFMismatch")
.detail("RequestType", "GetLeaderRequest")
.detail("LocalCS", ccf->getConnectionString().toString())
.detail("IncomingClusterKey", req.key)
.detail("ClusterKey", ccf->getConnectionString().clusterKey());
req.reply.sendError(wrong_connection_file());
} else {
regs.getInterface(req.key, id).getLeader.send(req);
}
}
}
when(CandidacyRequest req = waitNext(interf.candidacy.getFuture())) {
Optional<LeaderInfo> forward = regs.getForward(req.key);
if (forward.present())
req.reply.send(forward.get());
else
regs.getInterface(req.key, id).candidacy.send(req);
else {
StringRef clusterName = ccf->getConnectionString().clusterKeyName();
if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) {
TraceEvent(SevWarn, "CCFMismatch")
.detail("RequestType", "CandidacyRequest")
.detail("LocalCS", ccf->getConnectionString().toString())
.detail("IncomingClusterKey", req.key);
req.reply.sendError(wrong_connection_file());
} else {
regs.getInterface(req.key, id).candidacy.send(req);
}
}
}
when(LeaderHeartbeatRequest req = waitNext(interf.leaderHeartbeat.getFuture())) {
Optional<LeaderInfo> forward = regs.getForward(req.key);
if (forward.present())
req.reply.send(LeaderHeartbeatReply{ false });
else
regs.getInterface(req.key, id).leaderHeartbeat.send(req);
else {
StringRef clusterName = ccf->getConnectionString().clusterKeyName();
if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) {
TraceEvent(SevWarn, "CCFMismatch")
.detail("RequestType", "LeaderHeartbeatRequest")
.detail("LocalCS", ccf->getConnectionString().toString())
.detail("IncomingClusterKey", req.key);
req.reply.sendError(wrong_connection_file());
} else {
regs.getInterface(req.key, id).leaderHeartbeat.send(req);
}
}
}
when(ForwardRequest req = waitNext(interf.forward.getFuture())) {
Optional<LeaderInfo> forward = regs.getForward(req.key);
if (forward.present())
req.reply.send(Void());
else {
forwarders.add(
LeaderRegisterCollection::setForward(&regs, req.key, ClusterConnectionString(req.conn.toString())));
regs.getInterface(req.key, id).forward.send(req);
StringRef clusterName = ccf->getConnectionString().clusterKeyName();
if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) {
TraceEvent(SevWarn, "CCFMismatch")
.detail("RequestType", "ForwardRequest")
.detail("LocalCS", ccf->getConnectionString().toString())
.detail("IncomingClusterKey", req.key);
req.reply.sendError(wrong_connection_file());
} else {
forwarders.add(LeaderRegisterCollection::setForward(
&regs, req.key, ClusterConnectionString(req.conn.toString())));
regs.getInterface(req.key, id).forward.send(req);
}
}
}
when(wait(forwarders.getResult())) {
@ -611,7 +688,7 @@ ACTOR Future<Void> leaderServer(LeaderElectionRegInterface interf, OnDemandStore
}
}
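// Every handler above repeats the same descriptor check; purely as a hypothetical refactoring
// sketch (not part of this change), the guard could be factored into a helper like this:
static bool clusterKeyMatchesSketch(Reference<ClusterConnectionFile> ccf, const Key& incomingKey) {
	if (SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT) {
		return true;
	}
	StringRef clusterName = ccf->getConnectionString().clusterKeyName();
	return getClusterDescriptor(incomingKey).compare(clusterName) == 0;
}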
ACTOR Future<Void> coordinationServer(std::string dataFolder) {
ACTOR Future<Void> coordinationServer(std::string dataFolder, Reference<ClusterConnectionFile> ccf) {
state UID myID = deterministicRandom()->randomUniqueID();
state LeaderElectionRegInterface myLeaderInterface(g_network);
state GenerationRegInterface myInterface(g_network);
@ -622,7 +699,7 @@ ACTOR Future<Void> coordinationServer(std::string dataFolder) {
.detail("Folder", dataFolder);
try {
wait(localGenerationReg(myInterface, &store) || leaderServer(myLeaderInterface, &store, myID) ||
wait(localGenerationReg(myInterface, &store) || leaderServer(myLeaderInterface, &store, myID, ccf) ||
store.getError());
throw internal_error();
} catch (Error& e) {


@ -225,6 +225,6 @@ public:
vector<GenerationRegInterface> stateServers;
};
Future<Void> coordinationServer(std::string const& dataFolder);
Future<Void> coordinationServer(std::string const& dataFolder, Reference<ClusterConnectionFile> const& ccf);
#endif


@ -66,6 +66,8 @@ struct TCServerInfo : public ReferenceCounted<TCServerInfo> {
Future<std::pair<StorageServerInterface, ProcessClass>> onInterfaceChanged;
Promise<Void> removed;
Future<Void> onRemoved;
Future<Void> onTSSPairRemoved;
Promise<Void> killTss;
Promise<Void> wakeUpTracker;
bool inDesiredDC;
LocalityEntry localityEntry;
@ -83,8 +85,10 @@ struct TCServerInfo : public ReferenceCounted<TCServerInfo> {
Reference<LocalitySet> storageServerSet)
: id(ssi.id()), collection(collection), lastKnownInterface(ssi), lastKnownClass(processClass),
dataInFlightToServer(0), onInterfaceChanged(interfaceChanged.getFuture()), onRemoved(removed.getFuture()),
inDesiredDC(inDesiredDC), storeType(KeyValueStoreType::END) {
localityEntry = ((LocalityMap<UID>*)storageServerSet.getPtr())->add(ssi.locality, &id);
inDesiredDC(inDesiredDC), storeType(KeyValueStoreType::END), onTSSPairRemoved(Never()) {
if (!ssi.isTss()) {
localityEntry = ((LocalityMap<UID>*)storageServerSet.getPtr())->add(ssi.locality, &id);
}
}
bool isCorrectStoreType(KeyValueStoreType configStoreType) {
@ -398,6 +402,7 @@ ACTOR Future<Reference<InitialDataDistribution>> getInitialDataDistribution(Data
state std::map<UID, Optional<Key>> server_dc;
state std::map<vector<UID>, std::pair<vector<UID>, vector<UID>>> team_cache;
state std::vector<std::pair<StorageServerInterface, ProcessClass>> tss_servers;
// Get the server list in its own try/catch block since it modifies result. We don't want a subsequent failure
// causing entries to be duplicated
@ -447,8 +452,12 @@ ACTOR Future<Reference<InitialDataDistribution>> getInitialDataDistribution(Data
for (int i = 0; i < serverList.get().size(); i++) {
auto ssi = decodeServerListValue(serverList.get()[i].value);
result->allServers.push_back(std::make_pair(ssi, id_data[ssi.locality.processId()].processClass));
server_dc[ssi.id()] = ssi.locality.dcId();
if (!ssi.isTss()) {
result->allServers.push_back(std::make_pair(ssi, id_data[ssi.locality.processId()].processClass));
server_dc[ssi.id()] = ssi.locality.dcId();
} else {
tss_servers.push_back(std::make_pair(ssi, id_data[ssi.locality.processId()].processClass));
}
}
break;
@ -559,6 +568,11 @@ ACTOR Future<Reference<InitialDataDistribution>> getInitialDataDistribution(Data
// a dummy shard at the end with no keys or servers makes life easier for trackInitialShards()
result->shards.push_back(DDShardInfo(allKeys.end));
// add tss to server list AFTER teams are built
for (auto& it : tss_servers) {
result->allServers.push_back(it);
}
return result;
}
@ -567,7 +581,8 @@ ACTOR Future<Void> storageServerTracker(struct DDTeamCollection* self,
TCServerInfo* server,
Promise<Void> errorOut,
Version addedVersion,
const DDEnabledState* ddEnabledState);
const DDEnabledState* ddEnabledState,
bool isTss);
Future<Void> teamTracker(struct DDTeamCollection* const& self,
Reference<TCTeamInfo> const& team,
@ -598,6 +613,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
int64_t unhealthyServers;
std::map<int,int> priority_teams;
std::map<UID, Reference<TCServerInfo>> server_info;
std::map<UID, Reference<TCServerInfo>> tss_info_by_pair;
std::map<UID, Reference<TCServerInfo>> server_and_tss_info; // TODO could replace this with an efficient way to do a read-only concatenation of 2 data structures?
std::map<Key, int> lagging_zones; // zone to number of storage servers lagging
AsyncVar<bool> disableFailingLaggingServers;
@ -610,7 +627,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
vector<Reference<TCTeamInfo>> badTeams;
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure;
PromiseStream<UID> removedServers;
std::set<UID> recruitingIds; // The IDs of the SS which are being recruited
PromiseStream<UID> removedTSS;
std::set<UID> recruitingIds; // The IDs of the SS/TSS which are being recruited
std::set<NetworkAddress> recruitingLocalities;
Future<Void> initialFailureReactionDelay;
Future<Void> initializationDoneActor;
@ -624,6 +642,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
int optimalTeamCount;
AsyncVar<bool> zeroOptimalTeams;
bool isTssRecruiting; // If tss recruiting is waiting on a pair, don't consider DD recruiting for the purposes of QuietDB
// EXCLUDED if an address is in the excluded list in the database.
// FAILED if an address is permanently failed.
// NONE by default. Updated asynchronously (eventually)
@ -709,7 +729,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
initializationDoneActor(logOnCompletion(readyToStart && initialFailureReactionDelay, this)),
optimalTeamCount(0), recruitingStream(0), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY),
unhealthyServers(0), includedDCs(includedDCs), otherTrackedDCs(otherTrackedDCs),
zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary),
zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary), isTssRecruiting(false),
medianAvailableSpace(SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO), lastMedianAvailableSpaceUpdate(0),
processingUnhealthy(processingUnhealthy), lowestUtilizationTeam(0), highestUtilizationTeam(0),
getShardMetrics(getShardMetrics), removeFailedServer(removeFailedServer) {
@ -758,10 +778,11 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
// The following makes sure that, even if a reference to a team is held in the DD Queue, the tracker will be
// stopped
// before the server_status map to which it has a pointer, is destroyed.
for (auto& [_, info] : server_info) {
for (auto& [_, info] : server_and_tss_info) {
info->tracker.cancel();
info->collection = nullptr;
}
// TraceEvent("DDTeamCollectionDestructed", distributorId)
// .detail("Primary", primary)
// .detail("ServerTrackerDestroyed", server_info.size());
@ -1128,6 +1149,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
self->healthyZone.set(initTeams->initHealthyZoneValue);
// SOMEDAY: If some servers have teams and not others (or some servers have more data than others) and there is
// an address/locality collision, should we preferentially mark the least used server as undesirable?
for (auto i = initTeams->allServers.begin(); i != initTeams->allServers.end(); ++i) {
if (self->shouldHandleServer(i->first)) {
if (!self->isValidLocality(self->configuration.storagePolicy, i->first.locality)) {
@ -2419,14 +2441,18 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
if (!shouldHandleServer(newServer)) {
return;
}
allServers.push_back(newServer.id());
TraceEvent("AddedStorageServer", distributorId)
if (!newServer.isTss()) {
allServers.push_back(newServer.id());
}
TraceEvent(newServer.isTss() ? "AddedTSS" : "AddedStorageServer", distributorId)
.detail("ServerID", newServer.id())
.detail("ProcessClass", processClass.toString())
.detail("WaitFailureToken", newServer.waitFailure.getEndpoint().token)
.detail("Address", newServer.waitFailure.getEndpoint().getPrimaryAddress());
auto& r = server_info[newServer.id()] = makeReference<TCServerInfo>(
auto& r = server_and_tss_info[newServer.id()] = makeReference<TCServerInfo>(
newServer,
this,
processClass,
@ -2434,12 +2460,33 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end(),
storageServerSet);
// Establish the relation between server and machine
checkAndCreateMachine(r);
if (newServer.isTss()) {
tss_info_by_pair[newServer.tssPairID.get()] = r;
r->tracker = storageServerTracker(this, cx, r.getPtr(), errorOut, addedVersion, ddEnabledState);
doBuildTeams = true; // Adding a new server triggers building new teams
restartTeamBuilder.trigger();
if (server_info.count(newServer.tssPairID.get())) {
r->onTSSPairRemoved = server_info[newServer.tssPairID.get()]->onRemoved;
}
} else {
server_info[newServer.id()] = r;
// Establish the relation between server and machine
checkAndCreateMachine(r);
}
r->tracker =
storageServerTracker(this, cx, r.getPtr(), errorOut, addedVersion, ddEnabledState, newServer.isTss());
if (!newServer.isTss()) {
// Link and wake up the tss's tracker so it knows when this server gets removed
if (tss_info_by_pair.count(newServer.id())) {
tss_info_by_pair[newServer.id()]->onTSSPairRemoved = r->onRemoved;
if (tss_info_by_pair[newServer.id()]->wakeUpTracker.canBeSet()) {
tss_info_by_pair[newServer.id()]->wakeUpTracker.send(Void());
}
}
doBuildTeams = true; // Adding a new server triggers building new teams
restartTeamBuilder.trigger();
}
}
bool removeTeam(Reference<TCTeamInfo> team) {
@ -2605,6 +2652,17 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
return foundMachineTeam;
}
void removeTSS(UID removedServer) {
// Much simpler than removeServer: a tss isn't in any teams, so just remove it from the data structures.
TraceEvent("RemovedTSS", distributorId).detail("ServerID", removedServer);
Reference<TCServerInfo> removedServerInfo = server_and_tss_info[removedServer];
tss_info_by_pair.erase(removedServerInfo->lastKnownInterface.tssPairID.get());
server_and_tss_info.erase(removedServer);
server_status.clear(removedServer);
}
void removeServer(UID removedServer) {
TraceEvent("RemovedStorageServer", distributorId).detail("ServerID", removedServer);
@ -2703,6 +2761,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
}
}
server_info.erase(removedServer);
server_and_tss_info.erase(removedServer);
if (server_status.get(removedServer).initialized && server_status.get(removedServer).isUnhealthy()) {
unhealthyServers--;
@ -2726,7 +2785,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
};
TCServerInfo::~TCServerInfo() {
if (collection && ssVersionTooFarBehind.get()) {
if (collection && ssVersionTooFarBehind.get() && !lastKnownInterface.isTss()) {
collection->removeLaggingStorageServer(lastKnownInterface.locality.zoneId().get());
}
}
@ -3359,6 +3418,7 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
.detail("IsReady", self->initialFailureReactionDelay.isReady());
self->traceTeamCollectionInfo();
}
// Check if the number of degraded machines has changed
state vector<Future<Void>> change;
bool anyUndesired = false;
@ -3400,6 +3460,7 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
bool containsFailed = teamContainsFailedServer(self, team);
bool recheck = !healthy && (lastReady != self->initialFailureReactionDelay.isReady() ||
(lastZeroHealthy && !self->zeroHealthyTeams->get()) || containsFailed);
// TraceEvent("TeamHealthChangeDetected", self->distributorId)
// .detail("Team", team->getDesc())
// .detail("ServersLeft", serversLeft)
@ -3764,8 +3825,8 @@ ACTOR Future<Void> waitServerListChange(DDTeamCollection* self,
ProcessClass const& processClass = results[i].second;
if (!self->shouldHandleServer(ssi)) {
continue;
} else if (self->server_info.count(serverId)) {
auto& serverInfo = self->server_info[serverId];
} else if (self->server_and_tss_info.count(serverId)) {
auto& serverInfo = self->server_and_tss_info[serverId];
if (ssi.getValue.getEndpoint() != serverInfo->lastKnownInterface.getValue.getEndpoint() ||
processClass != serverInfo->lastKnownClass.classType()) {
Promise<std::pair<StorageServerInterface, ProcessClass>> currentInterfaceChanged =
@ -3783,7 +3844,9 @@ ACTOR Future<Void> waitServerListChange(DDTeamCollection* self,
self->serverTrackerErrorOut,
tr.getReadVersion().get(),
ddEnabledState);
self->doBuildTeams = true;
if (!ssi.isTss()) {
self->doBuildTeams = true;
}
}
}
@ -3886,16 +3949,17 @@ ACTOR Future<Void> keyValueStoreTypeTracker(DDTeamCollection* self, TCServerInfo
}
ACTOR Future<Void> waitForAllDataRemoved(Database cx, UID serverID, Version addedVersion, DDTeamCollection* teams) {
state Transaction tr(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Version ver = wait(tr.getReadVersion());
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
Version ver = wait(tr->getReadVersion());
// we cannot remove a server immediately after adding it, because a perfectly timed master recovery could
// cause us to not store the mutations sent to the short lived storage server.
if (ver > addedVersion + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS) {
bool canRemove = wait(canRemoveStorageServer(&tr, serverID));
bool canRemove = wait(canRemoveStorageServer(tr, serverID));
// TraceEvent("WaitForAllDataRemoved")
// .detail("Server", serverID)
// .detail("CanRemove", canRemove)
@ -3908,9 +3972,9 @@ ACTOR Future<Void> waitForAllDataRemoved(Database cx, UID serverID, Version adde
// Wait for any change to the serverKeys for this server
wait(delay(SERVER_KNOBS->ALL_DATA_REMOVED_DELAY, TaskPriority::DataDistribution));
tr.reset();
tr->reset();
} catch (Error& e) {
wait(tr.onError(e));
wait(tr->onError(e));
}
}
}
@ -3941,16 +4005,18 @@ ACTOR Future<Void> storageServerFailureTracker(DDTeamCollection* self,
}
}
if (self->server_status.get(interf.id()).initialized) {
bool unhealthy = self->server_status.get(interf.id()).isUnhealthy();
if (unhealthy && !status->isUnhealthy()) {
self->unhealthyServers--;
}
if (!unhealthy && status->isUnhealthy()) {
if (!interf.isTss()) {
if (self->server_status.get(interf.id()).initialized) {
bool unhealthy = self->server_status.get(interf.id()).isUnhealthy();
if (unhealthy && !status->isUnhealthy()) {
self->unhealthyServers--;
}
if (!unhealthy && status->isUnhealthy()) {
self->unhealthyServers++;
}
} else if (status->isUnhealthy()) {
self->unhealthyServers++;
}
} else if (status->isUnhealthy()) {
self->unhealthyServers++;
}
self->server_status.set(interf.id(), *status);
@ -3971,7 +4037,7 @@ ACTOR Future<Void> storageServerFailureTracker(DDTeamCollection* self,
choose {
when(wait(healthChanged)) {
status->isFailed = !status->isFailed;
if (!status->isFailed &&
if (!status->isFailed && !server->lastKnownInterface.isTss() &&
(server->teams.size() < targetTeamNumPerServer || self->lastBuildTeamsFailed)) {
self->doBuildTeams = true;
}
@ -4014,7 +4080,9 @@ ACTOR Future<Void> storageServerTracker(
TCServerInfo* server, // This actor is owned by this TCServerInfo, point to server_info[id]
Promise<Void> errorOut,
Version addedVersion,
const DDEnabledState* ddEnabledState) {
const DDEnabledState* ddEnabledState,
bool isTss) {
state Future<Void> failureTracker;
state ServerStatus status(false, false, server->lastKnownInterface.locality);
state bool lastIsUnhealthy = false;
@ -4022,7 +4090,7 @@ ACTOR Future<Void> storageServerTracker(
state Future<std::pair<StorageServerInterface, ProcessClass>> interfaceChanged = server->onInterfaceChanged;
state Future<Void> storeTypeTracker = keyValueStoreTypeTracker(self, server);
state Future<Void> storeTypeTracker = (isTss) ? Never() : keyValueStoreTypeTracker(self, server);
state bool hasWrongDC = !isCorrectDC(self, server);
state bool hasInvalidLocality =
!self->isValidLocality(self->configuration.storagePolicy, server->lastKnownInterface.locality);
@ -4042,7 +4110,7 @@ ACTOR Future<Void> storageServerTracker(
// dcLocation, interface) is changed.
state std::vector<Future<Void>> otherChanges;
std::vector<Promise<Void>> wakeUpTrackers;
for (const auto& i : self->server_info) {
for (const auto& i : self->server_and_tss_info) {
if (i.second.getPtr() != server &&
i.second->lastKnownInterface.address() == server->lastKnownInterface.address()) {
auto& statusInfo = self->server_status.get(i.first);
@ -4144,11 +4212,11 @@ ACTOR Future<Void> storageServerTracker(
.detail("Excluded", worstAddr.toString());
status.isUndesired = true;
status.isWrongConfiguration = true;
if (worstStatus == DDTeamCollection::Status::FAILED) {
if (worstStatus == DDTeamCollection::Status::FAILED && !isTss) {
TraceEvent(SevWarn, "FailedServerRemoveKeys", self->distributorId)
.detail("Server", server->id)
.detail("Excluded", worstAddr.toString());
wait(delay(0.0)); //Do not throw an error while still inside trackExcludedServers
wait(delay(0.0)); // Do not throw an error while still inside trackExcludedServers
while (!ddEnabledState->isDDEnabled()) {
wait(delay(1.0));
}
@ -4165,7 +4233,7 @@ ACTOR Future<Void> storageServerTracker(
self->restartRecruiting.trigger();
}
if (lastIsUnhealthy && !status.isUnhealthy() &&
if (lastIsUnhealthy && !status.isUnhealthy() && !isTss &&
(server->teams.size() < targetTeamNumPerServer || self->lastBuildTeamsFailed)) {
self->doBuildTeams = true;
self->restartTeamBuilder.trigger(); // This does not trigger building teams if there exist healthy teams
@ -4174,7 +4242,7 @@ ACTOR Future<Void> storageServerTracker(
state bool recordTeamCollectionInfo = false;
choose {
when(wait(failureTracker)) {
when(wait(failureTracker || server->onTSSPairRemoved || server->killTss.getFuture())) {
// The server is failed AND all data has been removed from it, so permanently remove it.
TraceEvent("StatusMapChange", self->distributorId)
.detail("ServerID", server->id)
@ -4185,7 +4253,8 @@ ACTOR Future<Void> storageServerTracker(
}
// Remove server from FF/serverList
wait(removeStorageServer(cx, server->id, self->lock, ddEnabledState));
wait(removeStorageServer(
cx, server->id, server->lastKnownInterface.tssPairID, self->lock, ddEnabledState));
TraceEvent("StatusMapChange", self->distributorId)
.detail("ServerID", server->id)
@ -4193,7 +4262,11 @@ ACTOR Future<Void> storageServerTracker(
// Sets removeSignal (alerting dataDistributionTeamCollection to remove the storage server from its
// own data structures)
server->removed.send(Void());
self->removedServers.send(server->id);
if (isTss) {
self->removedTSS.send(server->id);
} else {
self->removedServers.send(server->id);
}
return Void();
}
when(std::pair<StorageServerInterface, ProcessClass> newInterface = wait(interfaceChanged)) {
@ -4211,7 +4284,7 @@ ACTOR Future<Void> storageServerTracker(
server->lastKnownInterface = newInterface.first;
server->lastKnownClass = newInterface.second;
if (localityChanged) {
if (localityChanged && !isTss) {
TEST(true); // Server locality changed
// The locality change of a server will affect machine teams related to the server if
@ -4303,7 +4376,7 @@ ACTOR Future<Void> storageServerTracker(
recordTeamCollectionInfo = true;
// Restart the storeTracker for the new interface. This will cancel the previous
// keyValueStoreTypeTracker
storeTypeTracker = keyValueStoreTypeTracker(self, server);
storeTypeTracker = (isTss) ? Never() : keyValueStoreTypeTracker(self, server);
hasWrongDC = !isCorrectDC(self, server);
hasInvalidLocality =
!self->isValidLocality(self->configuration.storagePolicy, server->lastKnownInterface.locality);
@ -4350,6 +4423,7 @@ ACTOR Future<Void> storageServerTracker(
// Monitor whether or not storage servers are being recruited. If so, then a database cannot be considered quiet
ACTOR Future<Void> monitorStorageServerRecruitment(DDTeamCollection* self) {
state bool recruiting = false;
state bool lastIsTss = false;
TraceEvent("StorageServerRecruitment", self->distributorId)
.detail("State", "Idle")
.trackLatest("StorageServerRecruitment_" + self->distributorId.toString());
@ -4360,12 +4434,22 @@ ACTOR Future<Void> monitorStorageServerRecruitment(DDTeamCollection* self) {
}
TraceEvent("StorageServerRecruitment", self->distributorId)
.detail("State", "Recruiting")
.detail("IsTSS", self->isTssRecruiting ? "True" : "False")
.trackLatest("StorageServerRecruitment_" + self->distributorId.toString());
recruiting = true;
lastIsTss = self->isTssRecruiting;
} else {
loop {
choose {
when(wait(self->recruitingStream.onChange())) {}
when(wait(self->recruitingStream.onChange())) {
if (lastIsTss != self->isTssRecruiting) {
TraceEvent("StorageServerRecruitment", self->distributorId)
.detail("State", "Recruiting")
.detail("IsTSS", self->isTssRecruiting ? "True" : "False")
.trackLatest("StorageServerRecruitment_" + self->distributorId.toString());
lastIsTss = self->isTssRecruiting;
}
}
when(wait(self->recruitingStream.get() == 0
? delay(SERVER_KNOBS->RECRUITMENT_IDLE_DELAY, TaskPriority::DataDistribution)
: Future<Void>(Never()))) {
@ -4445,7 +4529,7 @@ ACTOR Future<Void> checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) {
int numExistingSSOnAddr(DDTeamCollection* self, const AddressExclusion& addr) {
int numExistingSS = 0;
for (auto& server : self->server_info) {
for (auto& server : self->server_and_tss_info) {
const NetworkAddress& netAddr = server.second->lastKnownInterface.stableAddress();
AddressExclusion usedAddr(netAddr.ip, netAddr.port);
if (usedAddr == addr) {
@ -4456,9 +4540,94 @@ int numExistingSSOnAddr(DDTeamCollection* self, const AddressExclusion& addr) {
return numExistingSS;
}
// All state that represents an ongoing tss pair recruitment
struct TSSPairState : ReferenceCounted<TSSPairState>, NonCopyable {
Promise<Optional<std::pair<UID, Version>>>
ssPairInfo; // if set, for the ss to pass its id to its tss pair once it is successfully recruited
Promise<bool> tssPairDone; // if set, for the tss to signal to its pair ss whether it was successfully recruited
Promise<Void> complete;
Optional<Key> dcId; // dc
Optional<Key> dataHallId; // data hall
bool active;
TSSPairState() : active(false) {}
TSSPairState(const LocalityData& locality)
: active(true), dcId(locality.dcId()), dataHallId(locality.dataHallId()) {}
bool inDataZone(const LocalityData& locality) {
return locality.dcId() == dcId && locality.dataHallId() == dataHallId;
}
void cancel() {
// only cancel if neither promise has been set; otherwise one half of the pair could think it succeeded
// while the other half thinks it failed
if (active && ssPairInfo.canBeSet() && tssPairDone.canBeSet()) {
ssPairInfo.send(Optional<std::pair<UID, Version>>());
// callback of ssPairInfo could have cancelled tssPairDone already, so double check before cancelling
if (tssPairDone.canBeSet()) {
tssPairDone.send(false);
}
if (complete.canBeSet()) {
complete.send(Void());
}
}
}
bool tssRecruitSuccess() {
if (active && tssPairDone.canBeSet()) {
tssPairDone.send(true);
return true;
}
return false;
}
bool tssRecruitFailed() {
if (active && tssPairDone.canBeSet()) {
tssPairDone.send(false);
return true;
}
return false;
}
bool ssRecruitSuccess(std::pair<UID, Version> ssInfo) {
if (active && ssPairInfo.canBeSet()) {
ssPairInfo.send(Optional<std::pair<UID, Version>>(ssInfo));
return true;
}
return false;
}
bool ssRecruitFailed() {
if (active && ssPairInfo.canBeSet()) {
ssPairInfo.send(Optional<std::pair<UID, Version>>());
return true;
}
return false;
}
bool markComplete() {
if (active && complete.canBeSet()) {
complete.send(Void());
return true;
}
return false;
}
Future<Optional<std::pair<UID, Version>>> waitOnSS() { return ssPairInfo.getFuture(); }
Future<bool> waitOnTSS() { return tssPairDone.getFuture(); }
Future<Void> waitComplete() { return complete.getFuture(); }
};
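The struct above is essentially two one-shot channels plus a completion signal, with cancel() resolving both sides exactly once so neither half of the pair observes a contradictory outcome. A minimal standalone sketch of the same handshake using std::promise (illustrative only; the names here are invented, and flow's Promise/Future are not std types):

#include <future>
#include <optional>
#include <utility>

// Hypothetical stand-in for TSSPairState's two one-shot channels.
struct PairHandshake {
	// SS id and version on success, or empty on failure.
	std::promise<std::optional<std::pair<int, long>>> ssInfo;
	// Whether the TSS half succeeded.
	std::promise<bool> tssDone;
	bool ssInfoSet = false, tssDoneSet = false;

	// Mirrors TSSPairState::cancel(): only fire if neither side has resolved yet,
	// and resolve both so each half sees a consistent failure.
	void cancel() {
		if (!ssInfoSet && !tssDoneSet) {
			ssInfo.set_value(std::nullopt);
			ssInfoSet = true;
			tssDone.set_value(false);
			tssDoneSet = true;
		}
	}
};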
ACTOR Future<Void> initializeStorage(DDTeamCollection* self,
RecruitStorageReply candidateWorker,
const DDEnabledState* ddEnabledState) {
const DDEnabledState* ddEnabledState,
bool recruitTss,
Reference<TSSPairState> tssState) {
// SOMEDAY: Cluster controller waits for availability, retry quickly if a server's Locality changes
self->recruitingStream.set(self->recruitingStream.get() + 1);
@ -4470,12 +4639,48 @@ ACTOR Future<Void> initializeStorage(DDTeamCollection* self,
// too many storage servers on the same address (i.e., process) can cause OOM.
// Ask the candidateWorker to initialize a SS only if the worker does not have a pending request
state UID interfaceId = deterministicRandom()->randomUniqueID();
InitializeStorageRequest isr;
isr.storeType = self->configuration.storageServerStoreType;
state InitializeStorageRequest isr;
isr.storeType =
recruitTss ? self->configuration.testingStorageServerStoreType : self->configuration.storageServerStoreType;
isr.seedTag = invalidTag;
isr.reqId = deterministicRandom()->randomUniqueID();
isr.interfaceId = interfaceId;
self->recruitingIds.insert(interfaceId);
self->recruitingLocalities.insert(candidateWorker.worker.stableAddress());
// if recruiting a tss, wait for its pair ss to finish and add the ss's id to isr. If the pair fails, don't recruit the tss
state bool doRecruit = true;
if (recruitTss) {
TraceEvent("TSS_Recruit", self->distributorId)
.detail("TSSID", interfaceId)
.detail("Stage", "TSSWaitingPair")
.detail("Addr", candidateWorker.worker.address())
.detail("Locality", candidateWorker.worker.locality.toString());
Optional<std::pair<UID, Version>> ssPairInfoResult = wait(tssState->waitOnSS());
if (ssPairInfoResult.present()) {
isr.tssPairIDAndVersion = ssPairInfoResult.get();
TraceEvent("TSS_Recruit", self->distributorId)
.detail("SSID", ssPairInfoResult.get().first)
.detail("TSSID", interfaceId)
.detail("Stage", "TSSWaitingPair")
.detail("Addr", candidateWorker.worker.address())
.detail("Version", ssPairInfoResult.get().second)
.detail("Locality", candidateWorker.worker.locality.toString());
} else {
doRecruit = false;
TraceEvent(SevWarnAlways, "TSS_RecruitError", self->distributorId)
.detail("TSSID", interfaceId)
.detail("Reason", "SS recruitment failed for some reason")
.detail("Addr", candidateWorker.worker.address())
.detail("Locality", candidateWorker.worker.locality.toString());
}
}
TraceEvent("DDRecruiting")
.detail("Primary", self->primary)
.detail("State", "Sending request to worker")
@ -4483,19 +4688,53 @@ ACTOR Future<Void> initializeStorage(DDTeamCollection* self,
.detail("WorkerLocality", candidateWorker.worker.locality.toString())
.detail("Interf", interfaceId)
.detail("Addr", candidateWorker.worker.address())
.detail("TSS", recruitTss ? "true" : "false")
.detail("RecruitingStream", self->recruitingStream.get());
self->recruitingIds.insert(interfaceId);
self->recruitingLocalities.insert(candidateWorker.worker.stableAddress());
state ErrorOr<InitializeStorageReply> newServer =
wait(candidateWorker.worker.storage.tryGetReply(isr, TaskPriority::DataDistribution));
if (newServer.isError()) {
Future<ErrorOr<InitializeStorageReply>> fRecruit =
doRecruit ? candidateWorker.worker.storage.tryGetReply(isr, TaskPriority::DataDistribution)
: Future<ErrorOr<InitializeStorageReply>>(ErrorOr<InitializeStorageReply>(recruitment_failed()));
state ErrorOr<InitializeStorageReply> newServer = wait(fRecruit);
if (doRecruit && newServer.isError()) {
TraceEvent(SevWarn, "DDRecruitmentError").error(newServer.getError());
if (!newServer.isError(error_code_recruitment_failed) &&
!newServer.isError(error_code_request_maybe_delivered))
throw newServer.getError();
wait(delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskPriority::DataDistribution));
}
if (!recruitTss && newServer.present() &&
tssState->ssRecruitSuccess(std::pair(interfaceId, newServer.get().addedVersion))) {
// SS has a tss pair. Send the pair this SS's id, but try to wait to add the server until the tss is recruited
TraceEvent("TSS_Recruit", self->distributorId)
.detail("SSID", interfaceId)
.detail("Stage", "SSSignaling")
.detail("Addr", candidateWorker.worker.address())
.detail("Locality", candidateWorker.worker.locality.toString());
// wait for the tss pair to be recruited, but eventually move on if the timeout expires first
Optional<bool> tssSuccessful = wait(timeout(tssState->waitOnTSS(), SERVER_KNOBS->TSS_RECRUITMENT_TIMEOUT));
if (tssSuccessful.present() && tssSuccessful.get()) {
TraceEvent("TSS_Recruit", self->distributorId)
.detail("SSID", interfaceId)
.detail("Stage", "SSGotPair")
.detail("Addr", candidateWorker.worker.address())
.detail("Locality", candidateWorker.worker.locality.toString());
} else {
TraceEvent(SevWarn, "TSS_RecruitError", self->distributorId)
.detail("SSID", interfaceId)
.detail("Reason",
tssSuccessful.present() ? "TSS recruitment failed for some reason"
: "TSS recruitment timed out")
.detail("Addr", candidateWorker.worker.address())
.detail("Locality", candidateWorker.worker.locality.toString());
}
}
self->recruitingIds.erase(interfaceId);
self->recruitingLocalities.erase(candidateWorker.worker.stableAddress());
@ -4509,26 +4748,43 @@ ACTOR Future<Void> initializeStorage(DDTeamCollection* self,
.detail("RecruitingStream", self->recruitingStream.get());
if (newServer.present()) {
if (!self->server_info.count(newServer.get().interf.id()))
self->addServer(newServer.get().interf,
candidateWorker.processClass,
self->serverTrackerErrorOut,
newServer.get().addedVersion,
ddEnabledState);
else
TraceEvent(SevWarn, "DDRecruitmentError").detail("Reason", "Server ID already recruited");
self->doBuildTeams = true;
UID id = newServer.get().interf.id();
if (!self->server_and_tss_info.count(id)) {
if (!recruitTss || tssState->tssRecruitSuccess()) {
self->addServer(newServer.get().interf,
candidateWorker.processClass,
self->serverTrackerErrorOut,
newServer.get().addedVersion,
ddEnabledState);
// signal all done after adding tss to tracking info
tssState->markComplete();
}
} else {
TraceEvent(SevWarn, "DDRecruitmentError")
.detail("Reason", "Server ID already recruited")
.detail("ServerID", id);
}
if (!recruitTss) {
self->doBuildTeams = true;
}
}
}
// SS and/or TSS recruitment failed at this point, update tssState
if (recruitTss && tssState->tssRecruitFailed()) {
tssState->markComplete();
TEST(true); // TSS recruitment failed for some reason
}
if (!recruitTss && tssState->ssRecruitFailed()) {
TEST(true); // SS with pair TSS recruitment failed for some reason
}
self->recruitingStream.set(self->recruitingStream.get() - 1);
self->restartRecruiting.trigger();
return Void();
}
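The bounded wait above (timeout(tssState->waitOnTSS(), SERVER_KNOBS->TSS_RECRUITMENT_TIMEOUT) yielding an Optional<bool>) is a wait-with-deadline pattern: an absent result is treated like a failed pair. A rough standard-C++ equivalent, purely as a sketch:

#include <chrono>
#include <future>
#include <optional>

// Returns the TSS outcome if it arrives within the deadline, otherwise empty,
// mirroring how a timed-out Optional<bool> is treated as pair-recruitment failure.
std::optional<bool> waitOnTssWithDeadline(std::future<bool>& tssDone, std::chrono::seconds deadline) {
	if (tssDone.wait_for(deadline) == std::future_status::ready)
		return tssDone.get();
	return std::nullopt;
}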
// Recruit a worker as a storage server
ACTOR Future<Void> storageRecruiter(DDTeamCollection* self,
Reference<AsyncVar<struct ServerDBInfo>> db,
const DDEnabledState* ddEnabledState) {
@ -4536,13 +4792,49 @@ ACTOR Future<Void> storageRecruiter(DDTeamCollection* self,
state RecruitStorageRequest lastRequest;
state bool hasHealthyTeam;
state std::map<AddressExclusion, int> numSSPerAddr;
// tss-specific recruitment state
state int32_t targetTSSInDC = 0;
state int32_t tssToRecruit = 0;
state int inProgressTSSCount = 0;
state PromiseStream<Future<Void>> addTSSInProgress;
state Future<Void> inProgressTSS =
actorCollection(addTSSInProgress.getFuture(), &inProgressTSSCount, nullptr, nullptr, nullptr);
state Reference<TSSPairState> tssState = makeReference<TSSPairState>();
state Future<Void> checkTss = self->initialFailureReactionDelay;
state bool pendingTSSCheck = false;
TraceEvent(SevDebug, "TSS_RecruitUpdated", self->distributorId).detail("Count", tssToRecruit);
loop {
try {
// Divide TSS evenly across DCs if there are multiple
// TODO would it be better to put all of them in primary DC?
targetTSSInDC = self->configuration.desiredTSSCount;
if (self->configuration.usableRegions > 1) {
targetTSSInDC /= self->configuration.usableRegions;
if (self->primary) {
// put extras in primary DC if it's uneven
targetTSSInDC += (self->configuration.desiredTSSCount % self->configuration.usableRegions);
}
}
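// Worked example (illustrative): with desiredTSSCount = 5 and usableRegions = 2, each DC
// targets 5 / 2 = 2 and the primary adds the remainder 5 % 2 = 1, so the primary DC targets
// 3 TSSes and the remote DC targets 2.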
int newTssToRecruit = targetTSSInDC - self->tss_info_by_pair.size() - inProgressTSSCount;
if (newTssToRecruit != tssToRecruit) {
TraceEvent("TSS_RecruitUpdated", self->distributorId).detail("Count", newTssToRecruit);
tssToRecruit = newTssToRecruit;
// if we need to get rid of some TSS processes, signal to either cancel recruitment or kill existing TSS
// processes
if (!pendingTSSCheck && (tssToRecruit < 0 || self->zeroHealthyTeams->get()) &&
(self->isTssRecruiting || (self->zeroHealthyTeams->get() && self->tss_info_by_pair.size() > 0))) {
checkTss = self->initialFailureReactionDelay;
}
}
numSSPerAddr.clear();
hasHealthyTeam = (self->healthyTeamCount != 0);
RecruitStorageRequest rsr;
std::set<AddressExclusion> exclusions;
for (auto s = self->server_info.begin(); s != self->server_info.end(); ++s) {
for (auto s = self->server_and_tss_info.begin(); s != self->server_and_tss_info.end(); ++s) {
auto serverStatus = self->server_status.get(s->second->lastKnownInterface.id());
if (serverStatus.excludeOnRecruit()) {
TraceEvent(SevDebug, "DDRecruitExcl1")
@ -4574,7 +4866,7 @@ ACTOR Future<Void> storageRecruiter(DDTeamCollection* self,
exclusions.insert(addr);
}
rsr.criticalRecruitment = self->healthyTeamCount == 0;
rsr.criticalRecruitment = !hasHealthyTeam;
for (auto it : exclusions) {
rsr.excludeAddresses.push_back(it);
}
@ -4611,11 +4903,100 @@ ACTOR Future<Void> storageRecruiter(DDTeamCollection* self,
.detail("Addr", candidateSSAddr.toString())
.detail("NumExistingSS", numExistingSS);
}
self->addActor.send(initializeStorage(self, candidateWorker, ddEnabledState));
if (hasHealthyTeam && !tssState->active && tssToRecruit > 0) {
TraceEvent("TSS_Recruit", self->distributorId)
.detail("Stage", "HoldTSS")
.detail("Addr", candidateSSAddr.toString())
.detail("Locality", candidateWorker.worker.locality.toString());
TEST(true); // Starting TSS recruitment
self->isTssRecruiting = true;
tssState = makeReference<TSSPairState>(candidateWorker.worker.locality);
addTSSInProgress.send(tssState->waitComplete());
self->addActor.send(initializeStorage(self, candidateWorker, ddEnabledState, true, tssState));
checkTss = self->initialFailureReactionDelay;
} else {
if (tssState->active && tssState->inDataZone(candidateWorker.worker.locality)) {
TEST(true); // TSS recruits pair in same dc/datahall
self->isTssRecruiting = false;
TraceEvent("TSS_Recruit", self->distributorId)
.detail("Stage", "PairSS")
.detail("Addr", candidateSSAddr.toString())
.detail("Locality", candidateWorker.worker.locality.toString());
self->addActor.send(
initializeStorage(self, candidateWorker, ddEnabledState, false, tssState));
// successfully started recruitment of pair, reset tss recruitment state
tssState = makeReference<TSSPairState>();
} else {
TEST(tssState->active); // TSS recruitment skipped potential pair because it's in a
// different dc/datahall
self->addActor.send(initializeStorage(
self, candidateWorker, ddEnabledState, false, makeReference<TSSPairState>()));
}
}
}
when(wait(db->onChange())) { // SOMEDAY: only if clusterInterface changes?
fCandidateWorker = Future<RecruitStorageReply>();
}
when(wait(self->zeroHealthyTeams->onChange())) {
if (!pendingTSSCheck && self->zeroHealthyTeams->get() &&
(self->isTssRecruiting || self->tss_info_by_pair.size() > 0)) {
checkTss = self->initialFailureReactionDelay;
}
}
when(wait(checkTss)) {
bool cancelTss = self->isTssRecruiting && (tssToRecruit < 0 || self->zeroHealthyTeams->get());
// Can't kill more TSSes than we have. Kill 1 if there are zero healthy teams; otherwise kill
// enough to get back to the desired count
int tssToKill = std::min((int)self->tss_info_by_pair.size(),
std::max(-tssToRecruit, self->zeroHealthyTeams->get() ? 1 : 0));
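// Worked example (illustrative): with 2 TSSes in tss_info_by_pair and tssToRecruit = -3,
// tssToKill = min(2, max(3, 0)) = 2; with tssToRecruit = 0 and zero healthy teams,
// tssToKill = min(2, max(0, 1)) = 1.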
if (cancelTss) {
TEST(tssToRecruit < 0); // tss recruitment cancelled due to too many TSS
TEST(self->zeroHealthyTeams->get()); // tss recruitment cancelled due to zero healthy teams
TraceEvent(SevWarn, "TSS_RecruitCancelled", self->distributorId)
.detail("Reason", tssToRecruit <= 0 ? "TooMany" : "ZeroHealthyTeams");
tssState->cancel();
tssState = makeReference<TSSPairState>();
self->isTssRecruiting = false;
pendingTSSCheck = true;
checkTss = delay(SERVER_KNOBS->TSS_DD_CHECK_INTERVAL);
} else if (tssToKill > 0) {
auto itr = self->tss_info_by_pair.begin();
for (int i = 0; i < tssToKill; i++, itr++) {
UID tssId = itr->second->id;
StorageServerInterface tssi = itr->second->lastKnownInterface;
if (self->shouldHandleServer(tssi) && self->server_and_tss_info.count(tssId)) {
Promise<Void> killPromise = itr->second->killTss;
if (killPromise.canBeSet()) {
TEST(tssToRecruit < 0); // Killing TSS due to too many TSS
TEST(self->zeroHealthyTeams->get()); // Killing TSS due to zero healthy teams
TraceEvent(SevWarn, "TSS_DDKill", self->distributorId)
.detail("TSSID", tssId)
.detail("Reason",
self->zeroHealthyTeams->get() ? "ZeroHealthyTeams" : "TooMany");
killPromise.send(Void());
}
}
}
// If we're killing a TSS because of zero healthy teams, wait a bit to give the replacing SS a
// chance to join teams before killing another TSS
pendingTSSCheck = true;
checkTss = delay(SERVER_KNOBS->TSS_DD_CHECK_INTERVAL);
} else if (self->isTssRecruiting) {
// check again later in case we need to cancel recruitment
pendingTSSCheck = true;
checkTss = delay(SERVER_KNOBS->TSS_DD_CHECK_INTERVAL);
// FIXME: better way to do this than timer?
} else {
pendingTSSCheck = false;
checkTss = Never();
}
}
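// To summarize the checkTss branches above: cancel in-flight TSS recruitment, kill surplus or
// stuck TSSes, re-check after TSS_DD_CHECK_INTERVAL while recruitment is still active, or park
// on Never() until a recruitment update or a zeroHealthyTeams change re-arms the check.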
when(wait(self->restartRecruiting.onTrigger())) {}
}
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY, TaskPriority::DataDistribution));
@ -4760,6 +5141,13 @@ ACTOR Future<Void> dataDistributionTeamCollection(Reference<DDTeamCollection> te
self->restartRecruiting.trigger();
}
when(UID removedTSS = waitNext(self->removedTSS.getFuture())) {
TEST(true); // TSS removed from database
self->removeTSS(removedTSS);
serverRemoved.send(Void());
self->restartRecruiting.trigger();
}
when(wait(self->zeroHealthyTeams->onChange())) {
if (self->zeroHealthyTeams->get()) {
self->restartRecruiting.trigger();
@ -5265,11 +5653,13 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self,
if (removeFailedServer.getFuture().isReady() && !removeFailedServer.getFuture().isError()) {
TraceEvent("RemoveFailedServer", removeFailedServer.getFuture().get()).error(err);
wait(removeKeysFromFailedServer(cx, removeFailedServer.getFuture().get(), lock, ddEnabledState));
wait(removeStorageServer(cx, removeFailedServer.getFuture().get(), lock, ddEnabledState));
Optional<UID> tssPairID;
wait(removeStorageServer(cx, removeFailedServer.getFuture().get(), tssPairID, lock, ddEnabledState));
} else {
if (err.code() != error_code_movekeys_conflict) {
throw err;
}
bool ddEnabled = wait(isDataDistributionEnabled(cx, ddEnabledState));
TraceEvent("DataDistributionMoveKeysConflict").detail("DataDistributionEnabled", ddEnabled).error(err);
if (ddEnabled) {
@ -5920,4 +6310,4 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") {
ASSERT(result == 8);
return Void();
}
}
@ -371,9 +371,9 @@ public:
const T* upperBound() const { return upper; }
DeltaTree* tree;
Arena arena;
private:
Arena arena;
DecodedNode* root;
const T* lower;
const T* upper;
@ -148,7 +148,10 @@ ACTOR Future<int> spawnProcess(std::string path,
state pid_t pid = pidAndReadFD.first;
state Optional<int> readFD = pidAndReadFD.second;
if (pid == -1) {
TraceEvent(SevWarnAlways, "SpawnProcess: Command failed to spawn").detail("Cmd", path).detail("Args", allArgs);
TraceEvent(SevWarnAlways, "SpawnProcessFailure")
.detail("Reason", "Command failed to spawn")
.detail("Cmd", path)
.detail("Args", allArgs);
return -1;
} else if (pid > 0) {
state int status = -1;
@ -160,7 +163,8 @@ ACTOR Future<int> spawnProcess(std::string path,
if (runTime > maxWaitTime) {
// timing out
TraceEvent(SevWarnAlways, "SpawnProcess : Command failed, timeout")
TraceEvent(SevWarnAlways, "SpawnProcessFailure")
.detail("Reason", "Command failed, timeout")
.detail("Cmd", path)
.detail("Args", allArgs);
return -1;
@ -175,9 +179,10 @@ ACTOR Future<int> spawnProcess(std::string path,
}
if (err < 0) {
TraceEvent event(SevWarnAlways, "SpawnProcess : Command failed");
TraceEvent event(SevWarnAlways, "SpawnProcessFailure");
setupTraceWithOutput(event, bytesRead, outputBuffer);
event.detail("Cmd", path)
event.detail("Reason", "Command failed")
.detail("Cmd", path)
.detail("Args", allArgs)
.detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : -1);
return -1;
@ -194,14 +199,15 @@ ACTOR Future<int> spawnProcess(std::string path,
} else {
// child process completed
if (!(WIFEXITED(status) && WEXITSTATUS(status) == 0)) {
TraceEvent event(SevWarnAlways, "SpawnProcess : Command failed");
TraceEvent event(SevWarnAlways, "SpawnProcessFailure");
setupTraceWithOutput(event, bytesRead, outputBuffer);
event.detail("Cmd", path)
event.detail("Reason", "Command failed")
.detail("Cmd", path)
.detail("Args", allArgs)
.detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : -1);
return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
}
TraceEvent event("SpawnProcess : Command status");
TraceEvent event("SpawnProcessCommandStatus");
setupTraceWithOutput(event, bytesRead, outputBuffer);
event.detail("Cmd", path)
.detail("Args", allArgs)
@ -26,6 +26,7 @@
#include "flow/flow.h"
#include "fdbclient/FDBTypes.h"
#include "flow/crc32c.h"
#ifndef VALGRIND
#define VALGRIND_MAKE_MEM_UNDEFINED(x, y)
@ -36,39 +37,98 @@ typedef uint32_t LogicalPageID;
typedef uint32_t PhysicalPageID;
#define invalidLogicalPageID std::numeric_limits<LogicalPageID>::max()
class IPage {
// Represents a block of memory in a 4096-byte aligned location held by an Arena.
class ArenaPage : public ReferenceCounted<ArenaPage>, public FastAllocated<ArenaPage> {
public:
IPage() : userData(nullptr) {}
// The page's logical size includes an opaque checksum; use size() to get the usable size
ArenaPage(int logicalSize, int bufferSize) : logicalSize(logicalSize), bufferSize(bufferSize), userData(nullptr) {
if (bufferSize > 0) {
buffer = (uint8_t*)arena.allocate4kAlignedBuffer(bufferSize);
virtual uint8_t const* begin() const = 0;
virtual uint8_t* mutate() = 0;
// Mark any unused page portion defined
VALGRIND_MAKE_MEM_DEFINED(buffer + logicalSize, bufferSize - logicalSize);
} else {
buffer = nullptr;
}
};
// Must return the same size for all pages created by the same pager instance
virtual int size() const = 0;
StringRef asStringRef() const { return StringRef(begin(), size()); }
virtual ~IPage() {
~ArenaPage() {
if (userData != nullptr && userDataDestructor != nullptr) {
userDataDestructor(userData);
}
}
virtual Reference<IPage> clone() const = 0;
uint8_t const* begin() const { return (uint8_t*)buffer; }
virtual void addref() const = 0;
virtual void delref() const = 0;
uint8_t* mutate() { return (uint8_t*)buffer; }
typedef uint32_t Checksum;
// Usable size, without checksum
int size() const { return logicalSize - sizeof(Checksum); }
Standalone<StringRef> asStringRef() const { return Standalone<StringRef>(StringRef(begin(), size()), arena); }
// Get an ArenaPage which is a copy of this page, in its own Arena
Reference<ArenaPage> cloneContents() const {
ArenaPage* p = new ArenaPage(logicalSize, bufferSize);
memcpy(p->buffer, buffer, logicalSize);
return Reference<ArenaPage>(p);
}
// Get an ArenaPage which depends on this page's Arena and references some of its memory
Reference<ArenaPage> subPage(int offset, int len) const {
ArenaPage* p = new ArenaPage(len, 0);
p->buffer = buffer + offset;
p->arena.dependsOn(arena);
return Reference<ArenaPage>(p);
}
// Given a vector of pages with the same ->size(), create a new ArenaPage with a ->size() that is
// equivalent to all of the input pages and has all of their contents copied into it.
static Reference<ArenaPage> concatPages(const std::vector<Reference<const ArenaPage>>& pages) {
int usableSize = pages.front()->size();
int totalUsableSize = pages.size() * usableSize;
int totalBufferSize = pages.front()->bufferSize * pages.size();
ArenaPage* superpage = new ArenaPage(totalUsableSize + sizeof(Checksum), totalBufferSize);
uint8_t* wptr = superpage->mutate();
for (auto& p : pages) {
ASSERT(p->size() == usableSize);
memcpy(wptr, p->begin(), usableSize);
wptr += usableSize;
}
return Reference<ArenaPage>(superpage);
}
Checksum& getChecksum() { return *(Checksum*)(buffer + size()); }
Checksum calculateChecksum(LogicalPageID pageID) { return crc32c_append(pageID, buffer, size()); }
void updateChecksum(LogicalPageID pageID) { getChecksum() = calculateChecksum(pageID); }
bool verifyChecksum(LogicalPageID pageID) { return getChecksum() == calculateChecksum(pageID); }
const Arena& getArena() const { return arena; }
private:
Arena arena;
int logicalSize;
int bufferSize;
uint8_t* buffer;
public:
mutable void* userData;
mutable void (*userDataDestructor)(void*);
};
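The layout contract here is that logicalSize counts a trailing 4-byte CRC, so size() exposes logicalSize - sizeof(Checksum) usable bytes and getChecksum() aliases the buffer's tail; concatPages likewise sizes its superpage as n * usableSize + sizeof(Checksum). A tiny self-contained sketch of the same trailing-checksum layout (illustrative; the fixed size and names are invented):

#include <cstdint>

struct TrailingChecksumPage {
	using Checksum = uint32_t;
	static constexpr int kLogicalSize = 4096; // assumed page size, for illustration

	uint8_t buffer[kLogicalSize];

	// Usable bytes exclude the checksum stored in the last sizeof(Checksum) bytes.
	int size() const { return kLogicalSize - int(sizeof(Checksum)); }
	Checksum& checksum() { return *reinterpret_cast<Checksum*>(buffer + size()); }
};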
class IPagerSnapshot {
public:
virtual Future<Reference<const IPage>> getPhysicalPage(LogicalPageID pageID,
bool cacheable,
bool nohit,
bool* fromCache = nullptr) = 0;
virtual Future<Reference<const ArenaPage>> getPhysicalPage(LogicalPageID pageID,
bool cacheable,
bool nohit,
bool* fromCache = nullptr) = 0;
virtual bool tryEvictPage(LogicalPageID id) = 0;
virtual Version getVersion() const = 0;
@ -83,8 +143,8 @@ public:
// This API is probably too customized to the behavior of DWALPager and needs some changes to be more generic.
class IPager2 : public IClosable {
public:
// Returns an IPage that can be passed to writePage. The data in the returned IPage might not be zeroed.
virtual Reference<IPage> newPageBuffer() = 0;
// Returns an ArenaPage that can be passed to writePage. The data in the returned ArenaPage might not be zeroed.
virtual Reference<ArenaPage> newPageBuffer() = 0;
// Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead).
// For a given pager instance, separate calls to this function must return the same value.
@ -98,13 +158,13 @@ public:
// Replace the contents of a page with new data across *all* versions.
// Existing holders of a page reference for pageID, read from any version,
// may see the effects of this write.
virtual void updatePage(LogicalPageID pageID, Reference<IPage> data) = 0;
virtual void updatePage(LogicalPageID pageID, Reference<ArenaPage> data) = 0;
// Try to atomically update the contents of a page as of version v in the next commit.
// If the pager is unable to do this at this time, it may choose to write the data to a new page ID
// instead and return the new page ID to the caller. Otherwise the original pageID argument will be returned.
// If a new page ID is returned, the old page ID will be freed as of version v
virtual Future<LogicalPageID> atomicUpdatePage(LogicalPageID pageID, Reference<IPage> data, Version v) = 0;
virtual Future<LogicalPageID> atomicUpdatePage(LogicalPageID pageID, Reference<ArenaPage> data, Version v) = 0;
// Free pageID to be used again after the commit that moves oldestVersion past v
virtual void freePage(LogicalPageID pageID, Version v) = 0;
@ -120,10 +180,10 @@ public:
// Cacheable indicates that the page should be added to the page cache (if applicable?) as a result of this read.
// NoHit indicates that the read should not be considered a cache hit, such as when preloading pages that are
// considered likely to be needed soon.
virtual Future<Reference<IPage>> readPage(LogicalPageID pageID,
bool cacheable = true,
bool noHit = false,
bool* fromCache = nullptr) = 0;
virtual Future<Reference<ArenaPage>> readPage(LogicalPageID pageID,
bool cacheable = true,
bool noHit = false,
bool* fromCache = nullptr) = 0;
// Get a snapshot of the metakey and all pages as of the version v which must be >= getOldestVersion()
// Note that snapshots at any version may still see the results of updatePage() calls.
@ -217,6 +217,9 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( SERVER_LIST_DELAY, 1.0 );
init( RECRUITMENT_IDLE_DELAY, 1.0 );
init( STORAGE_RECRUITMENT_DELAY, 10.0 );
init( TSS_HACK_IDENTITY_MAPPING, false ); // THIS SHOULD NEVER BE SET IN PROD. Only for performance testing
init( TSS_RECRUITMENT_TIMEOUT, 3*STORAGE_RECRUITMENT_DELAY ); if (randomize && BUGGIFY ) TSS_RECRUITMENT_TIMEOUT = 1.0; // Super low timeout should cause tss recruitments to fail
init( TSS_DD_CHECK_INTERVAL, 60.0 ); if (randomize && BUGGIFY ) TSS_DD_CHECK_INTERVAL = 1.0; // May kill all TSS quickly
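// Illustrative arithmetic: with the STORAGE_RECRUITMENT_DELAY default of 10.0 above,
// TSS_RECRUITMENT_TIMEOUT defaults to 3 * 10.0 = 30.0 seconds unless buggified down to 1.0.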
init( DATA_DISTRIBUTION_LOGGING_INTERVAL, 5.0 );
init( DD_ENABLED_CHECK_DELAY, 1.0 );
init( DD_STALL_CHECK_DELAY, 0.4 ); //Must be larger than 2*MAX_BUGGIFIED_DELAY
@ -631,6 +634,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
// Coordination
init( COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL, 1.0 ); if( randomize && BUGGIFY ) COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL = 10.0;
init( ENABLE_CROSS_CLUSTER_SUPPORT, true ); if( randomize && BUGGIFY ) ENABLE_CROSS_CLUSTER_SUPPORT = false;
// Buggification
init( BUGGIFIED_EVENTUAL_CONSISTENCY, 1.0 );
@ -167,6 +167,9 @@ public:
double SERVER_LIST_DELAY;
double RECRUITMENT_IDLE_DELAY;
double STORAGE_RECRUITMENT_DELAY;
bool TSS_HACK_IDENTITY_MAPPING;
double TSS_RECRUITMENT_TIMEOUT;
double TSS_DD_CHECK_INTERVAL;
double DATA_DISTRIBUTION_LOGGING_INTERVAL;
double DD_ENABLED_CHECK_DELAY;
double DD_STALL_CHECK_DELAY;
@ -559,6 +562,8 @@ public:
// Coordination
double COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL;
bool ENABLE_CROSS_CLUSTER_SUPPORT; // Allow a coordinator to serve requests whose connection string does not match
// the local descriptor
// Buggification
double BUGGIFIED_EVENTUAL_CONSISTENCY;
@ -410,6 +410,8 @@ struct ILogSystem {
virtual Optional<UID> getPrimaryPeekLocation() const = 0;
virtual Optional<UID> getCurrentPeekLocation() const = 0;
virtual void addref() = 0;
virtual void delref() = 0;
@ -473,6 +475,7 @@ struct ILogSystem {
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
void addref() override { ReferenceCounted<ServerPeekCursor>::addref(); }
@ -534,6 +537,7 @@ struct ILogSystem {
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
void addref() override { ReferenceCounted<MergedPeekCursor>::addref(); }
@ -589,6 +593,7 @@ struct ILogSystem {
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
void addref() override { ReferenceCounted<SetPeekCursor>::addref(); }
@ -620,6 +625,7 @@ struct ILogSystem {
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
void addref() override { ReferenceCounted<MultiCursor>::addref(); }
@ -698,6 +704,7 @@ struct ILogSystem {
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
void addref() override { ReferenceCounted<BufferedCursor>::addref(); }
@ -393,12 +393,16 @@ Version ILogSystem::ServerPeekCursor::getMinKnownCommittedVersion() const {
}
Optional<UID> ILogSystem::ServerPeekCursor::getPrimaryPeekLocation() const {
if (interf) {
if (interf && interf->get().present()) {
return interf->get().id();
}
return Optional<UID>();
}
Optional<UID> ILogSystem::ServerPeekCursor::getCurrentPeekLocation() const {
return ILogSystem::ServerPeekCursor::getPrimaryPeekLocation();
}
Version ILogSystem::ServerPeekCursor::popped() const {
return poppedVersion;
}
@ -673,6 +677,13 @@ Optional<UID> ILogSystem::MergedPeekCursor::getPrimaryPeekLocation() const {
return Optional<UID>();
}
Optional<UID> ILogSystem::MergedPeekCursor::getCurrentPeekLocation() const {
if (currentCursor >= 0) {
return serverCursors[currentCursor]->getPrimaryPeekLocation();
}
return Optional<UID>();
}
Version ILogSystem::MergedPeekCursor::popped() const {
Version poppedVersion = 0;
for (auto& c : serverCursors)
@ -1023,6 +1034,13 @@ Optional<UID> ILogSystem::SetPeekCursor::getPrimaryPeekLocation() const {
return Optional<UID>();
}
Optional<UID> ILogSystem::SetPeekCursor::getCurrentPeekLocation() const {
if (currentCursor >= 0 && currentSet >= 0) {
return serverCursors[currentSet][currentCursor]->getPrimaryPeekLocation();
}
return Optional<UID>();
}
Version ILogSystem::SetPeekCursor::popped() const {
Version poppedVersion = 0;
for (auto& cursors : serverCursors) {
@ -1123,6 +1141,10 @@ Optional<UID> ILogSystem::MultiCursor::getPrimaryPeekLocation() const {
return cursors.back()->getPrimaryPeekLocation();
}
Optional<UID> ILogSystem::MultiCursor::getCurrentPeekLocation() const {
return cursors.back()->getCurrentPeekLocation();
}
Version ILogSystem::MultiCursor::popped() const {
return std::max(poppedVersion, cursors.back()->popped());
}
@ -1403,6 +1425,10 @@ Optional<UID> ILogSystem::BufferedCursor::getPrimaryPeekLocation() const {
return Optional<UID>();
}
Optional<UID> ILogSystem::BufferedCursor::getCurrentPeekLocation() const {
return Optional<UID>();
}
Version ILogSystem::BufferedCursor::popped() const {
if (initialPoppedVersion == poppedVersion) {
return 0;
@ -20,9 +20,11 @@
#include "flow/Util.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbclient/KeyBackedTypes.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/TSSMappingUtil.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
using std::max;
@ -158,7 +160,7 @@ ACTOR Future<Optional<UID>> checkReadWrite(Future<ErrorOr<GetShardStateReply>> f
return Optional<UID>(uid);
}
Future<Void> removeOldDestinations(Transaction* tr,
Future<Void> removeOldDestinations(Reference<ReadYourWritesTransaction> tr,
UID oldDest,
VectorRef<KeyRangeRef> shards,
KeyRangeRef currentKeys) {
@ -235,7 +237,7 @@ ACTOR Future<vector<UID>> addReadWriteDestinations(KeyRangeRef shard,
}
ACTOR Future<vector<vector<UID>>> additionalSources(RangeResult shards,
Transaction* tr,
Reference<ReadYourWritesTransaction> tr,
int desiredHealthy,
int maxServers) {
state RangeResult UIDtoTagMap = wait(tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY));
@ -320,6 +322,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
MoveKeysLock lock,
FlowLock* startMoveKeysLock,
UID relocationIntervalId,
std::map<UID, StorageServerInterface>* tssMapping,
const DDEnabledState* ddEnabledState) {
state TraceInterval interval("RelocateShard_StartMoveKeys");
state Future<Void> warningLogger = logWarningAfter("StartMoveKeysTooLong", 600, servers);
@ -327,6 +330,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
wait(startMoveKeysLock->take(TaskPriority::DataDistributionLaunch));
state FlowLock::Releaser releaser(*startMoveKeysLock);
state bool loadedTssMapping = false;
TraceEvent(SevDebug, interval.begin(), relocationIntervalId);
@ -343,7 +347,8 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
TEST(begin > keys.begin); // Multi-transactional startMoveKeys
batches++;
state Transaction tr(occ);
// RYW to optimize re-reading the same key ranges
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(occ);
state int retries = 0;
loop {
@ -356,15 +361,22 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
// Keep track of shards for all src servers so that we can preserve their values in serverKeys
state Map<UID, VectorRef<KeyRangeRef>> shardMap;
tr.info.taskID = TaskPriority::MoveKeys;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->getTransaction().info.taskID = TaskPriority::MoveKeys;
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
wait(checkMoveKeysLock(&tr, lock, ddEnabledState));
wait(checkMoveKeysLock(&(tr->getTransaction()), lock, ddEnabledState));
if (!loadedTssMapping) {
// share the transaction for loading the tss mapping with the rest of startMoveKeys
wait(readTSSMappingRYW(tr, tssMapping));
loadedTssMapping = true;
}
vector<Future<Optional<Value>>> serverListEntries;
serverListEntries.reserve(servers.size());
for (int s = 0; s < servers.size(); s++)
serverListEntries.push_back(tr.get(serverListKeyFor(servers[s])));
serverListEntries.push_back(tr->get(serverListKeyFor(servers[s])));
state vector<Optional<Value>> serverListValues = wait(getAll(serverListEntries));
for (int s = 0; s < serverListValues.size(); s++) {
@ -380,7 +392,8 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
// Get all existing shards overlapping keys (exclude any that have been processed in a previous
// iteration of the outer loop)
state KeyRange currentKeys = KeyRangeRef(begin, keys.end);
state RangeResult old = wait(krmGetRanges(&tr,
state RangeResult old = wait(krmGetRanges(tr,
keyServersPrefix,
currentKeys,
SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT,
@ -399,10 +412,10 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
// printf("'%s': '%s'\n", old[i].key.toString().c_str(), old[i].value.toString().c_str());
// Check that enough servers for each shard are in the correct state
state RangeResult UIDtoTagMap = wait(tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY));
state RangeResult UIDtoTagMap = wait(tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY);
vector<vector<UID>> addAsSource = wait(additionalSources(
old, &tr, servers.size(), SERVER_KNOBS->MAX_ADDED_SOURCES_MULTIPLIER * servers.size()));
old, tr, servers.size(), SERVER_KNOBS->MAX_ADDED_SOURCES_MULTIPLIER * servers.size()));
// For each intersecting range, update keyServers[range] dest to be servers and clear existing dest
// servers from serverKeys
@ -417,7 +430,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
// .detail("KeyEnd", rangeIntersectKeys.end.toString())
// .detail("OldSrc", describe(src))
// .detail("OldDest", describe(dest))
// .detail("ReadVersion", tr.getReadVersion().get());
// .detail("ReadVersion", tr->getReadVersion().get());
for (auto& uid : addAsSource[i]) {
src.push_back(uid);
@ -425,7 +438,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
uniquify(src);
// Update dest servers for this range to be equal to servers
krmSetPreviouslyEmptyRange(&tr,
krmSetPreviouslyEmptyRange(&(tr->getTransaction()),
keyServersPrefix,
rangeIntersectKeys,
keyServersValue(UIDtoTagMap, src, servers),
@ -455,7 +468,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
vector<Future<Void>> actors;
for (oldDest = oldDests.begin(); oldDest != oldDests.end(); ++oldDest)
if (std::find(servers.begin(), servers.end(), *oldDest) == servers.end())
actors.push_back(removeOldDestinations(&tr, *oldDest, shardMap[*oldDest], currentKeys));
actors.push_back(removeOldDestinations(tr, *oldDest, shardMap[*oldDest], currentKeys));
// Update serverKeys to include keys (or the currently processed subset of keys) for each SS in
// servers
@ -464,12 +477,12 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
// to have the same shard boundaries. If that invariant was important, we would have to move this
// inside the loop above and also set it for the src servers
actors.push_back(krmSetRangeCoalescing(
&tr, serverKeysPrefixFor(servers[i]), currentKeys, allKeys, serverKeysTrue));
tr, serverKeysPrefixFor(servers[i]), currentKeys, allKeys, serverKeysTrue));
}
wait(waitForAll(actors));
wait(tr.commit());
wait(tr->commit());
/*TraceEvent("StartMoveKeysCommitDone", relocationIntervalId)
.detail("CommitVersion", tr.getCommittedVersion())
@ -481,7 +494,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
state Error err = e;
if (err.code() == error_code_move_to_removed_server)
throw;
wait(tr.onError(e));
wait(tr->onError(e));
if (retries % 10 == 0) {
TraceEvent(
@ -500,7 +513,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
}
// printf("Committed moving '%s'-'%s' (version %lld)\n", keys.begin.toString().c_str(),
// keys.end.toString().c_str(), tr.getCommittedVersion());
// keys.end.toString().c_str(), tr->getCommittedVersion());
TraceEvent(SevDebug, interval.end(), relocationIntervalId)
.detail("Batches", batches)
.detail("Shards", shards)
@ -536,11 +549,14 @@ ACTOR Future<Void> waitForShardReady(StorageServerInterface server,
}
}
// best effort to also wait for TSS on data move
ACTOR Future<Void> checkFetchingState(Database cx,
vector<UID> dest,
KeyRange keys,
Promise<Void> dataMovementComplete,
UID relocationIntervalId) {
UID relocationIntervalId,
std::map<UID, StorageServerInterface> tssMapping) {
state Transaction tr(cx);
loop {
@ -557,6 +573,7 @@ ACTOR Future<Void> checkFetchingState(Database cx,
serverListEntries.push_back(tr.get(serverListKeyFor(dest[s])));
state vector<Optional<Value>> serverListValues = wait(getAll(serverListEntries));
vector<Future<Void>> requests;
state vector<Future<Void>> tssRequests;
for (int s = 0; s < serverListValues.size(); s++) {
if (!serverListValues[s].present()) {
// FIXME: Is this the right behavior? dataMovementComplete will never be sent!
@ -567,10 +584,25 @@ ACTOR Future<Void> checkFetchingState(Database cx,
ASSERT(si.id() == dest[s]);
requests.push_back(
waitForShardReady(si, keys, tr.getReadVersion().get(), GetShardStateRequest::FETCHING));
auto tssPair = tssMapping.find(si.id());
if (tssPair != tssMapping.end()) {
tssRequests.push_back(waitForShardReady(
tssPair->second, keys, tr.getReadVersion().get(), GetShardStateRequest::FETCHING));
}
}
wait(timeoutError(waitForAll(requests), SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, TaskPriority::MoveKeys));
// If the normal servers respond successfully, give TSS data movement a bit of a chance, but don't
// block on it, and ignore errors in tss requests
if (tssRequests.size()) {
wait(timeout(waitForAllReady(tssRequests),
SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT / 2,
Void(),
TaskPriority::MoveKeys));
}
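// Illustrative timing: the SS readiness quorum gets the full SERVER_READY_QUORUM_TIMEOUT,
// while the best-effort TSS wait above gets half of it and swallows errors via waitForAllReady.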
dataMovementComplete.send(Void());
return Void();
} catch (Error& e) {
@ -593,6 +625,7 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
FlowLock* finishMoveKeysParallelismLock,
bool hasRemote,
UID relocationIntervalId,
std::map<UID, StorageServerInterface> tssMapping,
const DDEnabledState* ddEnabledState) {
state TraceInterval interval("RelocateShard_FinishMoveKeys");
state TraceInterval waitInterval("");
@ -602,6 +635,11 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
state int retries = 0;
state FlowLock::Releaser releaser;
state std::vector<std::pair<UID, UID>> tssToKill;
state std::unordered_set<UID> tssToIgnore;
// try waiting for tss for 2 loops; give up if they're stuck, so as not to hold up the rest of the cluster
state int waitForTSSCounter = 2;
ASSERT(!destinationTeam.empty());
try {
@ -616,9 +654,26 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
state Transaction tr(occ);
// printf("finishMoveKeys( '%s'-'%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str());
// printf("finishMoveKeys( '%s'-'%s' )\n", begin.toString().c_str(), keys.end.toString().c_str());
loop {
try {
if (tssToKill.size()) {
TEST(true); // killing TSS because they were unavailable for movekeys
// Kill tss BEFORE committing the main txn so that client requests don't make it to a tss whose
// shard set differs from its pair's. Use a separate RYW transaction rather than converting this
// whole method to RYW, which would risk introducing bugs. A separate transaction also commits
// earlier, which we may need in order to guarantee the tss is removed before a client sends a
// request for this key range to the new SS
wait(removeTSSPairsFromCluster(occ, tssToKill));
for (auto& tssPair : tssToKill) {
TraceEvent(SevWarnAlways, "TSS_KillMoveKeys").detail("TSSID", tssPair.second);
tssToIgnore.insert(tssPair.second);
}
tssToKill.clear();
}
tr.info.taskID = TaskPriority::MoveKeys;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
@ -763,6 +818,8 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
// between
// now and when this transaction commits.
state vector<Future<Void>> serverReady; // only for count below
state vector<Future<Void>> tssReady; // for waiting in parallel with tss
state vector<StorageServerInterface> tssReadyInterfs;
state vector<UID> newDestinations;
std::set<UID> completeSrcSet(completeSrc.begin(), completeSrc.end());
for (auto& it : dest) {
@ -789,22 +846,95 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
storageServerInterfaces.push_back(si);
}
// update client info in case tss mapping changed or server got updated
// Wait for new destination servers to fetch the keys
serverReady.reserve(storageServerInterfaces.size());
for (int s = 0; s < storageServerInterfaces.size(); s++)
tssReady.reserve(storageServerInterfaces.size());
tssReadyInterfs.reserve(storageServerInterfaces.size());
for (int s = 0; s < storageServerInterfaces.size(); s++) {
serverReady.push_back(waitForShardReady(storageServerInterfaces[s],
keys,
tr.getReadVersion().get(),
GetShardStateRequest::READABLE));
wait(timeout(waitForAll(serverReady),
auto tssPair = tssMapping.find(storageServerInterfaces[s].id());
if (tssPair != tssMapping.end() && waitForTSSCounter > 0 &&
!tssToIgnore.count(tssPair->second.id())) {
tssReadyInterfs.push_back(tssPair->second);
tssReady.push_back(waitForShardReady(
tssPair->second, keys, tr.getReadVersion().get(), GetShardStateRequest::READABLE));
}
}
// Wait for all storage server moves, and explicitly swallow errors for tss ones with
// waitForAllReady. If this takes too long, the transaction will time out and retry, which is ok
wait(timeout(waitForAll(serverReady) && waitForAllReady(tssReady),
SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT,
Void(),
TaskPriority::MoveKeys));
// Check to see if we're waiting only on tss. If so, decrement the waiting counter.
// If the waiting counter is zero, kill the slow/non-responsive tss processes before finalizing the
// data move.
if (tssReady.size()) {
bool allSSDone = true;
for (auto& f : serverReady) {
allSSDone &= f.isReady() && !f.isError();
if (!allSSDone) {
break;
}
}
if (allSSDone) {
bool anyTssNotDone = false;
for (auto& f : tssReady) {
if (!f.isReady() || f.isError()) {
anyTssNotDone = true;
waitForTSSCounter--;
break;
}
}
if (anyTssNotDone && waitForTSSCounter == 0) {
for (int i = 0; i < tssReady.size(); i++) {
if (!tssReady[i].isReady() || tssReady[i].isError()) {
tssToKill.push_back(
std::pair(tssReadyInterfs[i].tssPairID.get(), tssReadyInterfs[i].id()));
}
}
// repeat the loop from the start to kill these tss' before continuing
continue;
}
}
}
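// Illustrative trace of the counter logic above: waitForTSSCounter starts at 2, so an iteration
// where every SS is ready but some tss is not decrements it; on the second such iteration it
// reaches 0, the laggards are queued in tssToKill, and the continue statement restarts the loop
// to remove them before the data move is finalized.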
int count = dest.size() - newDestinations.size();
for (int s = 0; s < serverReady.size(); s++)
count += serverReady[s].isReady() && !serverReady[s].isError();
// printf(" fMK: moved data to %d/%d servers\n", count, serverReady.size());
int tssCount = 0;
for (int s = 0; s < tssReady.size(); s++)
tssCount += tssReady[s].isReady() && !tssReady[s].isError();
/*if (tssReady.size()) {
printf(" fMK: [%s - %s) moved data to %d/%d servers and %d/%d tss\n",
begin.toString().c_str(),
keys.end.toString().c_str(),
count,
serverReady.size(),
tssCount,
tssReady.size());
} else {
printf(" fMK: [%s - %s) moved data to %d/%d servers\n",
begin.toString().c_str(),
keys.end.toString().c_str(),
count,
serverReady.size());
}*/
TraceEvent(SevDebug, waitInterval.end(), relocationIntervalId).detail("ReadyServers", count);
if (count == dest.size()) {
@ -862,43 +992,48 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
}
ACTOR Future<std::pair<Version, Tag>> addStorageServer(Database cx, StorageServerInterface server) {
state Transaction tr(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
state KeyBackedMap<UID, UID> tssMapDB = KeyBackedMap<UID, UID>(tssMappingKeys.begin);
state int maxSkipTags = 1;
loop {
try {
state Future<RangeResult> fTagLocalities = tr.getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<Optional<Value>> fv = tr.get(serverListKeyFor(server.id()));
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state Future<Optional<Value>> fExclProc = tr.get(
// FIXME: don't fetch tag localities, all tags, and history tags if tss. Just fetch pair's tag
state Future<RangeResult> fTagLocalities = tr->getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<Optional<Value>> fv = tr->get(serverListKeyFor(server.id()));
state Future<Optional<Value>> fExclProc = tr->get(
StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip, server.address().port))));
state Future<Optional<Value>> fExclIP =
tr.get(StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip))));
state Future<Optional<Value>> fFailProc =
tr.get(StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip, server.address().port))));
tr->get(StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip))));
state Future<Optional<Value>> fFailProc = tr->get(
StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip, server.address().port))));
state Future<Optional<Value>> fFailIP =
tr.get(StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip))));
tr->get(StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip))));
state Future<Optional<Value>> fExclProc2 =
server.secondaryAddress().present()
? tr.get(StringRef(encodeExcludedServersKey(
? tr->get(StringRef(encodeExcludedServersKey(
AddressExclusion(server.secondaryAddress().get().ip, server.secondaryAddress().get().port))))
: Future<Optional<Value>>(Optional<Value>());
state Future<Optional<Value>> fExclIP2 =
server.secondaryAddress().present()
? tr.get(StringRef(encodeExcludedServersKey(AddressExclusion(server.secondaryAddress().get().ip))))
? tr->get(StringRef(encodeExcludedServersKey(AddressExclusion(server.secondaryAddress().get().ip))))
: Future<Optional<Value>>(Optional<Value>());
state Future<Optional<Value>> fFailProc2 =
server.secondaryAddress().present()
? tr.get(StringRef(encodeFailedServersKey(
? tr->get(StringRef(encodeFailedServersKey(
AddressExclusion(server.secondaryAddress().get().ip, server.secondaryAddress().get().port))))
: Future<Optional<Value>>(Optional<Value>());
state Future<Optional<Value>> fFailIP2 =
server.secondaryAddress().present()
? tr.get(StringRef(encodeFailedServersKey(AddressExclusion(server.secondaryAddress().get().ip))))
? tr->get(StringRef(encodeFailedServersKey(AddressExclusion(server.secondaryAddress().get().ip))))
: Future<Optional<Value>>(Optional<Value>());
state Future<RangeResult> fTags = tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY, true);
state Future<RangeResult> fHistoryTags = tr.getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY, true);
state Future<RangeResult> fTags = tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY, true);
state Future<RangeResult> fHistoryTags = tr->getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY, true);
wait(success(fTagLocalities) && success(fv) && success(fTags) && success(fHistoryTags) &&
success(fExclProc) && success(fExclIP) && success(fFailProc) && success(fFailIP) &&
@ -914,63 +1049,90 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer(Database cx, StorageServe
if (fTagLocalities.get().more || fTags.get().more || fHistoryTags.get().more)
ASSERT(false);
int8_t maxTagLocality = 0;
state int8_t locality = -1;
for (auto& kv : fTagLocalities.get()) {
int8_t loc = decodeTagLocalityListValue(kv.value);
if (decodeTagLocalityListKey(kv.key) == server.locality.dcId()) {
locality = loc;
break;
}
maxTagLocality = std::max(maxTagLocality, loc);
}
if (locality == -1) {
locality = maxTagLocality + 1;
if (locality < 0)
throw recruitment_failed();
tr.set(tagLocalityListKeyFor(server.locality.dcId()), tagLocalityListValue(locality));
}
int skipTags = deterministicRandom()->randomInt(0, maxSkipTags);
state uint16_t tagId = 0;
std::vector<uint16_t> usedTags;
for (auto& it : fTags.get()) {
Tag t = decodeServerTagValue(it.value);
if (t.locality == locality) {
usedTags.push_back(t.id);
}
}
for (auto& it : fHistoryTags.get()) {
Tag t = decodeServerTagValue(it.value);
if (t.locality == locality) {
usedTags.push_back(t.id);
}
}
std::sort(usedTags.begin(), usedTags.end());
int usedIdx = 0;
for (; usedTags.size() > 0 && tagId <= usedTags.end()[-1]; tagId++) {
if (tagId < usedTags[usedIdx]) {
if (skipTags == 0)
state Tag tag;
if (server.isTss()) {
bool foundTag = false;
for (auto& it : fTags.get()) {
UID key = decodeServerTagKey(it.key);
if (key == server.tssPairID.get()) {
tag = decodeServerTagValue(it.value);
foundTag = true;
break;
skipTags--;
} else {
usedIdx++;
}
}
if (!foundTag) {
throw recruitment_failed();
}
tssMapDB.set(tr, server.tssPairID.get(), server.id());
} else {
int8_t maxTagLocality = 0;
state int8_t locality = -1;
for (auto& kv : fTagLocalities.get()) {
int8_t loc = decodeTagLocalityListValue(kv.value);
if (decodeTagLocalityListKey(kv.key) == server.locality.dcId()) {
locality = loc;
break;
}
maxTagLocality = std::max(maxTagLocality, loc);
}
if (locality == -1) {
locality = maxTagLocality + 1;
if (locality < 0) {
throw recruitment_failed();
}
tr->set(tagLocalityListKeyFor(server.locality.dcId()), tagLocalityListValue(locality));
}
int skipTags = deterministicRandom()->randomInt(0, maxSkipTags);
state uint16_t tagId = 0;
std::vector<uint16_t> usedTags;
for (auto& it : fTags.get()) {
Tag t = decodeServerTagValue(it.value);
if (t.locality == locality) {
usedTags.push_back(t.id);
}
}
for (auto& it : fHistoryTags.get()) {
Tag t = decodeServerTagValue(it.value);
if (t.locality == locality) {
usedTags.push_back(t.id);
}
}
std::sort(usedTags.begin(), usedTags.end());
int usedIdx = 0;
for (; usedTags.size() > 0 && tagId <= usedTags.end()[-1]; tagId++) {
if (tagId < usedTags[usedIdx]) {
if (skipTags == 0)
break;
skipTags--;
} else {
usedIdx++;
}
}
tagId += skipTags;
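// Worked example (illustrative): with usedTags = {0, 1, 3} and skipTags = 0 the loop breaks at
// the first gap, tagId = 2; with skipTags = 1 it passes that gap and exits at tagId = 4, the
// next free id past the in-use range (any leftover skips are added by the += above).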
tag = Tag(locality, tagId);
tr->set(serverTagKeyFor(server.id()), serverTagValue(tag));
KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(tag));
tr->addReadConflictRange(conflictRange);
tr->addWriteConflictRange(conflictRange);
if (SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING) {
// THIS SHOULD NEVER BE ENABLED IN ANY NON-TESTING ENVIRONMENT
TraceEvent(SevError, "TSSIdentityMappingEnabled");
tssMapDB.set(tr, server.id(), server.id());
}
}
tagId += skipTags;
state Tag tag(locality, tagId);
tr.set(serverTagKeyFor(server.id()), serverTagValue(tag));
tr.set(serverListKeyFor(server.id()), serverListValue(server));
KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(tag));
tr.addReadConflictRange(conflictRange);
tr.addWriteConflictRange(conflictRange);
wait(tr.commit());
return std::make_pair(tr.getCommittedVersion(), tag);
tr->set(serverListKeyFor(server.id()), serverListValue(server));
wait(tr->commit());
return std::make_pair(tr->getCommittedVersion(), tag);
} catch (Error& e) {
if (e.code() == error_code_commit_unknown_result)
throw recruitment_failed(); // There is a remote possibility that we successfully added ourselves and
@ -980,12 +1142,12 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer(Database cx, StorageServe
maxSkipTags = SERVER_KNOBS->MAX_SKIP_TAGS;
}
wait(tr.onError(e));
wait(tr->onError(e));
}
}
}
// A SS can be removed only if all data (shards) on the SS have been moved away from the SS.
ACTOR Future<bool> canRemoveStorageServer(Transaction* tr, UID serverID) {
ACTOR Future<bool> canRemoveStorageServer(Reference<ReadYourWritesTransaction> tr, UID serverID) {
RangeResult keys = wait(krmGetRanges(tr, serverKeysPrefixFor(serverID), allKeys, 2));
ASSERT(keys.size() >= 2);
@ -1005,34 +1167,37 @@ ACTOR Future<bool> canRemoveStorageServer(Transaction* tr, UID serverID) {
ACTOR Future<Void> removeStorageServer(Database cx,
UID serverID,
Optional<UID> tssPairID,
MoveKeysLock lock,
const DDEnabledState* ddEnabledState) {
state Transaction tr(cx);
state KeyBackedMap<UID, UID> tssMapDB = KeyBackedMap<UID, UID>(tssMappingKeys.begin);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
state bool retry = false;
state int noCanRemoveCount = 0;
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
wait(checkMoveKeysLock(&tr, lock, ddEnabledState));
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
wait(checkMoveKeysLock(&(tr->getTransaction()), lock, ddEnabledState));
TraceEvent("RemoveStorageServerLocked")
.detail("ServerID", serverID)
.detail("Version", tr.getReadVersion().get());
.detail("Version", tr->getReadVersion().get());
state bool canRemove = wait(canRemoveStorageServer(&tr, serverID));
state bool canRemove = wait(canRemoveStorageServer(tr, serverID));
if (!canRemove) {
TEST(true); // The caller had a transaction in flight that assigned keys to the server. Wait for it to
// reverse its mistake.
TraceEvent(SevWarn, "NoCanRemove").detail("Count", noCanRemoveCount++).detail("ServerID", serverID);
wait(delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::DataDistributionLaunch));
tr.reset();
tr->reset();
TraceEvent("RemoveStorageServerRetrying").detail("CanRemove", canRemove);
} else {
state Future<Optional<Value>> fListKey = tr.get(serverListKeyFor(serverID));
state Future<RangeResult> fTags = tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fHistoryTags = tr.getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fTagLocalities = tr.getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fTLogDatacenters = tr.getRange(tLogDatacentersKeys, CLIENT_KNOBS->TOO_MANY);
state Future<Optional<Value>> fListKey = tr->get(serverListKeyFor(serverID));
state Future<RangeResult> fTags = tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fHistoryTags = tr->getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fTagLocalities = tr->getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fTLogDatacenters = tr->getRange(tLogDatacentersKeys, CLIENT_KNOBS->TOO_MANY);
wait(success(fListKey) && success(fTags) && success(fHistoryTags) && success(fTagLocalities) &&
success(fTLogDatacenters));
@ -1072,22 +1237,32 @@ ACTOR Future<Void> removeStorageServer(Database cx,
if (locality >= 0 && !allLocalities.count(locality)) {
for (auto& it : fTagLocalities.get()) {
if (locality == decodeTagLocalityListValue(it.value)) {
tr.clear(it.key);
tr->clear(it.key);
break;
}
}
}
tr.clear(serverListKeyFor(serverID));
tr.clear(serverTagKeyFor(serverID));
tr.clear(serverTagHistoryRangeFor(serverID));
tr->clear(serverListKeyFor(serverID));
tr->clear(serverTagKeyFor(serverID)); // A tss uses this to communicate shutdown but it never has a
// server tag key set in the first place
tr->clear(serverTagHistoryRangeFor(serverID));
if (SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING) {
// THIS SHOULD NEVER BE ENABLED IN ANY NON-TESTING ENVIRONMENT
TraceEvent(SevError, "TSSIdentityMappingEnabled");
tssMapDB.erase(tr, serverID);
} else if (tssPairID.present()) {
tssMapDB.erase(tr, tssPairID.get());
}
retry = true;
wait(tr.commit());
wait(tr->commit());
return Void();
}
} catch (Error& e) {
state Error err = e;
wait(tr.onError(e));
wait(tr->onError(e));
TraceEvent("RemoveStorageServerRetrying").error(err);
}
}
@ -1180,11 +1355,20 @@ ACTOR Future<Void> moveKeys(Database cx,
const DDEnabledState* ddEnabledState) {
ASSERT(destinationTeam.size());
std::sort(destinationTeam.begin(), destinationTeam.end());
wait(startMoveKeys(
cx, keys, destinationTeam, lock, startMoveKeysParallelismLock, relocationIntervalId, ddEnabledState));
state std::map<UID, StorageServerInterface> tssMapping;
wait(startMoveKeys(cx,
keys,
destinationTeam,
lock,
startMoveKeysParallelismLock,
relocationIntervalId,
&tssMapping,
ddEnabledState));
state Future<Void> completionSignaller =
checkFetchingState(cx, healthyDestinations, keys, dataMovementComplete, relocationIntervalId);
checkFetchingState(cx, healthyDestinations, keys, dataMovementComplete, relocationIntervalId, tssMapping);
wait(finishMoveKeys(cx,
keys,
@ -1193,6 +1377,7 @@ ACTOR Future<Void> moveKeys(Database cx,
finishMoveKeysParallelismLock,
hasRemote,
relocationIntervalId,
tssMapping,
ddEnabledState));
// This is defensive, but make sure that we always say that the movement is complete before moveKeys completes
@ -1228,6 +1413,13 @@ void seedShardServers(Arena& arena, CommitTransactionRef& tr, vector<StorageServ
for (auto& s : servers) {
tr.set(arena, serverTagKeyFor(s.id()), serverTagValue(server_tag[s.id()]));
tr.set(arena, serverListKeyFor(s.id()), serverListValue(s));
if (SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING) {
// THIS SHOULD NEVER BE ENABLED IN ANY NON-TESTING ENVIRONMENT
TraceEvent(SevError, "TSSIdentityMappingEnabled");
// hack key-backed map here since we can't really change CommitTransactionRef to a RYW transaction
Key uidRef = Codec<UID>::pack(s.id()).pack();
tr.set(arena, uidRef.withPrefix(tssMappingKeys.begin), uidRef);
}
}
std::vector<Tag> serverTags;

View File

@ -89,13 +89,14 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer(Database cx, StorageServe
ACTOR Future<Void> removeStorageServer(Database cx,
UID serverID,
Optional<UID> tssPairID, // if serverID is a tss, set to its ss pair id
MoveKeysLock lock,
const DDEnabledState* ddEnabledState);
// Removes the given storage server permanently from the database. It must already
// have no shards assigned to it. The storage server MUST NOT be added again after this
// (though a new storage server with a new unique ID may be recruited from the same fdbserver).
ACTOR Future<bool> canRemoveStorageServer(Transaction* tr, UID serverID);
ACTOR Future<bool> canRemoveStorageServer(Reference<ReadYourWritesTransaction> tr, UID serverID);
// Returns true if the given storage server has no keys assigned to it and may be safely removed
// Obviously that could change later!
ACTOR Future<Void> removeKeysFromFailedServer(Database cx,

View File

@ -1498,10 +1498,10 @@ ACTOR Future<Void> doQueueCommit(TLogData* self,
ACTOR Future<Void> commitQueue(TLogData* self) {
state Reference<LogData> logData;
state std::vector<Reference<LogData>> missingFinalCommit;
loop {
int foundCount = 0;
state std::vector<Reference<LogData>> missingFinalCommit;
for (auto it : self->id_data) {
if (!it.second->stopped) {
logData = it.second;
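A hedged note on the state-variable move in this hunk (the same fix is applied to the other TLog versions below):

// In ACTOR-compiled code every `state` variable is hoisted into the generated
// actor object, so a declaration inside `loop` does not create a fresh vector
// per iteration -- it merely re-runs the initializer on the same member each
// time around. Declaring it once, above the loop, makes the intended
// initialize-once lifetime explicit.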

View File

@ -1925,10 +1925,10 @@ ACTOR Future<Void> doQueueCommit(TLogData* self,
ACTOR Future<Void> commitQueue(TLogData* self) {
state Reference<LogData> logData;
state std::vector<Reference<LogData>> missingFinalCommit;
loop {
int foundCount = 0;
state std::vector<Reference<LogData>> missingFinalCommit;
for (auto it : self->id_data) {
if (!it.second->stopped) {
logData = it.second;

View File

@ -158,6 +158,7 @@ struct ProxyCommitData {
EventMetricHandle<SingleKeyMutation> singleKeyMutationEvent;
std::map<UID, Reference<StorageInfo>> storageCache;
std::unordered_map<UID, StorageServerInterface> tssMapping;
std::map<Tag, Version> tag_popped;
Deque<std::pair<Version, Version>> txsPopVersions;
Version lastTxsPop;

View File

@ -308,9 +308,13 @@ ACTOR Future<int64_t> getMaxStorageServerQueueSize(Database cx, Reference<AsyncV
.detail("SS", servers[i].id());
throw attribute_not_found();
}
messages.push_back(timeoutError(itr->second.eventLogRequest.getReply(
EventLogRequest(StringRef(servers[i].id().toString() + "/StorageMetrics"))),
1.0));
// Ignore TSS in add delay mode since it can purposefully freeze forever
if (!servers[i].isTss() || !g_network->isSimulated() ||
g_simulator.tssMode != ISimulator::TSSMode::EnabledAddDelay) {
messages.push_back(timeoutError(itr->second.eventLogRequest.getReply(EventLogRequest(
StringRef(servers[i].id().toString() + "/StorageMetrics"))),
1.0));
}
}
wait(waitForAll(messages));
@ -516,7 +520,15 @@ ACTOR Future<bool> getStorageServersRecruiting(Database cx, WorkerInterface dist
1.0));
TraceEvent("StorageServersRecruiting").detail("Message", recruitingMessage.toString());
return recruitingMessage.getValue("State") == "Recruiting";
if (recruitingMessage.getValue("State") == "Recruiting") {
std::string tssValue;
// TSS recruitment does not count as recruiting: it can legitimately block indefinitely when only 1 storage process is free
if (!recruitingMessage.tryGetValue("IsTSS", tssValue) || tssValue == "False") {
return true;
}
}
return false;
} catch (Error& e) {
TraceEvent("QuietDatabaseFailure", distributorWorker.id())
.detail("Reason", "Failed to extract StorageServersRecruiting")

View File

@ -719,9 +719,11 @@ ACTOR Future<Void> trackEachStorageServer(
when(state std::pair<UID, Optional<StorageServerInterface>> change = waitNext(serverChanges)) {
wait(delay(0)); // prevent storageServerTracker from getting cancelled while on the call stack
if (change.second.present()) {
auto& a = actors[change.first];
a = Future<Void>();
a = splitError(trackStorageServerQueueInfo(self, change.second.get()), err);
if (!change.second.get().isTss()) {
auto& a = actors[change.first];
a = Future<Void>();
a = splitError(trackStorageServerQueueInfo(self, change.second.get()), err);
}
} else
actors.erase(change.first);
}

View File

@ -22,6 +22,7 @@
#include <fstream>
#include <ostream>
#include <sstream>
#include <toml.hpp>
#include "fdbrpc/Locality.h"
#include "fdbrpc/simulator.h"
#include "fdbclient/DatabaseContext.h"
@ -37,8 +38,8 @@
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/versions.h"
#include "flow/ProtocolVersion.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#include "flow/network.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#undef max
#undef min
@ -46,10 +47,236 @@
extern "C" int g_expect_full_pointermap;
extern const char* getSourceVersion();
using namespace std::literals;
const int MACHINE_REBOOT_TIME = 10;
bool destructed = false;
// Configuration details specified in workload test files that change the simulation
// environment details
class TestConfig {
class ConfigBuilder {
using value_type = toml::basic_value<toml::discard_comments>;
std::unordered_map<std::string_view, std::function<void(value_type const&)>> confMap;
public:
ConfigBuilder& add(std::string_view key, int* value) {
confMap.emplace(key, [value](value_type const& v) { *value = v.as_integer(); });
return *this;
}
ConfigBuilder& add(std::string_view key, Optional<int>* value) {
confMap.emplace(key, [value](value_type const& v) { *value = v.as_integer(); });
return *this;
}
ConfigBuilder& add(std::string_view key, bool* value) {
confMap.emplace(key, [value](value_type const& v) { *value = v.as_boolean(); });
return *this;
}
ConfigBuilder& add(std::string_view key, Optional<bool>* value) {
confMap.emplace(key, [value](value_type const& v) { *value = v.as_boolean(); });
return *this;
}
ConfigBuilder& add(std::string_view key, std::string* value) {
confMap.emplace(key, [value](value_type const& v) { *value = v.as_string(); });
return *this;
}
ConfigBuilder& add(std::string_view key, Optional<std::string>* value) {
confMap.emplace(key, [value](value_type const& v) { *value = v.as_string(); });
return *this;
}
ConfigBuilder& add(std::string_view key, std::vector<int>* value) {
confMap.emplace(key, [value](value_type const& v) {
auto arr = v.as_array();
for (const auto& i : arr) {
value->push_back(i.as_integer());
}
});
return *this;
}
void set(std::string const& key, value_type const& val) {
auto iter = confMap.find(key);
if (iter == confMap.end()) {
std::cerr << "Unknown configuration attribute " << key << std::endl;
TraceEvent("UnknownConfigurationAttribute").detail("Name", key);
throw unknown_error();
}
iter->second(val);
}
};
bool isIniFile(const char* fileName) {
std::string name = fileName;
auto pos = name.find_last_of('.');
ASSERT(pos != std::string::npos && pos + 1 < name.size());
auto extension = name.substr(pos + 1);
return extension == "txt"sv;
}
void loadIniFile(const char* testFile) {
std::ifstream ifs;
ifs.open(testFile, std::ifstream::in);
if (!ifs.good())
return;
std::string cline;
while (ifs.good()) {
getline(ifs, cline);
std::string line = removeWhitespace(std::string(cline));
if (!line.size() || line.find(';') == 0)
continue;
size_t found = line.find('=');
if (found == std::string::npos)
// no '=' on this line; skip it
continue;
std::string attrib = removeWhitespace(line.substr(0, found));
std::string value = removeWhitespace(line.substr(found + 1));
if (attrib == "extraDB") {
sscanf(value.c_str(), "%d", &extraDB);
}
if (attrib == "minimumReplication") {
sscanf(value.c_str(), "%d", &minimumReplication);
}
if (attrib == "minimumRegions") {
sscanf(value.c_str(), "%d", &minimumRegions);
}
if (attrib == "configureLocked") {
sscanf(value.c_str(), "%d", &configureLocked);
}
if (attrib == "startIncompatibleProcess") {
startIncompatibleProcess = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "logAntiQuorum") {
sscanf(value.c_str(), "%d", &logAntiQuorum);
}
if (attrib == "storageEngineExcludeTypes") {
std::stringstream ss(value);
for (int i; ss >> i;) {
storageEngineExcludeTypes.push_back(i);
if (ss.peek() == ',') {
ss.ignore();
}
}
}
if (attrib == "maxTLogVersion") {
sscanf(value.c_str(), "%d", &maxTLogVersion);
}
if (attrib == "restartInfoLocation") {
isFirstTestInRestart = true;
}
}
ifs.close();
}
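A hedged example of the legacy ".txt" format this parser accepts; the attribute names come from the branches above, the values are illustrative only:

; comments start with a semicolon
extraDB=1
minimumReplication=2
storageEngineExcludeTypes=2,3
maxTLogVersion=6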
public:
int extraDB = 0;
int minimumReplication = 0;
int minimumRegions = 0;
bool configureLocked = false;
bool startIncompatibleProcess = false;
int logAntiQuorum = -1;
bool isFirstTestInRestart = false;
// Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig
// 0 = "ssd"
// 1 = "memory"
// 2 = "memory-radixtree-beta"
// 3 = "ssd-redwood-experimental"
// Requires a comma-separated list of numbers WITHOUT whitespace
std::vector<int> storageEngineExcludeTypes;
// Set the maximum TLog version that can be selected for a test
// Refer to FDBTypes.h::TLogVersion. Defaults to the maximum supported version.
int maxTLogVersion = TLogVersion::MAX_SUPPORTED;
// Set true to simplify simulation configs for easier debugging
bool simpleConfig = false;
Optional<bool> generateFearless, buggify;
Optional<int> datacenters, desiredTLogCount, commitProxyCount, grvProxyCount, resolverCount, storageEngineType,
stderrSeverity, machineCount, processesPerMachine, coordinators;
Optional<std::string> config;
bool tomlKeyPresent(const toml::value& data, std::string key) {
if (data.is_table()) {
for (const auto& [k, v] : data.as_table()) {
if (k == key || tomlKeyPresent(v, key)) {
return true;
}
}
} else if (data.is_array()) {
for (const auto& v : data.as_array()) {
if (tomlKeyPresent(v, key)) {
return true;
}
}
}
return false;
}
void readFromConfig(const char* testFile) {
if (isIniFile(testFile)) {
loadIniFile(testFile);
return;
}
ConfigBuilder builder;
builder.add("extraDB", &extraDB)
.add("minimumReplication", &minimumReplication)
.add("minimumRegions", &minimumRegions)
.add("configureLocked", &configureLocked)
.add("startIncompatibleProcess", &startIncompatibleProcess)
.add("logAntiQuorum", &logAntiQuorum)
.add("storageEngineExcludeTypes", &storageEngineExcludeTypes)
.add("maxTLogVersion", &maxTLogVersion)
.add("simpleConfig", &simpleConfig)
.add("generateFearless", &generateFearless)
.add("datacenters", &datacenters)
.add("desiredTLogCount", &desiredTLogCount)
.add("commitProxyCount", &commitProxyCount)
.add("grvProxyCount", &grvProxyCount)
.add("resolverCount", &resolverCount)
.add("storageEngineType", &storageEngineType)
.add("config", &config)
.add("buggify", &buggify)
.add("StderrSeverity", &stderrSeverity)
.add("machineCount", &machineCount)
.add("processesPerMachine", &processesPerMachine)
.add("coordinators", &coordinators);
try {
auto file = toml::parse(testFile);
if (file.contains("configuration") && toml::find(file, "configuration").is_table()) {
auto conf = toml::find(file, "configuration").as_table();
for (const auto& [key, value] : conf) {
if (key == "ClientInfoLogging") {
setNetworkOption(FDBNetworkOptions::DISABLE_CLIENT_STATISTICS_LOGGING);
} else if (key == "restartInfoLocation") {
isFirstTestInRestart = true;
} else {
builder.set(key, value);
}
}
if (stderrSeverity.present()) {
TraceEvent("StderrSeverity").detail("NewSeverity", stderrSeverity.get());
}
}
// look for restartInfoLocation to mark isFirstTestInRestart
if (!isFirstTestInRestart) {
isFirstTestInRestart = tomlKeyPresent(file, "restartInfoLocation");
}
} catch (std::exception& e) {
std::cerr << e.what() << std::endl;
TraceEvent("TOMLParseError").detail("Error", printable(e.what()));
throw unknown_error();
}
}
};
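For the new path, a hypothetical TOML test file would carry a [configuration] table whose keys match the ConfigBuilder registrations in readFromConfig; for example (all values illustrative):

[configuration]
simpleConfig = true
machineCount = 9
coordinators = 3
storageEngineExcludeTypes = [2, 3]

The dispatch is purely on file extension: isIniFile routes ".txt" files to loadIniFile, and everything else goes through toml::parse.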
template <class T>
T simulate(const T& in) {
BinaryWriter writer(AssumeVersion(g_network->protocolVersion()));
@ -507,8 +734,8 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
// Copy the file pointers to a vector because the map may be modified while we are killing files
std::vector<AsyncFileNonDurable*> files;
for (auto fileItr = machineCache.begin(); fileItr != machineCache.end(); ++fileItr) {
ASSERT(fileItr->second.isReady());
files.push_back((AsyncFileNonDurable*)fileItr->second.get().getPtr());
ASSERT(fileItr->second.get().isReady());
files.push_back((AsyncFileNonDurable*)fileItr->second.get().get().getPtr());
}
std::vector<Future<Void>> killFutures;
@ -524,7 +751,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
for (auto it : machineCache) {
filenames.insert(it.first);
closingStr += it.first + ", ";
ASSERT(it.second.isReady() && !it.second.isError());
ASSERT(it.second.get().canGet());
}
for (auto it : g_simulator.getMachineById(localities.machineId())->deletingFiles) {
@ -885,31 +1112,59 @@ StringRef StringRefOf(const char* s) {
// of different combinations
void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
set_config("new");
const bool simple = false; // Set true to simplify simulation configs for easier debugging
// generateMachineTeamTestConfig sets up the number of servers per machine and the number of machines such that,
// if we do not remove the surplus server and machine teams, the simulation test will report an error.
// This is needed to make sure the number of server (and machine) teams is no larger than the desired number.
bool generateMachineTeamTestConfig = BUGGIFY_WITH_PROB(0.1) ? true : false;
bool generateFearless = simple ? false : (testConfig.minimumRegions > 1 || deterministicRandom()->random01() < 0.5);
datacenters = simple ? 1
: (generateFearless
? (testConfig.minimumReplication > 0 || deterministicRandom()->random01() < 0.5 ? 4 : 6)
: deterministicRandom()->randomInt(1, 4));
if (deterministicRandom()->random01() < 0.25)
db.desiredTLogCount = deterministicRandom()->randomInt(1, 7);
if (deterministicRandom()->random01() < 0.25)
db.commitProxyCount = deterministicRandom()->randomInt(1, 7);
if (deterministicRandom()->random01() < 0.25)
db.grvProxyCount = deterministicRandom()->randomInt(1, 4);
if (deterministicRandom()->random01() < 0.25)
db.resolverCount = deterministicRandom()->randomInt(1, 7);
int storage_engine_type = deterministicRandom()->randomInt(0, 4);
// Continuously re-pick the storage engine type if it's the one we want to exclude
while (std::find(testConfig.storageEngineExcludeTypes.begin(),
testConfig.storageEngineExcludeTypes.end(),
storage_engine_type) != testConfig.storageEngineExcludeTypes.end()) {
storage_engine_type = deterministicRandom()->randomInt(0, 4);
bool generateFearless =
testConfig.simpleConfig ? false : (testConfig.minimumRegions > 1 || deterministicRandom()->random01() < 0.5);
if (testConfig.generateFearless.present()) {
// overwrite whatever decision we made before
generateFearless = testConfig.generateFearless.get();
}
datacenters =
testConfig.simpleConfig
? 1
: (generateFearless ? (testConfig.minimumReplication > 0 || deterministicRandom()->random01() < 0.5 ? 4 : 6)
: deterministicRandom()->randomInt(1, 4));
if (testConfig.datacenters.present()) {
datacenters = testConfig.datacenters.get();
}
if (testConfig.desiredTLogCount.present()) {
db.desiredTLogCount = testConfig.desiredTLogCount.get();
} else if (deterministicRandom()->random01() < 0.25) {
db.desiredTLogCount = deterministicRandom()->randomInt(1, 7);
}
if (testConfig.commitProxyCount.present()) {
db.commitProxyCount = testConfig.commitProxyCount.get();
} else if (deterministicRandom()->random01() < 0.25) {
db.commitProxyCount = deterministicRandom()->randomInt(1, 7);
}
if (testConfig.grvProxyCount.present()) {
db.grvProxyCount = testConfig.grvProxyCount.get();
} else if (deterministicRandom()->random01() < 0.25) {
db.grvProxyCount = deterministicRandom()->randomInt(1, 4);
}
if (testConfig.resolverCount.present()) {
db.resolverCount = testConfig.resolverCount.get();
} else if (deterministicRandom()->random01() < 0.25) {
db.resolverCount = deterministicRandom()->randomInt(1, 7);
}
int storage_engine_type = deterministicRandom()->randomInt(0, 4);
if (testConfig.storageEngineType.present()) {
storage_engine_type = testConfig.storageEngineType.get();
} else {
// Continuously re-pick the storage engine type if it's the one we want to exclude
while (std::find(testConfig.storageEngineExcludeTypes.begin(),
testConfig.storageEngineExcludeTypes.end(),
storage_engine_type) != testConfig.storageEngineExcludeTypes.end()) {
storage_engine_type = deterministicRandom()->randomInt(0, 4);
}
}
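A hedged note on the re-pick loop above:

// This is rejection sampling over {0, 1, 2, 3}: a test file that excluded all
// four storage engine types would spin here forever, so exclusion lists are
// expected to leave at least one type available.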
switch (storage_engine_type) {
case 0: {
TEST(true); // Simulated cluster using ssd storage engine
@ -934,6 +1189,13 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
default:
ASSERT(false); // Programmer forgot to adjust cases.
}
int tssCount = 0;
if (!testConfig.simpleConfig && deterministicRandom()->random01() < 0.25) {
// 1 or 2 tss
tssCount = deterministicRandom()->randomInt(1, 3);
}
// if (deterministicRandom()->random01() < 0.5) {
// set_config("ssd");
// } else {
@ -941,75 +1203,81 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
// }
// set_config("memory");
// set_config("memory-radixtree-beta");
if (simple) {
if (testConfig.simpleConfig) {
db.desiredTLogCount = 1;
db.commitProxyCount = 1;
db.grvProxyCount = 1;
db.resolverCount = 1;
}
int replication_type = simple ? 1
: (std::max(testConfig.minimumReplication,
datacenters > 4 ? deterministicRandom()->randomInt(1, 3)
: std::min(deterministicRandom()->randomInt(0, 6), 3)));
switch (replication_type) {
case 0: {
TEST(true); // Simulated cluster using custom redundancy mode
int storage_servers = deterministicRandom()->randomInt(1, generateFearless ? 4 : 5);
// FIXME: log replicas must be more than storage replicas because otherwise betterMasterExists will not
// recognize it needs to change DCs
int replication_factor = deterministicRandom()->randomInt(storage_servers, generateFearless ? 4 : 5);
int anti_quorum = deterministicRandom()->randomInt(
0,
(replication_factor / 2) + 1); // The anti quorum cannot be more than half of the replication factor, or the
// log system will continue to accept commits when a recovery is impossible
// Go through buildConfiguration, as it sets tLogPolicy/storagePolicy.
set_config(format("storage_replicas:=%d log_replicas:=%d log_anti_quorum:=%d "
"replica_datacenters:=1 min_replica_datacenters:=1",
storage_servers,
replication_factor,
anti_quorum));
break;
}
case 1: {
TEST(true); // Simulated cluster running in single redundancy mode
set_config("single");
break;
}
case 2: {
TEST(true); // Simulated cluster running in double redundancy mode
set_config("double");
break;
}
case 3: {
if (datacenters <= 2 || generateFearless) {
TEST(true); // Simulated cluster running in triple redundancy mode
set_config("triple");
} else if (datacenters == 3) {
TEST(true); // Simulated cluster running in 3 data-hall mode
set_config("three_data_hall");
} else {
ASSERT(false);
}
break;
}
default:
ASSERT(false); // Programmer forgot to adjust cases.
}
if (deterministicRandom()->random01() < 0.5) {
int logSpill = deterministicRandom()->randomInt(TLogSpillType::VALUE, TLogSpillType::END);
set_config(format("log_spill:=%d", logSpill));
int logVersion = deterministicRandom()->randomInt(TLogVersion::MIN_RECRUITABLE, testConfig.maxTLogVersion + 1);
set_config(format("log_version:=%d", logVersion));
int replication_type = testConfig.simpleConfig
? 1
: (std::max(testConfig.minimumReplication,
datacenters > 4 ? deterministicRandom()->randomInt(1, 3)
: std::min(deterministicRandom()->randomInt(0, 6), 3)));
if (testConfig.config.present()) {
set_config(testConfig.config.get());
} else {
if (deterministicRandom()->random01() < 0.7)
set_config(format("log_version:=%d", testConfig.maxTLogVersion));
if (deterministicRandom()->random01() < 0.5)
set_config(format("log_spill:=%d", TLogSpillType::DEFAULT));
}
switch (replication_type) {
case 0: {
TEST(true); // Simulated cluster using custom redundancy mode
int storage_servers = deterministicRandom()->randomInt(1, generateFearless ? 4 : 5);
// FIXME: log replicas must be more than storage replicas because otherwise betterMasterExists will not
// recognize it needs to change DCs
int replication_factor = deterministicRandom()->randomInt(storage_servers, generateFearless ? 4 : 5);
int anti_quorum = deterministicRandom()->randomInt(
0,
(replication_factor / 2) +
1); // The anti quorum cannot be more than half of the replication factor, or the
// log system will continue to accept commits when a recovery is impossible
// Go through buildConfiguration, as it sets tLogPolicy/storagePolicy.
set_config(format("storage_replicas:=%d log_replicas:=%d log_anti_quorum:=%d "
"replica_datacenters:=1 min_replica_datacenters:=1",
storage_servers,
replication_factor,
anti_quorum));
break;
}
case 1: {
TEST(true); // Simulated cluster running in single redundancy mode
set_config("single");
break;
}
case 2: {
TEST(true); // Simulated cluster running in double redundancy mode
set_config("double");
break;
}
case 3: {
if (datacenters <= 2 || generateFearless) {
TEST(true); // Simulated cluster running in triple redundancy mode
set_config("triple");
} else if (datacenters == 3) {
TEST(true); // Simulated cluster running in 3 data-hall mode
set_config("three_data_hall");
} else {
ASSERT(false);
}
break;
}
default:
ASSERT(false); // Programmer forgot to adjust cases.
}
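A hedged arithmetic note on the anti-quorum bound in case 0 above: deterministicRandom()->randomInt(a, b) draws from the half-open range [a, b), so the draw never exceeds replication_factor / 2. For instance:

int anti_quorum = deterministicRandom()->randomInt(0, (4 / 2) + 1); // with replication_factor == 4: yields 0, 1, or 2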
if (deterministicRandom()->random01() < 0.5) {
int logSpill = deterministicRandom()->randomInt(TLogSpillType::VALUE, TLogSpillType::END);
set_config(format("log_spill:=%d", logSpill));
int logVersion =
deterministicRandom()->randomInt(TLogVersion::MIN_RECRUITABLE, testConfig.maxTLogVersion + 1);
set_config(format("log_version:=%d", logVersion));
} else {
if (deterministicRandom()->random01() < 0.7)
set_config(format("log_version:=%d", testConfig.maxTLogVersion));
if (deterministicRandom()->random01() < 0.5)
set_config(format("log_spill:=%d", TLogSpillType::DEFAULT));
}
if (deterministicRandom()->random01() < 0.5) {
set_config("backup_worker_enabled:=1");
if (deterministicRandom()->random01() < 0.5) {
set_config("backup_worker_enabled:=1");
}
}
if (generateFearless || (datacenters == 2 && deterministicRandom()->random01() < 0.5)) {
@ -1211,7 +1479,9 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
}
}
if (generateFearless && testConfig.minimumReplication > 1) {
if (testConfig.machineCount.present()) {
machine_count = testConfig.machineCount.get();
} else if (generateFearless && testConfig.minimumReplication > 1) {
// low latency tests in fearless configurations need 4 machines per datacenter (3 for triple replication, 1 that
// is down during failures).
machine_count = 16;
@ -1234,11 +1504,15 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
}
}
// because we protect a majority of coordinators from being killed, it is better to run with low numbers of
// coordinators to prevent too many processes from being protected
coordinators = (testConfig.minimumRegions <= 1 && BUGGIFY)
? deterministicRandom()->randomInt(1, std::max(machine_count, 2))
: 1;
if (testConfig.coordinators.present()) {
coordinators = testConfig.coordinators.get();
} else {
// because we protect a majority of coordinators from being killed, it is better to run with low numbers of
// coordinators to prevent too many processes from being protected
coordinators = (testConfig.minimumRegions <= 1 && BUGGIFY)
? deterministicRandom()->randomInt(1, std::max(machine_count, 2))
: 1;
}
if (testConfig.minimumReplication > 1 && datacenters == 3) {
// low latency tests in 3 data hall mode need 2 other data centers with 2 machines each to avoid waiting for
@ -1247,11 +1521,35 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) {
coordinators = 3;
}
if (generateFearless) {
if (testConfig.processesPerMachine.present()) {
processes_per_machine = testConfig.processesPerMachine.get();
} else if (generateFearless) {
processes_per_machine = 1;
} else {
processes_per_machine = deterministicRandom()->randomInt(1, (extraDB ? 14 : 28) / machine_count + 2);
}
// reduce tss to half of extra non-seed servers that can be recruited in usable regions.
tssCount =
std::max(0, std::min(tssCount, (db.usableRegions * (machine_count / datacenters) - replication_type) / 2));
if (!testConfig.config.present() && tssCount > 0) {
std::string confStr = format("tss_count:=%d tss_storage_engine:=%d", tssCount, db.storageServerStoreType);
set_config(confStr);
double tssRandom = deterministicRandom()->random01();
if (tssRandom > 0.5) {
// normal tss mode
g_simulator.tssMode = ISimulator::TSSMode::EnabledNormal;
} else if (tssRandom < 0.25 && !testConfig.isFirstTestInRestart) {
// fault injection - don't enable in the first test of a restart pair because the second test won't know the
// data loss was intentional
g_simulator.tssMode = ISimulator::TSSMode::EnabledDropMutations;
} else {
// delay injection
g_simulator.tssMode = ISimulator::TSSMode::EnabledAddDelay;
}
printf("enabling tss for simulation in mode %d: %s\n", g_simulator.tssMode, confStr.c_str());
}
}
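A hedged worked example of the tssCount clamp above, with illustrative values:

#include <algorithm>

int main() {
	// One usable region, 10 machines in one datacenter, triple replication,
	// and a randomly drawn tssCount of 2 (the code above draws 1 or 2).
	int usableRegions = 1, machine_count = 10, datacenters = 1, replication_type = 3;
	int tssCount = 2;
	tssCount = std::max(0, std::min(tssCount, (usableRegions * (machine_count / datacenters) - replication_type) / 2));
	// (1 * 10 - 3) / 2 == 3, so tssCount stays 2; with machine_count == 4 the
	// bound is (4 - 3) / 2 == 0 and TSS is disabled entirely.
	return tssCount == 2 ? 0 : 1;
}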
// Configures the system according to the given specifications in order to run
@ -1275,6 +1573,9 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors,
startingConfigString += " locked";
}
for (auto kv : startingConfigJSON) {
if ("tss_storage_engine" == kv.first) {
continue;
}
startingConfigString += " ";
if (kv.second.type() == json_spirit::int_type) {
startingConfigString += kv.first + ":=" + format("%d", kv.second.get_int());
@ -1289,6 +1590,12 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors,
}
}
// handle tss_storage_engine separately because the passthrough needs the enum ordinal, but it's serialized to json
// as the string name
if (simconfig.db.desiredTSSCount > 0) {
startingConfigString += format(" tss_storage_engine:=%d", simconfig.db.testingStorageServerStoreType);
}
if (g_simulator.originalRegions != "") {
simconfig.set_config(g_simulator.originalRegions);
g_simulator.startingDisabledConfiguration = startingConfigString + " " + g_simulator.disableRemote;
@ -1363,6 +1670,7 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors,
TEST(!useIPv6); // Use IPv4
vector<NetworkAddress> coordinatorAddresses;
vector<NetworkAddress> extraCoordinatorAddresses; // Used by extra DB if the DR db is a new one
if (testConfig.minimumRegions > 1) {
// do not put coordinators in the primary region so that we can kill that region safely
int nonPrimaryDcs = dataCenters / 2;
@ -1372,6 +1680,9 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors,
auto ip = makeIPAddressForSim(useIPv6, { 2, dc, 1, m });
coordinatorAddresses.push_back(
NetworkAddress(ip, sslEnabled && !sslOnly ? 2 : 1, true, sslEnabled && sslOnly));
auto extraIp = makeIPAddressForSim(useIPv6, { 4, dc, 1, m });
extraCoordinatorAddresses.push_back(
NetworkAddress(extraIp, sslEnabled && !sslOnly ? 2 : 1, true, sslEnabled && sslOnly));
TraceEvent("SelectedCoordinator").detail("Address", coordinatorAddresses.back());
}
}
@ -1400,6 +1711,9 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors,
auto ip = makeIPAddressForSim(useIPv6, { 2, dc, 1, m });
coordinatorAddresses.push_back(
NetworkAddress(ip, sslEnabled && !sslOnly ? 2 : 1, true, sslEnabled && sslOnly));
auto extraIp = makeIPAddressForSim(useIPv6, { 4, dc, 1, m });
extraCoordinatorAddresses.push_back(
NetworkAddress(extraIp, sslEnabled && !sslOnly ? 2 : 1, true, sslEnabled && sslOnly));
TraceEvent("SelectedCoordinator")
.detail("Address", coordinatorAddresses.back())
.detail("M", m)
@ -1436,11 +1750,13 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors,
// If extraDB==0, leave g_simulator.extraDB as null because the test does not use DR.
if (testConfig.extraDB == 1) {
// The DR database can be either a new database or itself
g_simulator.extraDB = new ClusterConnectionString(
coordinatorAddresses, BUGGIFY ? LiteralStringRef("TestCluster:0") : LiteralStringRef("ExtraCluster:0"));
g_simulator.extraDB =
BUGGIFY ? new ClusterConnectionString(coordinatorAddresses, LiteralStringRef("TestCluster:0"))
: new ClusterConnectionString(extraCoordinatorAddresses, LiteralStringRef("ExtraCluster:0"));
} else if (testConfig.extraDB == 2) {
// The DR database is a new database
g_simulator.extraDB = new ClusterConnectionString(coordinatorAddresses, LiteralStringRef("ExtraCluster:0"));
g_simulator.extraDB =
new ClusterConnectionString(extraCoordinatorAddresses, LiteralStringRef("ExtraCluster:0"));
} else if (testConfig.extraDB == 3) {
// The DR database is the same database
g_simulator.extraDB = new ClusterConnectionString(coordinatorAddresses, LiteralStringRef("TestCluster:0"));
@ -1626,68 +1942,10 @@ void setupSimulatedSystem(vector<Future<Void>>* systemActors,
.detail("StartingConfiguration", pStartingConfiguration->toString());
}
using namespace std::literals;
// Populates the TestConfig fields according to what is found in the test file.
void checkTestConf(const char* testFile, TestConfig* testConfig) {
std::ifstream ifs;
ifs.open(testFile, std::ifstream::in);
if (!ifs.good())
return;
std::string cline;
while (ifs.good()) {
getline(ifs, cline);
std::string line = removeWhitespace(std::string(cline));
if (!line.size() || line.find(';') == 0)
continue;
size_t found = line.find('=');
if (found == std::string::npos)
// no '=' on this line; skip it
continue;
std::string attrib = removeWhitespace(line.substr(0, found));
std::string value = removeWhitespace(line.substr(found + 1));
if (attrib == "extraDB") {
sscanf(value.c_str(), "%d", &testConfig->extraDB);
}
if (attrib == "minimumReplication") {
sscanf(value.c_str(), "%d", &testConfig->minimumReplication);
}
if (attrib == "minimumRegions") {
sscanf(value.c_str(), "%d", &testConfig->minimumRegions);
}
if (attrib == "configureLocked") {
sscanf(value.c_str(), "%d", &testConfig->configureLocked);
}
if (attrib == "startIncompatibleProcess") {
testConfig->startIncompatibleProcess = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "logAntiQuorum") {
sscanf(value.c_str(), "%d", &testConfig->logAntiQuorum);
}
if (attrib == "storageEngineExcludeTypes") {
std::stringstream ss(value);
for (int i; ss >> i;) {
testConfig->storageEngineExcludeTypes.push_back(i);
if (ss.peek() == ',') {
ss.ignore();
}
}
}
if (attrib == "maxTLogVersion") {
sscanf(value.c_str(), "%d", &testConfig->maxTLogVersion);
}
}
ifs.close();
}
void checkTestConf(const char* testFile, TestConfig* testConfig) {}
ACTOR void setupAndRun(std::string dataFolder,
const char* testFile,
@ -1699,7 +1957,7 @@ ACTOR void setupAndRun(std::string dataFolder,
state Standalone<StringRef> startingConfiguration;
state int testerCount = 1;
state TestConfig testConfig;
checkTestConf(testFile, &testConfig);
testConfig.readFromConfig(testFile);
g_simulator.hasDiffProtocolProcess = testConfig.startIncompatibleProcess;
g_simulator.setDiffProtocol = false;

View File

@ -387,6 +387,19 @@ JsonBuilderObject getLagObject(int64_t versions) {
return lag;
}
static JsonBuilderObject getBounceImpactInfo(int recoveryStatusCode) {
JsonBuilderObject bounceImpact;
if (recoveryStatusCode == RecoveryStatus::fully_recovered) {
bounceImpact["can_clean_bounce"] = true;
} else {
bounceImpact["can_clean_bounce"] = false;
bounceImpact["reason"] = "cluster hasn't fully recovered yet";
}
return bounceImpact;
}
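For reference, the status fragment this helper emits in the not-yet-recovered case (hedged reconstruction from the code above):

// "bounce_impact": {
//     "can_clean_bounce": false,
//     "reason": "cluster hasn't fully recovered yet"
// }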
struct MachineMemoryInfo {
double memoryUsage;
double aggregateLimit;
@ -478,6 +491,8 @@ struct RolesInfo {
obj["mutation_bytes"] = StatusCounter(storageMetrics.getValue("MutationBytes")).getStatus();
obj["mutations"] = StatusCounter(storageMetrics.getValue("Mutations")).getStatus();
obj.setKeyRawNumber("local_rate", storageMetrics.getValue("LocalRate"));
obj["fetched_versions"] = StatusCounter(storageMetrics.getValue("FetchedVersions")).getStatus();
obj["fetches_from_logs"] = StatusCounter(storageMetrics.getValue("FetchesFromLogs")).getStatus();
Version version = storageMetrics.getInt64("Version");
Version durableVersion = storageMetrics.getInt64("DurableVersion");
@ -615,7 +630,7 @@ struct RolesInfo {
TraceEventFields const& commitLatencyBands = metrics.at("CommitLatencyBands");
if (commitLatencyBands.size()) {
obj["commit_latency_bands"] = addLatencyBandInfo(commitLatencyBands);
}
}
TraceEventFields const& commitBatchingWindowSize = metrics.at("CommitBatchingWindowSize");
if (commitBatchingWindowSize.size()) {
@ -1169,6 +1184,7 @@ ACTOR static Future<JsonBuilderObject> recoveryStateStatusFetcher(Database cx,
} else if (mStatusCode == RecoveryStatus::locking_old_transaction_servers) {
message["missing_logs"] = md.getValue("MissingIDs").c_str();
}
// TODO: time_in_recovery: 0.5
// time_in_state: 0.1
@ -1853,10 +1869,10 @@ ACTOR static Future<vector<std::pair<TLogInterface, EventMap>>> getTLogsAndMetri
ACTOR static Future<vector<std::pair<CommitProxyInterface, EventMap>>> getCommitProxiesAndMetrics(
Reference<AsyncVar<ServerDBInfo>> db,
std::unordered_map<NetworkAddress, WorkerInterface> address_workers) {
vector<std::pair<CommitProxyInterface, EventMap>> results =
wait(getServerMetrics(db->get().client.commitProxies,
address_workers,
std::vector<std::string>{ "CommitLatencyMetrics", "CommitLatencyBands", "CommitBatchingWindowSize"}));
vector<std::pair<CommitProxyInterface, EventMap>> results = wait(getServerMetrics(
db->get().client.commitProxies,
address_workers,
std::vector<std::string>{ "CommitLatencyMetrics", "CommitLatencyBands", "CommitBatchingWindowSize" }));
return results;
}
@ -1864,10 +1880,10 @@ ACTOR static Future<vector<std::pair<CommitProxyInterface, EventMap>>> getCommit
ACTOR static Future<vector<std::pair<GrvProxyInterface, EventMap>>> getGrvProxiesAndMetrics(
Reference<AsyncVar<ServerDBInfo>> db,
std::unordered_map<NetworkAddress, WorkerInterface> address_workers) {
vector<std::pair<GrvProxyInterface, EventMap>> results =
wait(getServerMetrics(db->get().client.grvProxies,
address_workers,
std::vector<std::string>{ "GRVLatencyMetrics", "GRVLatencyBands", "GRVBatchLatencyMetrics" }));
vector<std::pair<GrvProxyInterface, EventMap>> results = wait(
getServerMetrics(db->get().client.grvProxies,
address_workers,
std::vector<std::string>{ "GRVLatencyMetrics", "GRVLatencyBands", "GRVBatchLatencyMetrics" }));
return results;
}
@ -2775,6 +2791,7 @@ ACTOR Future<StatusReply> clusterGetStatus(
statusObj["protocol_version"] = format("%" PRIx64, g_network->protocolVersion().version());
statusObj["connection_string"] = coordinators.ccf->getConnectionString().toString();
statusObj["bounce_impact"] = getBounceImpactInfo(statusCode);
state Optional<DatabaseConfiguration> configuration;
state Optional<LoadConfigurationResult> loadResult;
@ -2988,6 +3005,14 @@ ACTOR Future<StatusReply> clusterGetStatus(
statusObj["incompatible_connections"] = incompatibleConnectionsArray;
statusObj["datacenter_lag"] = getLagObject(datacenterVersionDifference);
int activeTSSCount = 0;
for (auto& it : storageServers) {
if (it.first.isTss()) {
activeTSSCount++;
}
}
statusObj["active_tss_count"] = activeTSSCount;
int totalDegraded = 0;
for (auto& it : workers) {
if (it.degraded) {

View File

@ -1965,10 +1965,10 @@ ACTOR Future<Void> doQueueCommit(TLogData* self,
ACTOR Future<Void> commitQueue(TLogData* self) {
state Reference<LogData> logData;
state std::vector<Reference<LogData>> missingFinalCommit;
loop {
int foundCount = 0;
state std::vector<Reference<LogData>> missingFinalCommit;
for (auto it : self->id_data) {
if (!it.second->stopped) {
logData = it.second;

View File

@ -0,0 +1,72 @@
/*
* TSSMappingUtil.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/SystemData.h"
#include "fdbclient/KeyBackedTypes.h"
#include "fdbserver/TSSMappingUtil.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
ACTOR Future<Void> readTSSMappingRYW(Reference<ReadYourWritesTransaction> tr, std::map<UID, StorageServerInterface>* tssMapping) {
KeyBackedMap<UID, UID> tssMapDB = KeyBackedMap<UID, UID>(tssMappingKeys.begin);
state std::vector<std::pair<UID, UID>> uidMapping = wait(tssMapDB.getRange(tr, UID(), Optional<UID>(), CLIENT_KNOBS->TOO_MANY));
ASSERT(uidMapping.size() < CLIENT_KNOBS->TOO_MANY);
for (auto& it : uidMapping) {
state UID ssId = it.first;
Optional<Value> v = wait(tr->get(serverListKeyFor(it.second)));
(*tssMapping)[ssId] = decodeServerListValue(v.get());
}
return Void();
}
ACTOR Future<Void> readTSSMapping(Transaction* tr, std::map<UID, StorageServerInterface>* tssMapping) {
state RangeResult mappingList = wait(tr->getRange(tssMappingKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!mappingList.more && mappingList.size() < CLIENT_KNOBS->TOO_MANY);
for (auto& it : mappingList) {
state UID ssId = Codec<UID>::unpack(Tuple::unpack(it.key.removePrefix(tssMappingKeys.begin)));
UID tssId = Codec<UID>::unpack(Tuple::unpack(it.value));
Optional<Value> v = wait(tr->get(serverListKeyFor(tssId)));
(*tssMapping)[ssId] = decodeServerListValue(v.get());
}
return Void();
}
ACTOR Future<Void> removeTSSPairsFromCluster(Database cx, vector<std::pair<UID, UID>> pairsToRemove) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
state KeyBackedMap<UID, UID> tssMapDB = KeyBackedMap<UID, UID>(tssMappingKeys.begin);
loop {
try {
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
for (auto& tssPair : pairsToRemove) {
// DO NOT remove server list key - that'll break a bunch of stuff. DD will eventually call removeStorageServer
tr->clear(serverTagKeyFor(tssPair.second));
tssMapDB.erase(tr, tssPair.first);
}
wait(tr->commit());
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
return Void();
}
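A hedged usage sketch for the utilities above; exampleLogTSSMapping is a hypothetical caller, and the option choice mirrors removeTSSPairsFromCluster:

ACTOR Future<Void> exampleLogTSSMapping(Database cx) {
	state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
	state std::map<UID, StorageServerInterface> tssMapping;
	loop {
		try {
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tssMapping.clear(); // discard any partial result from a failed attempt
			wait(readTSSMappingRYW(tr, &tssMapping));
			break;
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
	for (auto& it : tssMapping) {
		TraceEvent("TSSMappingEntry").detail("SS", it.first).detail("TSS", it.second.id());
	}
	return Void();
}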

View File

@ -0,0 +1,48 @@
/*
* TSSMappingUtil.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source
// version.
#if defined(NO_INTELLISENSE) && !defined(TSS_MAPPING_UTIL_SERVER_G_H)
#define TSS_MAPPING_UTIL_SERVER_G_H
#include "fdbserver/TSSMappingUtil.actor.g.h"
#elif !defined(TSS_MAPPING_UTIL_SERVER_H)
#define TSS_MAPPING_UTIL_SERVER_H
#include "fdbclient/StorageServerInterface.h"
#include "flow/actorcompiler.h" // This must be the last #include.
/*
* Collection of utility functions for dealing with the TSS mapping
*/
// Reads the current cluster TSS mapping as part of the RYW transaction
ACTOR Future<Void> readTSSMappingRYW(Reference<ReadYourWritesTransaction> tr, std::map<UID, StorageServerInterface>* tssMapping);
// Reads the current cluster TSS mapping as part of the given Transaction
ACTOR Future<Void> readTSSMapping(Transaction* tr, std::map<UID, StorageServerInterface>* tssMapping);
// Removes the TSS pairs from the cluster
ACTOR Future<Void> removeTSSPairsFromCluster(Database cx, vector<std::pair<UID, UID>> pairsToRemove);
#include "flow/unactorcompiler.h"
#endif

View File

@ -100,27 +100,6 @@ struct WorkloadRequest {
}
};
// Configuration details specified in workload test files that change the simulation
// environment details
struct TestConfig {
int extraDB = 0;
int minimumReplication = 0;
int minimumRegions = 0;
int configureLocked = 0;
bool startIncompatibleProcess = false;
int logAntiQuorum = -1;
// Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig
// 0 = "ssd"
// 1 = "memory"
// 2 = "memory-radixtree-beta"
// 3 = "ssd-redwood-experimental"
// Requires a comma-separated list of numbers WITHOUT whitespace
std::vector<int> storageEngineExcludeTypes;
// Set the maximum TLog version that can be selected for a test
// Refer to FDBTypes.h::TLogVersion. Defaults to the maximum supported version.
int maxTLogVersion = TLogVersion::MAX_SUPPORTED;
};
struct TesterInterface {
constexpr static FileIdentifier file_identifier = 4465210;
RequestStream<WorkloadRequest> recruitments;

File diff suppressed because it is too large

View File

@ -614,11 +614,13 @@ struct InitializeStorageRequest {
UID reqId;
UID interfaceId;
KeyValueStoreType storeType;
Optional<std::pair<UID, Version>>
tssPairIDAndVersion; // Only set if recruiting a tss. Will be the UID and Version of its SS pair.
ReplyPromise<InitializeStorageReply> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, seedTag, reqId, interfaceId, storeType, reply);
serializer(ar, seedTag, reqId, interfaceId, storeType, reply, tssPairIDAndVersion);
}
};
@ -770,6 +772,7 @@ struct DiskStoreRequest {
struct Role {
static const Role WORKER;
static const Role STORAGE_SERVER;
static const Role TESTING_STORAGE_SERVER;
static const Role TRANSACTION_LOG;
static const Role SHARED_TRANSACTION_LOG;
static const Role COMMIT_PROXY;
@ -840,6 +843,7 @@ class IDiskQueue;
ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
StorageServerInterface ssi,
Tag seedTag,
Version tssSeedVersion,
ReplyPromise<InitializeStorageReply> recruitReply,
Reference<AsyncVar<ServerDBInfo>> db,
std::string folder);

View File

@ -38,6 +38,7 @@
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/KeyBackedTypes.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/Notified.h"
#include "fdbclient/StatusClient.h"
@ -463,7 +464,7 @@ public:
void byteSampleApplyClear(KeyRangeRef range, Version ver);
void popVersion(Version v, bool popAllTags = false) {
if (logSystem) {
if (logSystem && !isTss()) {
if (v > poppedAllAfter) {
popAllTags = true;
poppedAllAfter = std::numeric_limits<Version>::max();
@ -510,6 +511,21 @@ public:
return mLV.push_back_deep(mLV.arena(), m);
}
void setTssPair(UID pairId) {
tssPairID = Optional<UID>(pairId);
// Set up tss fault injection here, only if we are in simulated mode and fault injection is enabled.
// With fault injection enabled, the tss will act normally for a bit, then after the specified delay
// start behaving incorrectly.
if (g_network->isSimulated() && !g_simulator.speedUpSimulation &&
g_simulator.tssMode >= ISimulator::TSSMode::EnabledAddDelay) {
tssFaultInjectTime = now() + deterministicRandom()->randomInt(60, 300);
TraceEvent(SevWarnAlways, "TSSInjectFaultEnabled", thisServerID)
.detail("Mode", g_simulator.tssMode)
.detail("At", tssFaultInjectTime.get());
}
}
StorageServerDisk storage;
KeyRangeMap<Reference<ShardInfo>> shards;
@ -544,12 +560,17 @@ public:
int64_t versionLag; // An estimate for how many versions it takes for the data to move from the logs to this storage
// server
Optional<UID> sourceTLogID; // the tLog from which the latest batch of versions were fetched
ProtocolVersion logProtocol;
Reference<ILogSystem> logSystem;
Reference<ILogSystem::IPeekCursor> logCursor;
UID thisServerID;
Optional<UID> tssPairID; // if this server is a tss, this is the id of its (ss) pair
Optional<UID> ssPairID; // if this server is an ss, this is the id of its (tss) pair
Optional<double> tssFaultInjectTime;
Key sk;
Reference<AsyncVar<ServerDBInfo>> db;
Database cx;
@ -677,6 +698,8 @@ public:
Counter loops;
Counter fetchWaitingMS, fetchWaitingCount, fetchExecutingMS, fetchExecutingCount;
Counter readsRejected;
Counter fetchedVersions;
Counter fetchesFromLogs;
LatencySample readLatencySample;
LatencyBands readLatencyBands;
@ -694,10 +717,11 @@ public:
updateBatches("UpdateBatches", cc), updateVersions("UpdateVersions", cc), loops("Loops", cc),
fetchWaitingMS("FetchWaitingMS", cc), fetchWaitingCount("FetchWaitingCount", cc),
fetchExecutingMS("FetchExecutingMS", cc), fetchExecutingCount("FetchExecutingCount", cc),
readsRejected("ReadsRejected", cc), readLatencySample("ReadLatencyMetrics",
self->thisServerID,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
readsRejected("ReadsRejected", cc), fetchedVersions("FetchedVersions", cc),
fetchesFromLogs("FetchesFromLogs", cc), readLatencySample("ReadLatencyMetrics",
self->thisServerID,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
readLatencyBands("ReadLatencyBands", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY) {
specialCounter(cc, "LastTLogVersion", [self]() { return self->lastTLogVersion; });
specialCounter(cc, "Version", [self]() { return self->version.get(); });
@ -780,6 +804,14 @@ public:
mutableData().forgetVersionsBefore(ver);
}
bool isTss() const { return tssPairID.present(); }
bool isSSWithTSSPair() const { return ssPairID.present(); }
void setSSWithTssPair(UID idOfTSS) { ssPairID = Optional<UID>(idOfTSS); }
void clearSSWithTssPair() { ssPairID = Optional<UID>(); }
// This is the maximum version that might be read from storage (the minimum version is durableVersion)
Version storageVersion() const { return oldestVersion.get(); }
@ -1155,13 +1187,12 @@ ACTOR Future<Void> getValueQ(StorageServer* data, GetValueRequest req) {
DEBUG_MUTATION("ShardGetValue",
version,
MutationRef(MutationRef::DebugKey, req.key, v.present() ? v.get() : LiteralStringRef("<null>")));
DEBUG_MUTATION("ShardGetPath",
version,
MutationRef(MutationRef::DebugKey,
req.key,
path == 0 ? LiteralStringRef("0")
: path == 1 ? LiteralStringRef("1")
: LiteralStringRef("2")));
DEBUG_MUTATION(
"ShardGetPath",
version,
MutationRef(MutationRef::DebugKey,
req.key,
path == 0 ? LiteralStringRef("0") : path == 1 ? LiteralStringRef("1") : LiteralStringRef("2")));
/*
StorageMetrics m;
@ -1718,7 +1749,9 @@ ACTOR Future<Key> findKey(StorageServer* data,
if (sel.offset <= 1 && sel.offset >= 0)
maxBytes = std::numeric_limits<int>::max();
else
maxBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_LIMIT_BYTES : SERVER_KNOBS->STORAGE_LIMIT_BYTES;
maxBytes = (g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::Disabled && BUGGIFY)
? SERVER_KNOBS->BUGGIFY_LIMIT_BYTES
: SERVER_KNOBS->STORAGE_LIMIT_BYTES;
state GetKeyValuesReply rep = wait(
readRange(data,
@ -1775,10 +1808,10 @@ ACTOR Future<Key> findKey(StorageServer* data,
// This is possible if key/value pairs are very large and only one result is returned on a lastLessThan
// query. SOMEDAY: graceful handling of exceptionally sized values
ASSERT(returnKey != sel.getKey());
return returnKey;
} else
} else {
return forward ? range.end : range.begin;
}
}
}
@ -1849,6 +1882,7 @@ ACTOR Future<Void> getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req)
: findKey(data, req.end, version, shard, &offset2, span.context);
state Key begin = wait(fBegin);
state Key end = wait(fEnd);
if (req.debugID.present())
g_traceBatch.addEvent(
"TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterKeys");
@ -1973,6 +2007,7 @@ ACTOR Future<Void> getKeyQ(StorageServer* data, GetKeyRequest req) {
try {
state Version version = wait(waitForVersion(data, req.version, req.spanContext));
state uint64_t changeCounter = data->shardChangeCounter;
state KeyRange shard = getShardKeyRange(data, req.sel);
@ -2927,32 +2962,30 @@ void changeServerKeys(StorageServer* data,
ChangeServerKeysContext context) {
ASSERT(!keys.empty());
//TraceEvent("ChangeServerKeys", data->thisServerID)
// .detail("KeyBegin", keys.begin)
// .detail("KeyEnd", keys.end)
// .detail("NowAssigned", nowAssigned)
// .detail("Version", version)
// .detail("Context", changeServerKeysContextName[(int)context]);
// TraceEvent("ChangeServerKeys", data->thisServerID)
// .detail("KeyBegin", keys.begin)
// .detail("KeyEnd", keys.end)
// .detail("NowAssigned", nowAssigned)
// .detail("Version", version)
// .detail("Context", changeServerKeysContextName[(int)context]);
validate(data);
// TODO(alexmiller): Figure out how to selectively enable spammy data distribution events.
// DEBUG_KEY_RANGE( nowAssigned ? "KeysAssigned" : "KeysUnassigned", version, keys );
DEBUG_KEY_RANGE(nowAssigned ? "KeysAssigned" : "KeysUnassigned", version, keys);
bool isDifferent = false;
auto existingShards = data->shards.intersectingRanges(keys);
for (auto it = existingShards.begin(); it != existingShards.end(); ++it) {
if (nowAssigned != it->value()->assigned()) {
isDifferent = true;
/*TraceEvent("CSKRangeDifferent", data->thisServerID)
.detail("KeyBegin", it->range().begin)
.detail("KeyEnd", it->range().end);*/
TraceEvent("CSKRangeDifferent", data->thisServerID)
.detail("KeyBegin", it->range().begin)
.detail("KeyEnd", it->range().end);
break;
}
}
if (!isDifferent) {
//TraceEvent("CSKShortCircuit", data->thisServerID)
// .detail("KeyBegin", keys.begin)
// .detail("KeyEnd", keys.end);
// TraceEvent("CSKShortCircuit", data->thisServerID).detail("KeyBegin", keys.begin).detail("KeyEnd", keys.end);
return;
}
@ -2990,13 +3023,13 @@ void changeServerKeys(StorageServer* data,
for (auto r = vr.begin(); r != vr.end(); ++r) {
KeyRangeRef range = keys & r->range();
bool dataAvailable = r->value() == latestVersion || r->value() >= version;
/*TraceEvent("CSKRange", data->thisServerID)
.detail("KeyBegin", range.begin)
.detail("KeyEnd", range.end)
.detail("Available", dataAvailable)
.detail("NowAssigned", nowAssigned)
.detail("NewestAvailable", r->value())
.detail("ShardState0", data->shards[range.begin]->debugDescribeState());*/
// TraceEvent("CSKRange", data->thisServerID)
// .detail("KeyBegin", range.begin)
// .detail("KeyEnd", range.end)
// .detail("Available", dataAvailable)
// .detail("NowAssigned", nowAssigned)
// .detail("NewestAvailable", r->value())
// .detail("ShardState0", data->shards[range.begin]->debugDescribeState());
if (!nowAssigned) {
if (dataAvailable) {
ASSERT(r->value() ==
@ -3098,6 +3131,7 @@ static const KeyValueRef persistFormat(LiteralStringRef(PERSIST_PREFIX "Format")
static const KeyRangeRef persistFormatReadableRange(LiteralStringRef("FoundationDB/StorageServer/1/2"),
LiteralStringRef("FoundationDB/StorageServer/1/5"));
static const KeyRef persistID = LiteralStringRef(PERSIST_PREFIX "ID");
static const KeyRef persistTssPairID = LiteralStringRef(PERSIST_PREFIX "tssPairID");
// (Potentially) change with the durable version or when fetchKeys completes
static const KeyRef persistVersion = LiteralStringRef(PERSIST_PREFIX "Version");
@ -3213,10 +3247,17 @@ private:
throw worker_removed();
} else if ((m.type == MutationRef::SetValue || m.type == MutationRef::ClearRange) &&
m.param1.substr(1).startsWith(serverTagPrefix)) {
bool matchesThisServer = decodeServerTagKey(m.param1.substr(1)) == data->thisServerID;
if ((m.type == MutationRef::SetValue && !matchesThisServer) ||
(m.type == MutationRef::ClearRange && matchesThisServer))
UID serverTagKey = decodeServerTagKey(m.param1.substr(1));
bool matchesThisServer = serverTagKey == data->thisServerID;
bool matchesTssPair = data->isTss() ? serverTagKey == data->tssPairID.get() : false;
if ((m.type == MutationRef::SetValue && !data->isTss() && !matchesThisServer) ||
(m.type == MutationRef::ClearRange && (matchesThisServer || (data->isTss() && matchesTssPair)))) {
throw worker_removed();
}
if (!data->isTss() && m.type == MutationRef::ClearRange && data->ssPairID.present() &&
serverTagKey == data->ssPairID.get()) {
data->clearSSWithTssPair();
}
} else if (m.type == MutationRef::SetValue && m.param1 == rebootWhenDurablePrivateKey) {
data->rebootAfterDurableVersion = currentVersion;
TraceEvent("RebootWhenDurableSet", data->thisServerID)
@ -3226,6 +3267,13 @@ private:
data->primaryLocality = BinaryReader::fromStringRef<int8_t>(m.param2, Unversioned());
auto& mLV = data->addVersionToMutationLog(data->data().getLatestVersion());
data->addMutationToMutationLog(mLV, MutationRef(MutationRef::SetValue, persistPrimaryLocality, m.param2));
} else if (m.type == MutationRef::SetValue && m.param1.substr(1).startsWith(tssMappingKeys.begin)) {
if (!data->isTss()) {
UID ssId = Codec<UID>::unpack(Tuple::unpack(m.param1.substr(1).removePrefix(tssMappingKeys.begin)));
UID tssId = Codec<UID>::unpack(Tuple::unpack(m.param2));
ASSERT(ssId == data->thisServerID);
data->setSSWithTssPair(tssId);
}
} else {
ASSERT(false); // Unknown private mutation
}
@ -3283,6 +3331,21 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
wait(delayJittered(.005, TaskPriority::TLogPeekReply));
}
if (g_network->isSimulated() && data->isTss() && g_simulator.tssMode == ISimulator::TSSMode::EnabledAddDelay &&
data->tssFaultInjectTime.present() && data->tssFaultInjectTime.get() < now()) {
if (deterministicRandom()->random01() < 0.01) {
TraceEvent(SevWarnAlways, "TSSInjectDelayForever", data->thisServerID);
// small random chance to just completely get stuck here; each tss should eventually hit this in
// this mode
wait(Never());
} else {
// otherwise pause for part of a second
double delayTime = deterministicRandom()->random01();
TraceEvent(SevWarnAlways, "TSSInjectDelay", data->thisServerID).detail("Delay", delayTime);
wait(delay(delayTime));
}
}
while (data->byteSampleClearsTooLarge.get()) {
wait(data->byteSampleClearsTooLarge.onChange());
}
@ -3295,8 +3358,9 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
break;
}
}
if (cursor->popped() > 0) {
throw worker_removed();
}
++data->counters.updateBatches;
data->lastTLogVersion = cursor->getMaxKnownVersion();
@ -3347,7 +3411,7 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
} else {
MutationRef msg;
cloneReader >> msg;
//TraceEvent(SevDebug, "SSReadingLog", data->thisServerID).detail("Mutation", msg.toString());
// TraceEvent(SevDebug, "SSReadingLog", data->thisServerID).detail("Mutation", msg.toString());
if (firstMutation && msg.param1.startsWith(systemKeys.end))
hasPrivateData = true;
@ -3455,7 +3519,15 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
Span span("SS:update"_loc, { spanContext });
span.addTag("key"_sr, msg.param1);
if (g_network->isSimulated() && data->isTss() &&
g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations &&
data->tssFaultInjectTime.present() && data->tssFaultInjectTime.get() < now() &&
(msg.type == MutationRef::SetValue || msg.type == MutationRef::ClearRange) && msg.param1.size() &&
msg.param1[0] != 0xff && deterministicRandom()->random01() < 0.05) {
TraceEvent(SevWarnAlways, "TSSInjectDropMutation", data->thisServerID)
.detail("Mutation", msg.toString())
.detail("Version", cloneCursor2->version().toString());
} else if (ver != invalidVersion) { // This change belongs to a version < minVersion
DEBUG_MUTATION("SSPeek", ver, msg).detail("ServerID", data->thisServerID);
if (ver == 1) {
TraceEvent("SSPeekMutation", data->thisServerID);
@ -3519,9 +3591,23 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
if (data->otherError.getFuture().isReady())
data->otherError.getFuture().get();
data->counters.fetchedVersions += (ver - data->version.get());
++data->counters.fetchesFromLogs;
Optional<UID> curSourceTLogID = cursor->getCurrentPeekLocation();
if (curSourceTLogID != data->sourceTLogID) {
data->sourceTLogID = curSourceTLogID;
TraceEvent("StorageServerSourceTLogID", data->thisServerID)
.detail("SourceTLogID",
data->sourceTLogID.present() ? data->sourceTLogID.get().toString() : "unknown")
.trackLatest(data->thisServerID.toString() + "/StorageServerSourceTLogID");
}
data->noRecentUpdates.set(false);
data->lastUpdate = now();
data->version.set(ver); // Triggers replies to waiting gets for new version(s)
setDataVersion(data->thisServerID, data->version.get());
if (data->otherError.getFuture().isReady())
data->otherError.getFuture().get();
@ -3683,6 +3769,9 @@ ACTOR Future<Void> updateStorage(StorageServer* data) {
void StorageServerDisk::makeNewStorageServerDurable() {
storage->set(persistFormat);
storage->set(KeyValueRef(persistID, BinaryWriter::toValue(data->thisServerID, Unversioned())));
if (data->tssPairID.present()) {
storage->set(KeyValueRef(persistTssPairID, BinaryWriter::toValue(data->tssPairID.get(), Unversioned())));
}
storage->set(KeyValueRef(persistVersion, BinaryWriter::toValue(data->version.get(), Unversioned())));
storage->set(KeyValueRef(persistShardAssignedKeys.begin.toString(), LiteralStringRef("0")));
storage->set(KeyValueRef(persistShardAvailableKeys.begin.toString(), LiteralStringRef("0")));
@ -3911,6 +4000,7 @@ ACTOR Future<Void> restoreByteSample(StorageServer* data,
ACTOR Future<bool> restoreDurableState(StorageServer* data, IKeyValueStore* storage) {
state Future<Optional<Value>> fFormat = storage->readValue(persistFormat.key);
state Future<Optional<Value>> fID = storage->readValue(persistID);
state Future<Optional<Value>> ftssPairID = storage->readValue(persistTssPairID);
state Future<Optional<Value>> fVersion = storage->readValue(persistVersion);
state Future<Optional<Value>> fLogProtocol = storage->readValue(persistLogProtocol);
state Future<Optional<Value>> fPrimaryLocality = storage->readValue(persistPrimaryLocality);
@ -3923,7 +4013,7 @@ ACTOR Future<bool> restoreDurableState(StorageServer* data, IKeyValueStore* stor
restoreByteSample(data, storage, byteSampleSampleRecovered, startByteSampleRestore.getFuture());
TraceEvent("ReadingDurableState", data->thisServerID);
wait(waitForAll(std::vector{ fFormat, fID, ftssPairID, fVersion, fLogProtocol, fPrimaryLocality }));
wait(waitForAll(std::vector{ fShardAssigned, fShardAvailable }));
wait(byteSampleSampleRecovered.getFuture());
TraceEvent("RestoringDurableState", data->thisServerID);
@ -3943,7 +4033,12 @@ ACTOR Future<bool> restoreDurableState(StorageServer* data, IKeyValueStore* stor
throw worker_recovery_failed();
}
data->thisServerID = BinaryReader::fromStringRef<UID>(fID.get().get(), Unversioned());
if (ftssPairID.get().present()) {
data->setTssPair(BinaryReader::fromStringRef<UID>(ftssPairID.get().get(), Unversioned()));
}
data->sk = serverKeysPrefixFor((data->tssPairID.present()) ? data->tssPairID.get() : data->thisServerID)
.withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/
if (fLogProtocol.get().present())
data->logProtocol = BinaryReader::fromStringRef<ProtocolVersion>(fLogProtocol.get().get(), Unversioned());
@ -3988,6 +4083,7 @@ ACTOR Future<bool> restoreDurableState(StorageServer* data, IKeyValueStore* stor
wait(yield());
}
// TODO: why is this seemingly random delay here?
wait(delay(0.0001));
{
@ -4235,20 +4331,30 @@ ACTOR Future<Void> metricsCore(StorageServer* self, StorageServerInterface ssi)
wait(self->byteSampleRecovery);
Tag tag = self->tag;
self->actors.add(traceCounters("StorageMetrics",
self->thisServerID,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&self->counters.cc,
self->thisServerID.toString() + "/StorageMetrics",
[self = self](TraceEvent& te) {
te.detail("Tag", self->tag.toString());
StorageBytes sb = self->storage.getStorageBytes();
te.detail("KvstoreBytesUsed", sb.used);
te.detail("KvstoreBytesFree", sb.free);
te.detail("KvstoreBytesAvailable", sb.available);
te.detail("KvstoreBytesTotal", sb.total);
te.detail("KvstoreBytesTemp", sb.temp);
if (self->isTss()) {
te.detail("TSSPairID", self->tssPairID);
te.detail("TSSJointID",
UID(self->thisServerID.first() ^ self->tssPairID.get().first(),
self->thisServerID.second() ^ self->tssPairID.get().second()));
} else if (self->isSSWithTSSPair()) {
te.detail("SSPairID", self->ssPairID);
te.detail("TSSJointID",
UID(self->thisServerID.first() ^ self->ssPairID.get().first(),
self->thisServerID.second() ^ self->ssPairID.get().second()));
}
}));
loop {
@ -4352,6 +4458,7 @@ ACTOR Future<Void> serveGetValueRequests(StorageServer* self, FutureStream<GetVa
ACTOR Future<Void> serveGetKeyValuesRequests(StorageServer* self, FutureStream<GetKeyValuesRequest> getKeyValues) {
loop {
GetKeyValuesRequest req = waitNext(getKeyValues);
// Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade
// before doing real work
self->actors.add(self->readGuard(req, getKeyValuesQ));
@ -4649,18 +4756,19 @@ ACTOR Future<Void> memoryStoreRecover(IKeyValueStore* store, Reference<ClusterCo
// create a temp client to connect to the DB
Database cx = Database::createDatabase(connFile, Database::API_VERSION_LATEST);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
state int noCanRemoveCount = 0;
loop {
try {
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state bool canRemove = wait(canRemoveStorageServer(tr, id));
if (!canRemove) {
TEST(true); // it's possible that the caller had a transaction in flight that assigned keys to the
// server. Wait for it to reverse its mistake.
wait(delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::UpdateStorage));
tr->reset();
TraceEvent("RemoveStorageServerRetrying")
.detail("Count", noCanRemoveCount++)
.detail("ServerID", id)
@ -4670,21 +4778,28 @@ ACTOR Future<Void> memoryStoreRecover(IKeyValueStore* store, Reference<ClusterCo
}
} catch (Error& e) {
state Error err = e;
wait(tr->onError(e));
TraceEvent("RemoveStorageServerRetrying").error(err);
}
}
}
// for creating a new storage server
ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
StorageServerInterface ssi,
Tag seedTag,
Version tssSeedVersion,
ReplyPromise<InitializeStorageReply> recruitReply,
Reference<AsyncVar<ServerDBInfo>> db,
std::string folder) {
state StorageServer self(persistentData, db, ssi);
if (ssi.isTss()) {
self.setTssPair(ssi.tssPairID.get());
ASSERT(self.isTss());
}
self.sk = serverKeysPrefixFor(self.tssPairID.present() ? self.tssPairID.get() : self.thisServerID)
.withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/
self.folder = folder;
try {
@ -4695,7 +4810,11 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
std::pair<Version, Tag> verAndTag = wait(addStorageServer(
self.cx, ssi)); // Might throw recruitment_failed in case of simultaneous master failure
self.tag = verAndTag.second;
if (ssi.isTss()) {
self.setInitialVersion(tssSeedVersion);
} else {
self.setInitialVersion(verAndTag.first - 1);
}
} else {
self.tag = seedTag;
}
@ -4705,12 +4824,14 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
TraceEvent("StorageServerInit", ssi.id())
.detail("Version", self.version.get())
.detail("SeedTag", seedTag.toString());
.detail("SeedTag", seedTag.toString())
.detail("TssPair", ssi.isTss() ? ssi.tssPairID.get().toString() : "");
InitializeStorageReply rep;
rep.interf = ssi;
rep.addedVersion = self.version.get();
recruitReply.send(rep);
self.byteSampleRecovery = Void();
wait(storageServerCore(&self, ssi));
throw internal_error();
@ -4726,6 +4847,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
}
ACTOR Future<Void> replaceInterface(StorageServer* self, StorageServerInterface ssi) {
ASSERT(!ssi.isTss());
state Transaction tr(self->cx);
loop {
@ -4740,6 +4862,7 @@ ACTOR Future<Void> replaceInterface(StorageServer* self, StorageServerInterface
GetStorageServerRejoinInfoRequest(ssi.id(), ssi.locality.dcId()))
: Never())) {
state GetStorageServerRejoinInfoReply rep = _rep;
try {
tr.reset();
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
@ -4758,6 +4881,7 @@ ACTOR Future<Void> replaceInterface(StorageServer* self, StorageServerInterface
tagLocalityListValue(rep.newTag.get().locality));
}
// this should only happen if the SS moved datacenters
if (rep.newTag.present()) {
KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(rep.newTag.get()));
tr.addReadConflictRange(conflictRange);
@ -4813,6 +4937,49 @@ ACTOR Future<Void> replaceInterface(StorageServer* self, StorageServerInterface
return Void();
}
ACTOR Future<Void> replaceTSSInterface(StorageServer* self, StorageServerInterface ssi) {
// RYW for KeyBackedMap
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(self->cx);
state KeyBackedMap<UID, UID> tssMapDB = KeyBackedMap<UID, UID>(tssMappingKeys.begin);
ASSERT(ssi.isTss());
loop {
try {
state Tag myTag;
tr->reset();
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> pairTagValue = wait(tr->get(serverTagKeyFor(self->tssPairID.get())));
if (!pairTagValue.present()) {
TEST(true); // Race where tss was down, pair was removed, tss starts back up
throw worker_removed();
}
myTag = decodeServerTagValue(pairTagValue.get());
tr->addReadConflictRange(singleKeyRange(serverListKeyFor(ssi.id())));
tr->set(serverListKeyFor(ssi.id()), serverListValue(ssi));
// add itself back to tss mapping
tssMapDB.set(tr, self->tssPairID.get(), ssi.id());
wait(tr->commit());
self->tag = myTag;
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
return Void();
}
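// A minimal read-side sketch (not part of this change): fetch the tss pair for a
// given SS id back out of the mapping, mirroring the RYW transaction pattern used
// in replaceTSSInterface above. The actor name is hypothetical, and it assumes
// KeyBackedMap::get(tr, key) returning Future<Optional<UID>>.
ACTOR Future<Optional<UID>> readTssPairSketch(Database cx, UID ssId) {
	state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
	state KeyBackedMap<UID, UID> tssMapDB = KeyBackedMap<UID, UID>(tssMappingKeys.begin);
	loop {
		try {
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			Optional<UID> tssId = wait(tssMapDB.get(tr, ssId));
			return tssId;
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
}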
// for recovering an existing storage server
ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
StorageServerInterface ssi,
Reference<AsyncVar<ServerDBInfo>> db,
@ -4821,7 +4988,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
Reference<ClusterConnectionFile> connFile) {
state StorageServer self(persistentData, db, ssi);
self.folder = folder;
try {
state double start = now();
TraceEvent("StorageServerRebootStart", self.thisServerID);
@ -4846,13 +5013,30 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
}
TraceEvent("SSTimeRestoreDurableState", self.thisServerID).detail("TimeTaken", now() - start);
// if this is a tss storage file, use that as source of truth for this server being a tss instead of the
// presence of the tss pair key in the storage engine
if (ssi.isTss()) {
ASSERT(self.isTss());
ssi.tssPairID = self.tssPairID.get();
} else {
ASSERT(!self.isTss());
}
ASSERT(self.thisServerID == ssi.id());
self.sk = serverKeysPrefixFor(self.tssPairID.present() ? self.tssPairID.get() : self.thisServerID)
.withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/
TraceEvent("StorageServerReboot", self.thisServerID).detail("Version", self.version.get());
if (recovered.canBeSet())
recovered.send(Void());
if (self.isTss()) {
wait(replaceTSSInterface(&self, ssi));
} else {
wait(replaceInterface(&self, ssi));
}
TraceEvent("StorageServerStartingCore", self.thisServerID).detail("TimeTaken", now() - start);

View File

@ -869,6 +869,7 @@ ACTOR Future<Void> checkConsistency(Database cx,
std::vector<TesterInterface> testers,
bool doQuiescentCheck,
bool doCacheCheck,
bool doTSSCheck,
double quiescentWaitTimeout,
double softTimeLimit,
double databasePingDelay,
@ -885,12 +886,16 @@ ACTOR Future<Void> checkConsistency(Database cx,
Standalone<VectorRef<KeyValueRef>> options;
StringRef performQuiescent = LiteralStringRef("false");
StringRef performCacheCheck = LiteralStringRef("false");
StringRef performTSSCheck = LiteralStringRef("false");
if (doQuiescentCheck) {
performQuiescent = LiteralStringRef("true");
}
if (doCacheCheck) {
performCacheCheck = LiteralStringRef("true");
}
if (doTSSCheck) {
performTSSCheck = LiteralStringRef("true");
}
spec.title = LiteralStringRef("ConsistencyCheck");
spec.databasePingDelay = databasePingDelay;
spec.timeout = 32000;
@ -898,6 +903,7 @@ ACTOR Future<Void> checkConsistency(Database cx,
KeyValueRef(LiteralStringRef("testName"), LiteralStringRef("ConsistencyCheck")));
options.push_back_deep(options.arena(), KeyValueRef(LiteralStringRef("performQuiescentChecks"), performQuiescent));
options.push_back_deep(options.arena(), KeyValueRef(LiteralStringRef("performCacheCheck"), performCacheCheck));
options.push_back_deep(options.arena(), KeyValueRef(LiteralStringRef("performTSSCheck"), performTSSCheck));
options.push_back_deep(options.arena(),
KeyValueRef(LiteralStringRef("quiescentWaitTimeout"),
ValueRef(options.arena(), format("%f", quiescentWaitTimeout))));
@ -973,6 +979,7 @@ ACTOR Future<bool> runTest(Database cx,
testers,
quiescent,
spec.runConsistencyCheckOnCache,
spec.runConsistencyCheckOnTSS,
10000.0,
18000,
spec.databasePingDelay,
@ -1108,6 +1115,11 @@ std::map<std::string, std::function<void(const std::string& value, TestSpec* spe
spec->runConsistencyCheckOnCache = (value == "true");
TraceEvent("TestParserTest").detail("ParsedRunConsistencyCheckOnCache", spec->runConsistencyCheckOnCache);
} },
{ "runConsistencyCheckOnTSS",
[](const std::string& value, TestSpec* spec) {
spec->runConsistencyCheckOnTSS = (value == "true");
TraceEvent("TestParserTest").detail("ParsedRunConsistencyCheckOnTSS", spec->runConsistencyCheckOnTSS);
} },
{ "waitForQuiescence",
[](const std::string& value, TestSpec* spec) {
bool toWait = value == "true";
@ -1249,20 +1261,6 @@ std::vector<TestSpec> readTOMLTests_(std::string fileName) {
const toml::value& conf = toml::parse(fileName);
// Handle all global settings
for (const auto& [k, v] : conf.as_table()) {
if (k == "test") {
continue;
}
if (testSpecGlobalKeys.find(k) != testSpecGlobalKeys.end()) {
testSpecGlobalKeys[k](toml_to_string(v));
} else {
TraceEvent(SevError, "TestSpecUnrecognizedGlobalParam")
.detail("Attrib", k)
.detail("Value", toml_to_string(v));
}
}
// Then parse each test
const toml::array& tests = toml::find(conf, "test").as_array();
for (const toml::value& test : tests) {

View File

@ -22,6 +22,7 @@
#include <boost/lexical_cast.hpp>
#include "fdbrpc/Locality.h"
#include "fdbclient/GlobalConfig.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbserver/Knobs.h"
#include "flow/ActorCollection.h"
@ -139,12 +140,14 @@ Database openDBOnServer(Reference<AsyncVar<ServerDBInfo>> const& db,
bool enableLocalityLoadBalance,
bool lockAware) {
auto info = makeReference<AsyncVar<ClientDBInfo>>();
auto cx = DatabaseContext::create(info,
extractClientInfo(db, info),
enableLocalityLoadBalance ? db->get().myLocality : LocalityData(),
enableLocalityLoadBalance,
taskID,
lockAware);
GlobalConfig::create(cx, db, std::addressof(db->get().client));
return cx;
}
struct ErrorInfo {
@ -272,6 +275,7 @@ ACTOR Future<Void> loadedPonger(FutureStream<LoadedPingRequest> pings) {
}
StringRef fileStoragePrefix = LiteralStringRef("storage-");
StringRef testingStoragePrefix = LiteralStringRef("testingstorage-");
StringRef fileLogDataPrefix = LiteralStringRef("log-");
StringRef fileVersionedLogDataPrefix = LiteralStringRef("log2-");
StringRef fileLogQueuePrefix = LiteralStringRef("logqueue-");
@ -315,6 +319,7 @@ std::string filenameFromSample(KeyValueStoreType storeType, std::string folder,
}
std::string filenameFromId(KeyValueStoreType storeType, std::string folder, std::string prefix, UID id) {
if (storeType == KeyValueStoreType::SSD_BTREE_V1)
return joinPath(folder, prefix + id.toString() + ".fdb");
else if (storeType == KeyValueStoreType::SSD_BTREE_V2)
@ -326,6 +331,7 @@ std::string filenameFromId(KeyValueStoreType storeType, std::string folder, std:
else if (storeType == KeyValueStoreType::SSD_ROCKSDB_V1)
return joinPath(folder, prefix + id.toString() + ".rocksdb");
TraceEvent(SevError, "UnknownStoreType").detail("StoreType", storeType.toString());
UNREACHABLE();
}
@ -444,6 +450,9 @@ std::vector<DiskStore> getDiskStores(std::string folder,
if (filename.startsWith(fileStoragePrefix)) {
store.storedComponent = DiskStore::Storage;
prefix = fileStoragePrefix;
} else if (filename.startsWith(testingStoragePrefix)) {
store.storedComponent = DiskStore::Storage;
prefix = testingStoragePrefix;
} else if (filename.startsWith(fileVersionedLogDataPrefix)) {
store.storedComponent = DiskStore::TLogData;
// Use the option string that's in the file rather than tLogOptions.toPrefix(),
@ -739,6 +748,7 @@ ACTOR Future<Void> storageServerRollbackRebooter(Future<Void> prevStorageServer,
std::string filename,
UID id,
LocalityData locality,
bool isTss,
Reference<AsyncVar<ServerDBInfo>> db,
std::string folder,
ActorCollection* filesClosed,
@ -756,6 +766,9 @@ ACTOR Future<Void> storageServerRollbackRebooter(Future<Void> prevStorageServer,
StorageServerInterface recruited;
recruited.uniqueID = id;
recruited.locality = locality;
recruited.tssPairID =
isTss ? Optional<UID>(UID()) : Optional<UID>(); // set this here since we use its presence to determine
// whether this server is a tss or not
recruited.initEndpoints();
DUMPTOKEN(recruited.getValue);
@ -1097,14 +1110,27 @@ ACTOR Future<Void> workerServer(Reference<ClusterConnectionFile> connFile,
Future<Void> kvClosed = kv->onClosed();
filesClosed.add(kvClosed);
// std::string doesn't have startsWith
std::string tssPrefix = testingStoragePrefix.toString();
// TODO might be more efficient to mark a boolean on DiskStore in getDiskStores, but that kind of breaks
// the abstraction since DiskStore also applies to storage cache + tlog
bool isTss = s.filename.find(tssPrefix) != std::string::npos;
Role ssRole = isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER;
StorageServerInterface recruited;
recruited.uniqueID = s.storeID;
recruited.locality = locality;
recruited.tssPairID =
isTss ? Optional<UID>(UID())
: Optional<UID>(); // presence of optional is used as source of truth for tss vs not. Value
// gets overridden later in restoreDurableState
recruited.initEndpoints();
std::map<std::string, std::string> details;
details["StorageEngine"] = s.storeType.toString();
startRole(Role::STORAGE_SERVER, recruited.id(), interf.id(), details, "Restored");
details["IsTSS"] = isTss ? "Yes" : "No";
startRole(ssRole, recruited.id(), interf.id(), details, "Restored");
DUMPTOKEN(recruited.getValue);
DUMPTOKEN(recruited.getKey);
@ -1129,12 +1155,13 @@ ACTOR Future<Void> workerServer(Reference<ClusterConnectionFile> connFile,
s.filename,
recruited.id(),
recruited.locality,
isTss,
dbInfo,
folder,
&filesClosed,
memoryLimit,
kv);
errorForwarders.add(forwardError(errors, ssRole, recruited.id(), f));
} else if (s.storedComponent == DiskStore::TLogData) {
std::string logQueueBasename;
const std::string filename = basename(s.filename);
@ -1268,7 +1295,6 @@ ACTOR Future<Void> workerServer(Reference<ClusterConnectionFile> connFile,
notUpdated = interf.updateServerDBInfo.getEndpoint();
} else if (localInfo.infoGeneration > dbInfo->get().infoGeneration ||
dbInfo->get().clusterInterface != ccInterface->get().get()) {
TraceEvent("GotServerDBInfoChange")
.detail("ChangeID", localInfo.id)
.detail("MasterID", localInfo.master.id())
@ -1487,13 +1513,19 @@ ACTOR Future<Void> workerServer(Reference<ClusterConnectionFile> connFile,
}
when(InitializeStorageRequest req = waitNext(interf.storage.getFuture())) {
if (!storageCache.exists(req.reqId)) {
bool isTss = req.tssPairIDAndVersion.present();
StorageServerInterface recruited(req.interfaceId);
recruited.locality = locality;
recruited.tssPairID = isTss ? req.tssPairIDAndVersion.get().first : Optional<UID>();
recruited.initEndpoints();
std::map<std::string, std::string> details;
details["StorageEngine"] = req.storeType.toString();
details["IsTSS"] = std::to_string(isTss);
Role ssRole = isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER;
startRole(ssRole, recruited.id(), interf.id(), details);
DUMPTOKEN(recruited.getValue);
DUMPTOKEN(recruited.getKey);
@ -1511,13 +1543,22 @@ ACTOR Future<Void> workerServer(Reference<ClusterConnectionFile> connFile,
// printf("Recruited as storageServer\n");
std::string filename =
filenameFromId(req.storeType,
folder,
isTss ? testingStoragePrefix.toString() : fileStoragePrefix.toString(),
recruited.id());
IKeyValueStore* data = openKVStore(req.storeType, filename, recruited.id(), memoryLimit);
Future<Void> kvClosed = data->onClosed();
filesClosed.add(kvClosed);
ReplyPromise<InitializeStorageReply> storageReady = req.reply;
storageCache.set(req.reqId, storageReady.getFuture());
Future<Void> s = storageServer(data,
recruited,
req.seedTag,
isTss ? req.tssPairIDAndVersion.get().second : 0,
storageReady,
dbInfo,
folder);
s = handleIOErrors(s, data, recruited.id(), kvClosed);
s = storageCache.removeOnReady(req.reqId, s);
s = storageServerRollbackRebooter(s,
@ -1525,12 +1566,13 @@ ACTOR Future<Void> workerServer(Reference<ClusterConnectionFile> connFile,
filename,
recruited.id(),
recruited.locality,
isTss,
dbInfo,
folder,
&filesClosed,
memoryLimit,
data);
errorForwarders.add(forwardError(errors, ssRole, recruited.id(), s));
} else
forwardPromise(req.reply, storageCache.get(req.reqId));
}
@ -2047,7 +2089,7 @@ ACTOR Future<Void> fdbd(Reference<ClusterConnectionFile> connFile,
if (coordFolder.size()) {
// SOMEDAY: remove the fileNotFound wrapper and make DiskQueue construction safe from errors setting up
// their files
actors.push_back(fileNotFoundToNever(coordinationServer(coordFolder, coordinators.ccf)));
}
state UID processIDUid = wait(createAndLockProcessIdFile(dataFolder));
@ -2111,6 +2153,7 @@ ACTOR Future<Void> fdbd(Reference<ClusterConnectionFile> connFile,
const Role Role::WORKER("Worker", "WK", false);
const Role Role::STORAGE_SERVER("StorageServer", "SS");
const Role Role::TESTING_STORAGE_SERVER("TestingStorageServer", "ST");
const Role Role::TRANSACTION_LOG("TLog", "TL");
const Role Role::SHARED_TRANSACTION_LOG("SharedTLog", "SL", false);
const Role Role::COMMIT_PROXY("CommitProxyServer", "CP");

View File

@ -270,6 +270,7 @@ struct ConfigureDatabaseWorkload : TestWorkload {
return Void();
}
state int randomChoice = deterministicRandom()->randomInt(0, 8);
if (randomChoice == 0) {
wait(success(
runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Optional<Value>> {
@ -316,8 +317,14 @@ struct ConfigureDatabaseWorkload : TestWorkload {
} else if (randomChoice == 4) {
//TraceEvent("ConfigureTestQuorumBegin").detail("NewQuorum", s);
auto ch = autoQuorumChange();
std::string desiredClusterName = "NewName%d";
if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT) {
// if configuration does not allow changing the descriptor, pass empty string (keep old descriptor)
desiredClusterName = "";
}
if (deterministicRandom()->randomInt(0, 2))
ch = nameQuorumChange(format("NewName%d", deterministicRandom()->randomInt(0, 100)), ch);
ch = nameQuorumChange(format(desiredClusterName.c_str(), deterministicRandom()->randomInt(0, 100)),
ch);
wait(success(changeQuorum(cx, ch)));
//TraceEvent("ConfigureTestConfigureEnd").detail("NewQuorum", s);
} else if (randomChoice == 5) {

View File

@ -32,6 +32,7 @@
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/DataDistribution.actor.h"
#include "fdbserver/QuietDatabase.h"
#include "fdbserver/TSSMappingUtil.actor.h"
#include "flow/DeterministicRandom.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/StorageServerInterface.h"
@ -48,6 +49,9 @@ struct ConsistencyCheckWorkload : TestWorkload {
// Whether or not perform consistency check between storage cache servers and storage servers
bool performCacheCheck;
// Whether or not to perform consistency check between storage servers and pair TSS
bool performTSSCheck;
// How long to wait for the database to go quiet before failing (if doing quiescent checks)
double quiescentWaitTimeout;
@ -94,6 +98,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
ConsistencyCheckWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
performQuiescentChecks = getOption(options, LiteralStringRef("performQuiescentChecks"), false);
performCacheCheck = getOption(options, LiteralStringRef("performCacheCheck"), false);
performTSSCheck = getOption(options, LiteralStringRef("performTSSCheck"), true);
quiescentWaitTimeout = getOption(options, LiteralStringRef("quiescentWaitTimeout"), 600.0);
distributed = getOption(options, LiteralStringRef("distributed"), true);
shardSampleFactor = std::max(getOption(options, LiteralStringRef("shardSampleFactor"), 1), 1);
@ -205,11 +210,16 @@ struct ConsistencyCheckWorkload : TestWorkload {
if (self->firstClient || self->distributed) {
try {
state DatabaseConfiguration configuration;
state std::map<UID, StorageServerInterface> tssMapping;
state Transaction tr(cx);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
loop {
try {
if (self->performTSSCheck) {
tssMapping.clear();
wait(readTSSMapping(&tr, &tssMapping));
}
RangeResult res = wait(tr.getRange(configKeys, 1000));
if (res.size() == 1000) {
TraceEvent("ConsistencyCheck_TooManyConfigOptions");
@ -282,7 +292,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
throw;
}
wait(::success(self->checkForStorage(cx, configuration, tssMapping, self)));
wait(::success(self->checkForExtraDataStores(cx, self)));
// Check that each machine is operating as its desired class
@ -313,7 +323,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
state Standalone<VectorRef<KeyValueRef>> keyLocations = keyLocationPromise.getFuture().get();
// Check that each shard has the same data on all storage servers that it resides on
wait(::success(self->checkDataConsistency(cx, keyLocations, configuration, tssMapping, self)));
// Cache consistency check
if (self->performCacheCheck)
@ -1057,7 +1067,9 @@ struct ConsistencyCheckWorkload : TestWorkload {
TraceEvent("ConsistencyCheck_FailedToFetchMetrics")
.detail("Begin", printable(shard.begin))
.detail("End", printable(shard.end))
.detail("StorageServer", storageServers[i].id());
.detail("StorageServer", storageServers[i].id())
.detail("IsTSS", storageServers[i].isTss() ? "True" : "False")
.error(reply.getError());
estimatedBytes.push_back(-1);
}
@ -1074,7 +1086,11 @@ struct ConsistencyCheckWorkload : TestWorkload {
.detail("Begin", printable(shard.begin))
.detail("End", printable(shard.end))
.detail("StorageServer1", storageServers[firstValidStorageServer].id())
.detail("StorageServer2", storageServers[i].id());
.detail("StorageServer2", storageServers[i].id())
.detail("IsTSS",
storageServers[i].isTss() || storageServers[firstValidStorageServer].isTss()
? "True"
: "False");
}
}
}
@ -1114,6 +1130,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
ACTOR Future<bool> checkDataConsistency(Database cx,
VectorRef<KeyValueRef> keyLocations,
DatabaseConfiguration configuration,
std::map<UID, StorageServerInterface> tssMapping,
ConsistencyCheckWorkload* self) {
// Stores the total number of bytes on each storage server
// In a distributed test, this will be an estimated size
@ -1236,6 +1253,19 @@ struct ConsistencyCheckWorkload : TestWorkload {
}
}
// add TSS to end of list, if configured and if not relocating
if (!isRelocating && self->performTSSCheck) {
int initialSize = storageServers.size();
for (int i = 0; i < initialSize; i++) {
auto tssPair = tssMapping.find(storageServers[i]);
if (tssPair != tssMapping.end()) {
TEST(true); // TSS checked in consistency check
storageServers.push_back(tssPair->second.id());
storageServerInterfaces.push_back(tssPair->second);
}
}
}
state vector<int64_t> estimatedBytes = wait(self->getStorageSizeEstimate(storageServerInterfaces, range));
// Gets permitted size range of shard
@ -1323,7 +1353,8 @@ struct ConsistencyCheckWorkload : TestWorkload {
// Be especially verbose if in simulation
if (g_network->isSimulated()) {
int invalidIndex = -1;
printf("\nSERVER %d (%s); shard = %s - %s:\n",
printf("\n%sSERVER %d (%s); shard = %s - %s:\n",
storageServerInterfaces[j].isTss() ? "TSS " : "",
j,
storageServerInterfaces[j].address().toString().c_str(),
printable(req.begin.getKey()).c_str(),
@ -1341,7 +1372,8 @@ struct ConsistencyCheckWorkload : TestWorkload {
}
printf(
"\nSERVER %d (%s); shard = %s - %s:\n",
"\n%sSERVER %d (%s); shard = %s - %s:\n",
storageServerInterfaces[firstValidServer].isTss() ? "TSS " : "",
firstValidServer,
storageServerInterfaces[firstValidServer].address().toString().c_str(),
printable(req.begin.getKey()).c_str(),
@ -1430,16 +1462,29 @@ struct ConsistencyCheckWorkload : TestWorkload {
printable(referenceUniqueKey))
.detail("ValueMismatches", valueMismatches)
.detail("ValueMismatchKey", printable(valueMismatchKey))
.detail("MatchingKVPairs", matchingKVPairs);
.detail("MatchingKVPairs", matchingKVPairs)
.detail("IsTSS",
storageServerInterfaces[j].isTss() ||
storageServerInterfaces[firstValidServer].isTss()
? "True"
: "False");
self->testFailure("Data inconsistent", true);
return false;
if ((g_network->isSimulated() &&
g_simulator.tssMode != ISimulator::TSSMode::EnabledDropMutations) ||
(!storageServerInterfaces[j].isTss() &&
!storageServerInterfaces[firstValidServer].isTss())) {
self->testFailure("Data inconsistent", true);
return false;
}
}
}
}
// If the data is not available and we aren't relocating this shard
else if (!isRelocating) {
Error e =
rangeResult.isError() ? rangeResult.getError() : rangeResult.get().error.get();
TraceEvent("ConsistencyCheck_StorageServerUnavailable")
.suppressFor(1.0)
.detail("StorageServer", storageServers[j])
@ -1448,10 +1493,15 @@ struct ConsistencyCheckWorkload : TestWorkload {
.detail("Address", storageServerInterfaces[j].address())
.detail("UID", storageServerInterfaces[j].id())
.detail("GetKeyValuesToken",
storageServerInterfaces[j].getKeyValues.getEndpoint().token)
.detail("IsTSS", storageServerInterfaces[j].isTss() ? "True" : "False")
.error(e);
// All shards should be available in quiescence
if (self->performQuiescentChecks &&
((g_network->isSimulated() &&
g_simulator.tssMode != ISimulator::TSSMode::EnabledAddDelay) ||
!storageServerInterfaces[j].isTss())) {
self->testFailure("Storage server unavailable");
return false;
}
@ -1552,13 +1602,18 @@ struct ConsistencyCheckWorkload : TestWorkload {
TraceEvent("ConsistencyCheck_IncorrectEstimate")
.detail("EstimatedBytes", estimatedBytes[j])
.detail("CorrectSampledBytes", sampledBytes)
.detail("StorageServer", storageServers[j]);
self->testFailure("Storage servers had incorrect sampled estimate");
.detail("StorageServer", storageServers[j])
.detail("IsTSS", storageServerInterfaces[j].isTss() ? "True" : "False");
if (!storageServerInterfaces[j].isTss()) {
self->testFailure("Storage servers had incorrect sampled estimate");
}
hasValidEstimate = false;
break;
} else if (estimatedBytes[j] < 0 &&
(g_network->isSimulated() || !storageServerInterfaces[j].isTss())) {
self->testFailure("Could not get storage metrics from server");
hasValidEstimate = false;
break;
@ -1670,7 +1725,10 @@ struct ConsistencyCheckWorkload : TestWorkload {
if (!keyValueStoreType.present()) {
TraceEvent("ConsistencyCheck_ServerUnavailable").detail("ServerID", storageServers[i].id());
self->testFailure("Storage server unavailable");
} else if ((!storageServers[i].isTss() &&
keyValueStoreType.get() != configuration.storageServerStoreType) ||
(storageServers[i].isTss() &&
keyValueStoreType.get() != configuration.testingStorageServerStoreType)) {
TraceEvent("ConsistencyCheck_WrongKeyValueStoreType")
.detail("ServerID", storageServers[i].id())
.detail("StoreType", keyValueStoreType.get().toString())
@ -1698,10 +1756,11 @@ struct ConsistencyCheckWorkload : TestWorkload {
// Returns false if any worker that should have a storage server does not have one
ACTOR Future<bool> checkForStorage(Database cx,
DatabaseConfiguration configuration,
std::map<UID, StorageServerInterface> tssMapping,
ConsistencyCheckWorkload* self) {
state vector<WorkerDetails> workers = wait(getWorkers(self->dbInfo));
state vector<StorageServerInterface> storageServers = wait(getStorageServers(cx));
std::vector<Optional<Key>> missingStorage; // vector instead of a set to get the count
for (int i = 0; i < workers.size(); i++) {
NetworkAddress addr = workers[i].interf.stableAddress();
@ -1720,21 +1779,48 @@ struct ConsistencyCheckWorkload : TestWorkload {
.detail("Address", addr)
.detail("ProcessClassEqualToStorageClass",
(int)(workers[i].processClass == ProcessClass::StorageClass));
missingStorage.push_back(workers[i].interf.locality.dcId());
}
}
}
int missingDc0 = configuration.regions.size() == 0
? 0
: std::count(missingStorage.begin(), missingStorage.end(), configuration.regions[0].dcId);
int missingDc1 = configuration.regions.size() < 2
? 0
: std::count(missingStorage.begin(), missingStorage.end(), configuration.regions[1].dcId);
if ((configuration.regions.size() == 0 && missingStorage.size()) ||
(configuration.regions.size() == 1 && missingDc0) ||
(configuration.regions.size() == 2 && configuration.usableRegions == 1 && missingDc0 && missingDc1) ||
(configuration.regions.size() == 2 && configuration.usableRegions > 1 && (missingDc0 || missingDc1))) {
// TODO could improve this check by also ensuring DD is currently recruiting a TSS by using quietdb?
bool couldExpectMissingTss = (configuration.desiredTSSCount - tssMapping.size()) > 0;
int countMissing = missingStorage.size();
int acceptableTssMissing = 1;
if (configuration.regions.size() == 1) {
countMissing = missingDc0;
} else if (configuration.regions.size() == 2) {
if (configuration.usableRegions == 1) {
// all processes should be missing from one region, so take the number missing from the other
countMissing = std::min(missingDc0, missingDc1);
} else if (configuration.usableRegions == 2) {
countMissing = missingDc0 + missingDc1;
acceptableTssMissing = 2;
} else {
ASSERT(false); // in case fdb ever adds 3+ region support?
}
}
if (!couldExpectMissingTss || countMissing > acceptableTssMissing) {
self->testFailure("No storage server on worker");
return false;
} else {
TraceEvent(SevWarn, "ConsistencyCheck_TSSMissing");
}
}
return true;

View File

@ -162,12 +162,13 @@ struct MoveKeysWorkload : TestWorkload {
// The real data distribution algorithm doesn't want to deal with multiple servers
// with the same address having keys. So if there are two servers with the same address,
// don't use either one (so we don't have to find out which of them, if any, already has keys).
// Also get rid of tss since we don't want to move a shard to a tss.
std::map<NetworkAddress, int> count;
for (int s = 0; s < servers.size(); s++)
count[servers[s].address()]++;
int o = 0;
for (int s = 0; s < servers.size(); s++)
if (count[servers[s].address()] == 1 && !servers[s].isTss())
servers[o++] = servers[s];
servers.resize(o);
}

View File

@ -624,7 +624,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
ACTOR Future<Void> managementApiCorrectnessActor(Database cx_, SpecialKeySpaceCorrectnessWorkload* self) {
// All management api related tests
state Database cx = cx_->clone();
state Reference<ReadYourWritesTransaction> tx = makeReference<ReadYourWritesTransaction>(cx);
// test ordered option keys
{
@ -936,7 +936,10 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
// test change coordinators and cluster description
// we randomly pick one process(not coordinator) and add it, in this case, it should always succeed
{
// choose a new description if configuration allows transactions across differently named clusters
state std::string new_cluster_description = SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT
? deterministicRandom()->randomAlphaNumeric(8)
: cs.clusterKeyName().toString();
state std::string new_coordinator_process;
state std::vector<std::string> old_coordinators_processes;
state bool possible_to_add_coordinator;
@ -1426,6 +1429,40 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
}
}
}
// make sure that when we change dd-related special keys, we grab the two system keys,
// i.e. moveKeysLockOwnerKey and moveKeysLockWriteKey
{
state Reference<ReadYourWritesTransaction> tr1(new ReadYourWritesTransaction(cx));
state Reference<ReadYourWritesTransaction> tr2(new ReadYourWritesTransaction(cx));
loop {
try {
Version readVersion = wait(tr1->getReadVersion());
tr2->setVersion(readVersion);
tr1->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
tr2->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
KeyRef ddPrefix = SpecialKeySpace::getManagementApiCommandPrefix("datadistribution");
tr1->set(LiteralStringRef("mode").withPrefix(ddPrefix), LiteralStringRef("1"));
wait(tr1->commit());
// randomly read the moveKeysLockOwnerKey/moveKeysLockWriteKey
// both of them should be grabbed when changing dd mode
wait(success(
tr2->get(deterministicRandom()->coinflip() ? moveKeysLockOwnerKey : moveKeysLockWriteKey)));
// tr2 should never succeed; just write to a key to make it not a read-only transaction
tr2->set(LiteralStringRef("unused_key"), LiteralStringRef(""));
wait(tr2->commit());
ASSERT(false); // commit should always fail due to conflict
} catch (Error& e) {
if (e.code() != error_code_not_committed) {
// when buggify is enabled, it's possible we get other retriable errors
wait(tr2->onError(e));
tr1->reset();
} else {
// loop until we get conflict error
break;
}
}
}
}
return Void();
}
};

View File

@ -734,7 +734,9 @@ ACTOR Future<Void> randomTransaction(Database cx, WriteDuringReadWorkload* self,
state bool readAheadDisabled = deterministicRandom()->random01() < 0.5;
state bool snapshotRYWDisabled = deterministicRandom()->random01() < 0.5;
state bool useBatchPriority = deterministicRandom()->random01() < 0.5;
state int64_t timebomb = (FLOW_KNOBS->MAX_BUGGIFIED_DELAY == 0.0 && deterministicRandom()->random01() < 0.01)
? deterministicRandom()->randomInt64(1, 6000)
: 0; // timebomb check can fail incorrectly if simulation injects delay longer than the timebomb
state std::vector<Future<Void>> operations;
state ActorCollection commits(false);
state std::vector<Future<Void>> watches;

View File

@ -152,6 +152,7 @@ public:
databasePingDelay = g_network->isSimulated() ? 0.0 : 15.0;
runConsistencyCheck = g_network->isSimulated();
runConsistencyCheckOnCache = false;
runConsistencyCheckOnTSS = true;
waitForQuiescenceBegin = true;
waitForQuiescenceEnd = true;
simCheckRelocationDuration = false;
@ -167,8 +168,8 @@ public:
double databasePingDelay = -1.0)
: title(title), dumpAfterTest(dump), clearAfterTest(clear), startDelay(startDelay), useDB(useDB), timeout(600),
databasePingDelay(databasePingDelay), runConsistencyCheck(g_network->isSimulated()),
runConsistencyCheckOnCache(false), runConsistencyCheckOnTSS(false), waitForQuiescenceBegin(true),
waitForQuiescenceEnd(true), simCheckRelocationDuration(false), simConnectionFailuresDisableDuration(0),
simBackupAgents(ISimulator::BackupAgentType::NoBackupAgents),
simDrAgents(ISimulator::BackupAgentType::NoBackupAgents) {
phases = TestWorkload::SETUP | TestWorkload::EXECUTION | TestWorkload::CHECK | TestWorkload::METRICS;
@ -187,6 +188,7 @@ public:
double databasePingDelay;
bool runConsistencyCheck;
bool runConsistencyCheckOnCache;
bool runConsistencyCheckOnTSS;
bool waitForQuiescenceBegin;
bool waitForQuiescenceEnd;

View File

@ -101,6 +101,11 @@ void Arena::dependsOn(const Arena& p) {
}
}
}
void* Arena::allocate4kAlignedBuffer(uint32_t size) {
return ArenaBlock::dependOn4kAlignedBuffer(impl, size);
}
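// A minimal usage sketch (illustrative only, not part of this change): the
// returned buffer is 4k-aligned and owned by the arena, so it stays valid until
// the arena is destroyed and must not be freed by the caller.
//   Arena arena;
//   void* page = arena.allocate4kAlignedBuffer(4096);
//   memset(page, 0, 4096); // valid for the lifetime of 'arena'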
size_t Arena::getSize() const {
if (impl) {
allowAccess(impl.getPtr());
@ -172,9 +177,13 @@ size_t ArenaBlock::totalSize() {
while (o) {
ArenaBlockRef* r = (ArenaBlockRef*)((char*)getData() + o);
makeDefined(r, sizeof(ArenaBlockRef));
if (r->aligned4kBufferSize != 0) {
s += r->aligned4kBufferSize;
} else {
allowAccess(r->next);
s += r->next->totalSize();
disallowAccess(r->next);
}
o = r->nextBlockOffset;
makeNoAccess(r, sizeof(ArenaBlockRef));
}
@ -190,7 +199,12 @@ void ArenaBlock::getUniqueBlocks(std::set<ArenaBlock*>& a) {
while (o) {
ArenaBlockRef* r = (ArenaBlockRef*)((char*)getData() + o);
makeDefined(r, sizeof(ArenaBlockRef));
// If next is valid, recursively count its blocks
if (r->aligned4kBufferSize == 0) {
r->next->getUniqueBlocks(a);
}
o = r->nextBlockOffset;
makeNoAccess(r, sizeof(ArenaBlockRef));
}
@ -212,6 +226,7 @@ int ArenaBlock::addUsed(int bytes) {
void ArenaBlock::makeReference(ArenaBlock* next) {
ArenaBlockRef* r = (ArenaBlockRef*)((char*)getData() + bigUsed);
makeDefined(r, sizeof(ArenaBlockRef));
r->aligned4kBufferSize = 0;
r->next = next;
r->nextBlockOffset = nextBlockOffset;
makeNoAccess(r, sizeof(ArenaBlockRef));
@ -219,6 +234,20 @@ void ArenaBlock::makeReference(ArenaBlock* next) {
bigUsed += sizeof(ArenaBlockRef);
}
void* ArenaBlock::make4kAlignedBuffer(uint32_t size) {
ArenaBlockRef* r = (ArenaBlockRef*)((char*)getData() + bigUsed);
makeDefined(r, sizeof(ArenaBlockRef));
r->aligned4kBufferSize = size;
r->aligned4kBuffer = allocateFast4kAligned(size);
// printf("Arena::aligned4kBuffer alloc size=%u ptr=%p\n", size, r->aligned4kBuffer);
r->nextBlockOffset = nextBlockOffset;
auto result = r->aligned4kBuffer;
makeNoAccess(r, sizeof(ArenaBlockRef));
nextBlockOffset = bigUsed;
bigUsed += sizeof(ArenaBlockRef);
return result;
}
void ArenaBlock::dependOn(Reference<ArenaBlock>& self, ArenaBlock* other) {
other->addref();
if (!self || self->isTiny() || self->unused() < sizeof(ArenaBlockRef))
@ -227,6 +256,14 @@ void ArenaBlock::dependOn(Reference<ArenaBlock>& self, ArenaBlock* other) {
self->makeReference(other);
}
void* ArenaBlock::dependOn4kAlignedBuffer(Reference<ArenaBlock>& self, uint32_t size) {
if (!self || self->isTiny() || self->unused() < sizeof(ArenaBlockRef)) {
return create(SMALL, self)->make4kAlignedBuffer(size);
} else {
return self->make4kAlignedBuffer(size);
}
}
void* ArenaBlock::allocate(Reference<ArenaBlock>& self, int bytes) {
ArenaBlock* b = self.getPtr();
allowAccess(b);
@ -359,10 +396,18 @@ void ArenaBlock::destroy() {
while (o) {
ArenaBlockRef* br = (ArenaBlockRef*)((char*)b->getData() + o);
makeDefined(br, sizeof(ArenaBlockRef));
// If aligned4kBuffer is valid, free it
if (br->aligned4kBufferSize != 0) {
// printf("Arena::aligned4kBuffer free %p\n", br->aligned4kBuffer);
freeFast4kAligned(br->aligned4kBufferSize, br->aligned4kBuffer);
} else {
allowAccess(br->next);
if (br->next->delref_no_destroy())
stack.push_back(stackArena, br->next);
disallowAccess(br->next);
}
o = br->nextBlockOffset;
}
}

View File

@ -102,6 +102,7 @@ public:
Arena& operator=(Arena&&) noexcept;
void dependsOn(const Arena& p);
void* allocate4kAlignedBuffer(uint32_t size);
size_t getSize() const;
bool hasFree(size_t size, const void* address);
@ -129,7 +130,15 @@ struct scalar_traits<Arena> : std::true_type {
};
struct ArenaBlockRef {
union {
ArenaBlock* next;
void* aligned4kBuffer;
};
// Only one of (next, aligned4kBuffer) is valid at any one time, as they occupy the same space.
// If aligned4kBufferSize is not 0, aligned4kBuffer is valid, otherwise next is valid.
uint32_t aligned4kBufferSize;
uint32_t nextBlockOffset;
};
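// e.g. (mirroring the traversals in Arena.cpp): check the discriminant before
// touching either union member:
//   if (r->aligned4kBufferSize != 0) { /* use r->aligned4kBuffer */ }
//   else { /* follow r->next */ }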
@ -160,7 +169,9 @@ struct ArenaBlock : NonCopyable, ThreadSafeReferenceCounted<ArenaBlock> {
void getUniqueBlocks(std::set<ArenaBlock*>& a);
int addUsed(int bytes);
void makeReference(ArenaBlock* next);
void* make4kAlignedBuffer(uint32_t size);
static void dependOn(Reference<ArenaBlock>& self, ArenaBlock* other);
static void* dependOn4kAlignedBuffer(Reference<ArenaBlock>& self, uint32_t size);
static void* allocate(Reference<ArenaBlock>& self, int bytes);
// Return an appropriately-sized ArenaBlock to store the given data
static ArenaBlock* create(int dataSize, Reference<ArenaBlock>& next);

View File

@ -266,4 +266,26 @@ inline void freeFast(int size, void* ptr) {
delete[](uint8_t*) ptr;
}
[[nodiscard]] inline void* allocateFast4kAligned(int size) {
// Use FastAllocator for sizes it supports to avoid internal fragmentation in some implementations of aligned_alloc
if (size <= 4096)
return FastAllocator<4096>::allocate();
if (size <= 8192)
return FastAllocator<8192>::allocate();
if (size <= 16384)
return FastAllocator<16384>::allocate();
return aligned_alloc(4096, size);
}
inline void freeFast4kAligned(int size, void* ptr) {
// Sizes supported by FastAllocator must be released via FastAllocator
if (size <= 4096)
return FastAllocator<4096>::release(ptr);
if (size <= 8192)
return FastAllocator<8192>::release(ptr);
if (size <= 16384)
return FastAllocator<16384>::release(ptr);
aligned_free(ptr);
}
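// Usage sketch (illustrative only): the same size must be passed to
// freeFast4kAligned that was passed to allocateFast4kAligned, so buffers served
// by a FastAllocator magazine are returned to it rather than to aligned_free.
//   void* buf = allocateFast4kAligned(8192); // served by FastAllocator<8192>
//   /* ... use buf ... */
//   freeFast4kAligned(8192, buf);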
#endif

View File

@ -234,6 +234,7 @@ void FlowKnobs::initialize(bool randomize, bool isSimulated) {
init( BASIC_LOAD_BALANCE_MIN_CPU, 0.05 ); //do not adjust LB probabilities if the proxies are less than 5% utilized
init( BASIC_LOAD_BALANCE_BUCKETS, 40 ); //proxies bin recent GRV requests into 40 time bins
init( BASIC_LOAD_BALANCE_COMPUTE_PRECISION, 10000 ); //determines how much of the LB usage is holding the CPU usage of the proxy
init( LOAD_BALANCE_TSS_TIMEOUT, 5.0 );
// Health Monitor
init( FAILURE_DETECTION_DELAY, 4.0 ); if( randomize && BUGGIFY ) FAILURE_DETECTION_DELAY = 1.0;

View File

@ -250,6 +250,7 @@ public:
int BASIC_LOAD_BALANCE_COMPUTE_PRECISION;
double BASIC_LOAD_BALANCE_MIN_REQUESTS;
double BASIC_LOAD_BALANCE_MIN_CPU;
double LOAD_BALANCE_TSS_TIMEOUT;
// Health Monitor
int FAILURE_DETECTION_DELAY;

View File

@ -121,7 +121,7 @@ public: // introduced features
PROTOCOL_VERSION_FEATURE(0x0FDB00B062010001LL, CloseUnusedConnection);
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, DBCoreState);
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, TagThrottleValue);
PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, ServerListValue);
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, StorageCacheValue);
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, RestoreStatusValue);
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, RestoreRequestValue);
@ -138,6 +138,7 @@ public: // introduced features
PROTOCOL_VERSION_FEATURE(0x0FDB00B070010000LL, StableInterfaces);
PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, TagThrottleValueReason);
PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, SpanContext);
PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, TSS);
};
template <>

View File

@ -74,6 +74,7 @@ ERROR( disk_adapter_reset, 1050, "The disk queue adapter reset" )
ERROR( batch_transaction_throttled, 1051, "Batch GRV request rate limit exceeded")
ERROR( dd_cancelled, 1052, "Data distribution components cancelled")
ERROR( dd_not_found, 1053, "Data distributor not found")
ERROR( wrong_connection_file, 1054, "Connection file mismatch")
ERROR( broken_promise, 1100, "Broken promise" )
ERROR( operation_cancelled, 1101, "Asynchronous operation cancelled" )

View File

@ -674,6 +674,8 @@ public:
bool isValid() const { return sav != 0; }
bool isReady() const { return sav->isSet(); }
bool isError() const { return sav->isError(); }
// returns true if get can be called on this future (counterpart of canBeSet on Promises)
bool canGet() const { return isValid() && isReady() && !isError(); }
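// e.g. (hypothetical): if (f.canGet()) { auto v = f.get(); ... } avoids calling
// get() on a future that is still pending or holds an error.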
Error& getError() const {
ASSERT(isError());
return sav->error_state;

View File

@ -697,6 +697,16 @@ private:
AsyncVar<Void> v;
};
// Binds an AsyncTrigger object to an AsyncVar, so when the AsyncVar changes
// the AsyncTrigger is triggered.
ACTOR template <class T>
void forward(Reference<AsyncVar<T>> from, AsyncTrigger* to) {
loop {
wait(from->onChange());
to->trigger();
}
}
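// A minimal usage sketch (hypothetical names; the trigger must outlive the
// forward() actor, which runs detached for as long as 'from' exists):
//   Reference<AsyncVar<int>> var = makeReference<AsyncVar<int>>(0);
//   AsyncTrigger changed;
//   forward(var, &changed);
//   wait(changed.onTrigger()); // fires on each var->set() that changes the value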
class Debouncer : NonCopyable {
public:
explicit Debouncer(double delay) { worker = debounceWorker(this, delay); }
@ -1334,6 +1344,14 @@ struct FlowLock : NonCopyable, public ReferenceCounted<FlowLock> {
int64_t activePermits() const { return active; }
int waiters() const { return takers.size(); }
// Try to send error to all current and future waiters
// Only works if broken_on_destruct.canBeSet()
void kill(Error e = broken_promise()) {
if (broken_on_destruct.canBeSet()) {
broken_on_destruct.sendError(e);
}
}
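// Usage sketch (illustrative only): during shutdown, lock.kill() fails all
// blocked and future take() calls with broken_promise (or the supplied error)
// instead of leaving them waiting forever.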
private:
std::list<std::pair<Promise<Void>, int64_t>> takers;
const int64_t permits;
@ -1891,6 +1909,59 @@ Future<U> operator>>(Future<T> const& lhs, Future<U> const& rhs) {
return runAfter(lhs, rhs);
}
// A weak reference type to wrap a future Reference<T> object.
// Once the future is complete, this object holds a pointer to the referenced object but does
// not contribute to its reference count.
//
// WARNING: this class will not be aware when the underlying object is destroyed. It is up to the
// user to make sure that an UnsafeWeakFutureReference is discarded at the same time the object is.
template <class T>
class UnsafeWeakFutureReference {
public:
UnsafeWeakFutureReference() {}
UnsafeWeakFutureReference(Future<Reference<T>> future) : data(new UnsafeWeakFutureReferenceData(future)) {}
// Returns a future to obtain a normal reference handle
// If the future is ready, this creates a Reference<T> to wrap the object
Future<Reference<T>> get() {
if (!data) {
return Reference<T>();
} else if (data->ptr.present()) {
return Reference<T>::addRef(data->ptr.get());
} else {
return data->future;
}
}
// Returns the raw pointer, if the object is ready
// Note: this should be used with care, as this pointer is not counted as a reference to the object and
// it could be deleted if all normal references are destroyed.
Optional<T*> getPtrIfReady() { return data->ptr; }
private:
// A class to hold the state for an UnsafeWeakFutureReference
struct UnsafeWeakFutureReferenceData : public ReferenceCounted<UnsafeWeakFutureReferenceData>, NonCopyable {
Optional<T*> ptr;
Future<Reference<T>> future;
Future<Void> moveResultFuture;
UnsafeWeakFutureReferenceData(Future<Reference<T>> future) : future(future) {
moveResultFuture = moveResult(this);
}
// Waits for the future to complete and then stores the pointer in local storage
// When this completes, we will no longer be counted toward the reference count of the object
ACTOR Future<Void> moveResult(UnsafeWeakFutureReferenceData* self) {
Reference<T> result = wait(self->future);
self->ptr = result.getPtr();
self->future = Future<Reference<T>>();
return Void();
}
};
Reference<UnsafeWeakFutureReferenceData> data;
};
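// A minimal usage sketch (hypothetical type Foo and factory makeFooAsync()):
//   UnsafeWeakFutureReference<Foo> weak(makeFooAsync()); // Future<Reference<Foo>>
//   Reference<Foo> strong = wait(weak.get()); // counted handle while in scope
//   Optional<Foo*> raw = weak.getPtrIfReady(); // uncounted; see WARNING above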
#include "flow/unactorcompiler.h"
#endif

View File

@ -87,7 +87,9 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES SlowTask.txt IGNORE)
add_fdb_test(TEST_FILES SpecificUnitTest.txt IGNORE)
add_fdb_test(TEST_FILES StorageMetricsSampleTests.txt IGNORE)
add_fdb_test(TEST_FILES StorageServerInterface.txt)
add_fdb_test(TEST_FILES StreamingWrite.txt IGNORE)
add_fdb_test(TEST_FILES SystemData.txt)
add_fdb_test(TEST_FILES ThreadSafety.txt IGNORE)
add_fdb_test(TEST_FILES TraceEventMetrics.txt IGNORE)
add_fdb_test(TEST_FILES PopulateTPCC.txt IGNORE)
@ -186,17 +188,17 @@ if(WITH_PYTHON)
TEST_FILES restarting/from_5.0.0/StorefrontTestRestart-1.txt
restarting/from_5.0.0/StorefrontTestRestart-2.txt)
add_fdb_test(
TEST_FILES restarting/from_6.2.33/SnapTestAttrition-1.txt
restarting/from_6.2.33/SnapTestAttrition-2.txt)
add_fdb_test(
TEST_FILES restarting/from_6.2.33/SnapTestSimpleRestart-1.txt
restarting/from_6.2.33/SnapTestSimpleRestart-2.txt)
add_fdb_test(
TEST_FILES restarting/from_6.2.33/SnapTestRestart-1.txt
restarting/from_6.2.33/SnapTestRestart-2.txt)
add_fdb_test(
TEST_FILES restarting/from_6.2.33/SnapCycleRestart-1.txt
restarting/from_6.2.33/SnapCycleRestart-2.txt)
add_fdb_test(
TEST_FILES restarting/from_5.1.7/DrUpgradeRestart-1.txt
restarting/from_5.1.7/DrUpgradeRestart-2.txt)

View File

@ -0,0 +1,7 @@
testTitle=UnitTests
startDelay=0
useDB=false
testName=UnitTests
maxTestCases=0
testsMatching=/StorageServerInterface/

7
tests/SystemData.txt Normal file
View File

@ -0,0 +1,7 @@
testTitle=UnitTests
startDelay=0
useDB=false
testName=UnitTests
maxTestCases=0
testsMatching=/SystemData/

View File

@ -1,3 +1,4 @@
[configuration]
extraDB = 1
[[test]]

View File

@ -1,3 +1,4 @@
[configuration]
extraDB = 1
[[test]]

View File

@ -1,3 +1,4 @@
[configuration]
extraDB = 1
[[test]]

View File

@ -1,4 +1,5 @@
[configuration]
configureLocked = true
[[test]]
testTitle = 'ConfigureLocked'

Some files were not shown because too many files have changed in this diff.