Merge branch 'master' of github.com:apple/foundationdb into ipc-bench

This commit is contained in:
Chaoguang Lin 2021-08-19 21:40:27 +00:00
commit 3e34769c6d
158 changed files with 6341 additions and 2475 deletions

2
.gitignore vendored
View File

@ -8,7 +8,7 @@ bindings/java/foundationdb-client*.jar
bindings/java/foundationdb-tests*.jar
bindings/java/fdb-java-*-sources.jar
packaging/msi/FDBInstaller.msi
builds/
build/
cmake-build-debug/
# Generated source, build, and packaging files
*.g.cpp

View File

@ -157,6 +157,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")
endif()
include(CompileBoost)
include(GetMsgpack)
add_subdirectory(flow)
add_subdirectory(fdbrpc)
add_subdirectory(fdbclient)
@ -171,7 +172,7 @@ add_subdirectory(fdbbackup)
add_subdirectory(contrib)
add_subdirectory(tests)
add_subdirectory(flowbench EXCLUDE_FROM_ALL)
if(WITH_PYTHON)
if(WITH_PYTHON AND WITH_C_BINDING)
add_subdirectory(bindings)
endif()
if(WITH_DOCUMENTATION)

View File

@ -3,14 +3,16 @@ if(NOT OPEN_FOR_IDE)
add_subdirectory(c)
add_subdirectory(flow)
endif()
add_subdirectory(python)
if(WITH_JAVA)
if(WITH_PYTHON_BINDING)
add_subdirectory(python)
endif()
if(WITH_JAVA_BINDING)
add_subdirectory(java)
endif()
if(WITH_GO AND NOT OPEN_FOR_IDE)
if(WITH_GO_BINDING AND NOT OPEN_FOR_IDE)
add_subdirectory(go)
endif()
if(WITH_RUBY)
if(WITH_RUBY_BINDING)
add_subdirectory(ruby)
endif()
if(NOT WIN32 AND NOT OPEN_FOR_IDE)

View File

@ -189,10 +189,19 @@ int cleanup(FDBTransaction* transaction, mako_args_t* args) {
free(prefixstr);
len += 1;
retryTxn:
clock_gettime(CLOCK_MONOTONIC_COARSE, &timer_start);
fdb_transaction_clear_range(transaction, (uint8_t*)beginstr, len + 1, (uint8_t*)endstr, len + 1);
if (commit_transaction(transaction) != FDB_SUCCESS)
goto failExit;
switch (commit_transaction(transaction)) {
case (FDB_SUCCESS):
break;
case (FDB_ERROR_RETRY):
fdb_transaction_reset(transaction);
goto retryTxn;
default:
goto failExit;
}
fdb_transaction_reset(transaction);
clock_gettime(CLOCK_MONOTONIC_COARSE, &timer_end);
@ -308,11 +317,19 @@ int populate(FDBTransaction* transaction,
/* commit every 100 inserts (default) */
if (i % args->txnspec.ops[OP_INSERT][OP_COUNT] == 0) {
retryTxn:
if (stats->xacts % args->sampling == 0) {
clock_gettime(CLOCK_MONOTONIC, &timer_start_commit);
}
if (commit_transaction(transaction) != FDB_SUCCESS)
goto failExit;
switch (commit_transaction(transaction)) {
case (FDB_SUCCESS):
break;
case (FDB_ERROR_RETRY):
goto retryTxn;
default:
goto failExit;
}
/* xact latency stats */
if (stats->xacts % args->sampling == 0) {
@ -337,20 +354,36 @@ int populate(FDBTransaction* transaction,
xacts++; /* for throttling */
}
}
if (stats->xacts % args->sampling == 0) {
clock_gettime(CLOCK_MONOTONIC, &timer_start_commit);
}
if (commit_transaction(transaction) != FDB_SUCCESS)
goto failExit;
/* xact latency stats */
if (stats->xacts % args->sampling == 0) {
clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end);
update_op_lat_stats(
&timer_start_commit, &timer_per_xact_end, OP_COMMIT, stats, block, elem_size, is_memory_allocated);
update_op_lat_stats(
&timer_per_xact_start, &timer_per_xact_end, OP_TRANSACTION, stats, block, elem_size, is_memory_allocated);
time_t start_time_sec, current_time_sec;
time(&start_time_sec);
int is_committed = false;
// will hit FDB_ERROR_RETRY if running mako with multi-version client
while (!is_committed) {
if (stats->xacts % args->sampling == 0) {
clock_gettime(CLOCK_MONOTONIC, &timer_start_commit);
}
int rc;
if ((rc = commit_transaction(transaction) != FDB_SUCCESS)) {
if (rc == FDB_ERROR_RETRY) {
time(&current_time_sec);
if (difftime(current_time_sec, start_time_sec) > 5) {
goto failExit;
} else {
continue;
}
} else {
goto failExit;
}
}
is_committed = true;
/* xact latency stats */
if (stats->xacts % args->sampling == 0) {
clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end);
update_op_lat_stats(
&timer_start_commit, &timer_per_xact_end, OP_COMMIT, stats, block, elem_size, is_memory_allocated);
update_op_lat_stats(
&timer_per_xact_start, &timer_per_xact_end, OP_TRANSACTION, stats, block, elem_size, is_memory_allocated);
}
}
clock_gettime(CLOCK_MONOTONIC, &timer_end);

View File

@ -77,19 +77,37 @@ add_dependencies(packages python_package)
if (NOT WIN32 AND NOT OPEN_FOR_IDE)
add_fdbclient_test(
NAME fdbcli_tests
NAME single_process_fdbcli_tests
COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py
${CMAKE_BINARY_DIR}/bin/fdbcli
${CMAKE_BINARY_DIR}
@CLUSTER_FILE@
1
)
add_fdbclient_test(
NAME multi_process_fdbcli_tests
PROCESS_NUMBER 5
TEST_TIMEOUT 120 # The test can take near to 1 minutes sometime, set timeout to 2 minutes to be safe
COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py
${CMAKE_BINARY_DIR}/bin/fdbcli
${CMAKE_BINARY_DIR}
@CLUSTER_FILE@
5
)
if (TARGET external_client) # external_client copies fdb_c to bindings/c/libfdb_c.so
add_fdbclient_test(
NAME single_process_external_client_fdbcli_tests
COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py
${CMAKE_BINARY_DIR}
@CLUSTER_FILE@
--external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c.so
)
add_fdbclient_test(
NAME multi_process_external_client_fdbcli_tests
PROCESS_NUMBER 5
TEST_TIMEOUT 120 # The test can take near to 1 minutes sometime, set timeout to 2 minutes to be safe
COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py
${CMAKE_BINARY_DIR}
@CLUSTER_FILE@
5
--external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c.so
)
endif()
endif()

View File

@ -1,14 +1,17 @@
#!/usr/bin/env python3
import sys
import os
import subprocess
import logging
import functools
import json
import time
import random
from argparse import ArgumentParser, RawDescriptionHelpFormatter
def enable_logging(level=logging.ERROR):
def enable_logging(level=logging.DEBUG):
"""Enable logging in the function with the specified logging level
Args:
@ -16,7 +19,7 @@ def enable_logging(level=logging.ERROR):
"""
def func_decorator(func):
@functools.wraps(func)
def wrapper(*args,**kwargs):
def wrapper(*args, **kwargs):
# initialize logger
logger = logging.getLogger(func.__name__)
logger.setLevel(level)
@ -32,6 +35,7 @@ def enable_logging(level=logging.ERROR):
return wrapper
return func_decorator
def run_fdbcli_command(*args):
"""run the fdbcli statement: fdbcli --exec '<arg1> <arg2> ... <argN>'.
@ -39,7 +43,8 @@ def run_fdbcli_command(*args):
string: Console output from fdbcli
"""
commands = command_template + ["{}".format(' '.join(args))]
return subprocess.run(commands, stdout=subprocess.PIPE).stdout.decode('utf-8').strip()
return subprocess.run(commands, stdout=subprocess.PIPE, env=fdbcli_env).stdout.decode('utf-8').strip()
def run_fdbcli_command_and_get_error(*args):
"""run the fdbcli statement: fdbcli --exec '<arg1> <arg2> ... <argN>'.
@ -48,7 +53,8 @@ def run_fdbcli_command_and_get_error(*args):
string: Stderr output from fdbcli
"""
commands = command_template + ["{}".format(' '.join(args))]
return subprocess.run(commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE).stderr.decode('utf-8').strip()
return subprocess.run(commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=fdbcli_env).stderr.decode('utf-8').strip()
@enable_logging()
def advanceversion(logger):
@ -72,6 +78,7 @@ def advanceversion(logger):
logger.debug("Read version: {}".format(version4))
assert version4 >= version3
@enable_logging()
def maintenance(logger):
# expected fdbcli output when running 'maintenance' while there's no ongoing maintenance
@ -94,6 +101,7 @@ def maintenance(logger):
output3 = run_fdbcli_command('maintenance')
assert output3 == no_maintenance_output
@enable_logging()
def setclass(logger):
output1 = run_fdbcli_command('setclass')
@ -108,11 +116,11 @@ def setclass(logger):
# check class source
assert 'command_line' in class_type_line_1
# set class to a random valid type
class_types = ['storage', 'storage', 'transaction', 'resolution',
'commit_proxy', 'grv_proxy', 'master', 'stateless', 'log',
'router', 'cluster_controller', 'fast_restore', 'data_distributor',
'coordinator', 'ratekeeper', 'storage_cache', 'backup'
]
class_types = ['storage', 'storage', 'transaction', 'resolution',
'commit_proxy', 'grv_proxy', 'master', 'stateless', 'log',
'router', 'cluster_controller', 'fast_restore', 'data_distributor',
'coordinator', 'ratekeeper', 'storage_cache', 'backup'
]
random_class_type = random.choice(class_types)
logger.debug("Change to type: {}".format(random_class_type))
run_fdbcli_command('setclass', network_address, random_class_type)
@ -134,6 +142,7 @@ def setclass(logger):
logger.debug(class_type_line_3)
assert class_type_line_3 == class_type_line_1
@enable_logging()
def lockAndUnlock(logger):
# lock an unlocked database, should be successful
@ -148,7 +157,7 @@ def lockAndUnlock(logger):
output2 = run_fdbcli_command_and_get_error("lock")
assert output2 == 'ERROR: Database is locked (1038)'
# unlock the database
process = subprocess.Popen(command_template + ['unlock ' + lock_uid], stdin = subprocess.PIPE, stdout = subprocess.PIPE)
process = subprocess.Popen(command_template + ['unlock ' + lock_uid], stdin=subprocess.PIPE, stdout=subprocess.PIPE, env=fdbcli_env)
line1 = process.stdout.readline()
# The randome passphrease we need to confirm to proceed the unlocking
line2 = process.stdout.readline()
@ -159,6 +168,7 @@ def lockAndUnlock(logger):
assert output3.decode('utf-8').strip() == 'Database unlocked.'
assert not get_value_from_status_json(True, 'cluster', 'database_lock_state', 'locked')
@enable_logging()
def kill(logger):
output1 = run_fdbcli_command('kill')
@ -168,11 +178,11 @@ def kill(logger):
address = lines[1]
logger.debug("Address: {}".format(address))
old_generation = get_value_from_status_json(False, 'cluster', 'generation')
# This is currently an issue with fdbcli,
# where you need to first run 'kill' to initialize processes' list
# This is currently an issue with fdbcli,
# where you need to first run 'kill' to initialize processes' list
# and then specify the certain process to kill
process = subprocess.Popen(command_template[:-1], stdin = subprocess.PIPE, stdout = subprocess.PIPE)
#
process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, env=fdbcli_env)
#
output2, err = process.communicate(input='kill; kill {}\n'.format(address).encode())
logger.debug(output2)
# wait for a second for the cluster recovery
@ -181,6 +191,7 @@ def kill(logger):
logger.debug("Old: {}, New: {}".format(old_generation, new_generation))
assert new_generation > old_generation
@enable_logging()
def suspend(logger):
output1 = run_fdbcli_command('suspend')
@ -200,7 +211,7 @@ def suspend(logger):
assert len(pinfo) == 1
pid = pinfo[0].split(' ')[0]
logger.debug("Pid: {}".format(pid))
process = subprocess.Popen(command_template[:-1], stdin = subprocess.PIPE, stdout = subprocess.PIPE)
process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, env=fdbcli_env)
# suspend the process for enough long time
output2, err = process.communicate(input='suspend; suspend 3600 {}\n'.format(address).encode())
# the cluster should be unavailable after the only process being suspended
@ -213,7 +224,7 @@ def suspend(logger):
kill_output = subprocess.check_output(['kill', pid]).decode().strip()
logger.debug("Kill result: {}".format(kill_output))
# The process should come back after a few time
duration = 0 # seconds we already wait
duration = 0 # seconds we already wait
while not get_value_from_status_json(False, 'client', 'database_status', 'available') and duration < 60:
logger.debug("Sleep for 1 second to wait cluster recovery")
time.sleep(1)
@ -221,6 +232,7 @@ def suspend(logger):
# at most after 60 seconds, the cluster should be available
assert get_value_from_status_json(False, 'client', 'database_status', 'available')
def get_value_from_status_json(retry, *args):
while True:
result = json.loads(run_fdbcli_command('status', 'json'))
@ -229,9 +241,10 @@ def get_value_from_status_json(retry, *args):
for arg in args:
assert arg in result
result = result[arg]
return result
@enable_logging()
def consistencycheck(logger):
consistency_check_on_output = 'ConsistencyCheck is on'
@ -245,6 +258,7 @@ def consistencycheck(logger):
output3 = run_fdbcli_command('consistencycheck')
assert output3 == consistency_check_on_output
@enable_logging()
def cache_range(logger):
# this command is currently experimental
@ -252,6 +266,7 @@ def cache_range(logger):
run_fdbcli_command('cache_range', 'set', 'a', 'b')
run_fdbcli_command('cache_range', 'clear', 'a', 'b')
@enable_logging()
def datadistribution(logger):
output1 = run_fdbcli_command('datadistribution', 'off')
@ -271,6 +286,7 @@ def datadistribution(logger):
assert output6 == 'Data distribution is enabled for rebalance.'
time.sleep(1)
@enable_logging()
def transaction(logger):
"""This test will cover the transaction related fdbcli commands.
@ -280,7 +296,7 @@ def transaction(logger):
"""
err1 = run_fdbcli_command_and_get_error('set', 'key', 'value')
assert err1 == 'ERROR: writemode must be enabled to set or clear keys in the database.'
process = subprocess.Popen(command_template[:-1], stdin = subprocess.PIPE, stdout = subprocess.PIPE)
process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, env=fdbcli_env)
transaction_flow = ['writemode on', 'begin', 'getversion', 'set key value', 'get key', 'commit']
output1, _ = process.communicate(input='\n'.join(transaction_flow).encode())
# split the output into lines
@ -299,13 +315,13 @@ def transaction(logger):
output2 = run_fdbcli_command('get', 'key')
assert output2 == "`key' is `value'"
# test rollback and read-your-write behavior
process = subprocess.Popen(command_template[:-1], stdin = subprocess.PIPE, stdout = subprocess.PIPE)
process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, env=fdbcli_env)
transaction_flow = [
'writemode on', 'begin', 'getrange a z',
'writemode on', 'begin', 'getrange a z',
'clear key', 'get key',
# 'option on READ_YOUR_WRITES_DISABLE', 'get key',
'rollback'
]
]
output3, _ = process.communicate(input='\n'.join(transaction_flow).encode())
lines = list(filter(len, output3.decode().split('\n')))[-5:]
# lines[0] == "Transaction started" and lines[1] == 'Range limited to 25 keys'
@ -316,13 +332,13 @@ def transaction(logger):
output4 = run_fdbcli_command('get', 'key')
assert output4 == "`key' is `value'"
# test read_your_write_disable option and clear the inserted key
process = subprocess.Popen(command_template[:-1], stdin = subprocess.PIPE, stdout = subprocess.PIPE)
process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, env=fdbcli_env)
transaction_flow = [
'writemode on', 'begin',
'option on READ_YOUR_WRITES_DISABLE',
'clear key', 'get key',
'commit'
]
]
output6, _ = process.communicate(input='\n'.join(transaction_flow).encode())
lines = list(filter(len, output6.decode().split('\n')))[-4:]
assert lines[1] == 'Option enabled for current transaction'
@ -332,15 +348,18 @@ def transaction(logger):
output7 = run_fdbcli_command('get', 'key')
assert output7 == "`key': not found"
def get_fdb_process_addresses():
def get_fdb_process_addresses(logger):
# get all processes' network addresses
output = run_fdbcli_command('kill')
logger.debug(output)
# except the first line, each line is one process
addresses = output.split('\n')[1:]
assert len(addresses) == process_number
assert len(addresses) == args.process_number
return addresses
@enable_logging()
@enable_logging(logging.DEBUG)
def coordinators(logger):
# we should only have one coordinator for now
output1 = run_fdbcli_command('coordinators')
@ -354,7 +373,7 @@ def coordinators(logger):
assert coordinator_list[0]['address'] == coordinators
# verify the cluster description
assert get_value_from_status_json(True, 'cluster', 'connection_string').startswith('{}:'.format(cluster_description))
addresses = get_fdb_process_addresses()
addresses = get_fdb_process_addresses(logger)
# set all 5 processes as coordinators and update the cluster description
new_cluster_description = 'a_simple_description'
run_fdbcli_command('coordinators', *addresses, 'description={}'.format(new_cluster_description))
@ -365,11 +384,13 @@ def coordinators(logger):
# auto change should go back to 1 coordinator
run_fdbcli_command('coordinators', 'auto')
assert len(get_value_from_status_json(True, 'client', 'coordinators', 'coordinators')) == 1
wait_for_database_available(logger)
@enable_logging()
@enable_logging(logging.DEBUG)
def exclude(logger):
# get all processes' network addresses
addresses = get_fdb_process_addresses()
addresses = get_fdb_process_addresses(logger)
logger.debug("Cluster processes: {}".format(' '.join(addresses)))
# There should be no excluded process for now
no_excluded_process_output = 'There are currently no servers or localities excluded from the database.'
@ -377,16 +398,28 @@ def exclude(logger):
assert no_excluded_process_output in output1
# randomly pick one and exclude the process
excluded_address = random.choice(addresses)
# If we see "not enough space" error, use FORCE option to proceed
# this should be a safe operation as we do not need any storage space for the test
force = False
# sometimes we need to retry the exclude
while True:
logger.debug("Excluding process: {}".format(excluded_address))
error_message = run_fdbcli_command_and_get_error('exclude', excluded_address)
if force:
error_message = run_fdbcli_command_and_get_error('exclude', 'FORCE', excluded_address)
else:
error_message = run_fdbcli_command_and_get_error('exclude', excluded_address)
if error_message == 'WARNING: {} is a coordinator!'.format(excluded_address):
# exclude coordinator will print the warning, verify the randomly selected process is the coordinator
coordinator_list = get_value_from_status_json(True, 'client', 'coordinators', 'coordinators')
assert len(coordinator_list) == 1
assert coordinator_list[0]['address'] == excluded_address
break
elif 'ERROR: This exclude may cause the total free space in the cluster to drop below 10%.' in error_message:
# exclude the process may cause the free space not enough
# use FORCE option to ignore it and proceed
assert not force
force = True
logger.debug("Use FORCE option to exclude the process")
elif not error_message:
break
else:
@ -400,8 +433,11 @@ def exclude(logger):
# check the include is successful
output4 = run_fdbcli_command('exclude')
assert no_excluded_process_output in output4
wait_for_database_available(logger)
# read the system key 'k', need to enable the option first
def read_system_key(k):
output = run_fdbcli_command('option', 'on', 'READ_SYSTEM_KEYS;', 'get', k)
if 'is' not in output:
@ -410,11 +446,14 @@ def read_system_key(k):
_, value = output.split(' is ')
return value
@enable_logging()
def throttle(logger):
# no throttled tags at the beginning
no_throttle_tags_output = 'There are no throttled tags'
assert run_fdbcli_command('throttle', 'list') == no_throttle_tags_output
output = run_fdbcli_command('throttle', 'list')
logger.debug(output)
assert output == no_throttle_tags_output
# test 'throttle enable auto'
run_fdbcli_command('throttle', 'enable', 'auto')
# verify the change is applied by reading the system key
@ -427,17 +466,46 @@ def throttle(logger):
assert enable_flag == "`0'"
# TODO : test manual throttling, not easy to do now
def wait_for_database_available(logger):
# sometimes the change takes some time to have effect and the database can be unavailable at that time
# this is to wait until the database is available again
while not get_value_from_status_json(True, 'client', 'database_status', 'available'):
logger.debug("Database unavailable for now, wait for one second")
time.sleep(1)
if __name__ == '__main__':
# fdbcli_tests.py <path_to_fdbcli_binary> <path_to_fdb_cluster_file> <process_number>
assert len(sys.argv) == 4, "Please pass arguments: <path_to_fdbcli_binary> <path_to_fdb_cluster_file> <process_number>"
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter,
description="""
The test calls fdbcli commands through fdbcli --exec "<command>" interactively using subprocess.
The outputs from fdbcli are returned and compared to predefined results.
Consequently, changing fdbcli outputs or breaking any commands will casue the test to fail.
Commands that are easy to test will run against a single process cluster.
For complex commands like exclude, they will run against a cluster with multiple(current set to 5) processes.
If external_client_library is given, we will disable the local client and use the external client to run fdbcli.
""")
parser.add_argument('build_dir', metavar='BUILD_DIRECTORY', help='FDB build directory')
parser.add_argument('cluster_file', metavar='CLUSTER_FILE', help='FDB cluster file')
parser.add_argument('process_number', nargs='?', metavar='PROCESS_NUMBER', help="Number of fdb processes", type=int, default=1)
parser.add_argument('--external-client-library', '-e', metavar='EXTERNAL_CLIENT_LIBRARY_PATH', help="External client library path")
args = parser.parse_args()
# keep current environment variables
fdbcli_env = os.environ.copy()
# set external client library if provided
if args.external_client_library:
# disable local client and use the external client library
fdbcli_env['FDB_NETWORK_OPTION_DISABLE_LOCAL_CLIENT'] = ''
fdbcli_env['FDB_NETWORK_OPTION_EXTERNAL_CLIENT_LIBRARY'] = args.external_client_library
# shell command template
command_template = [sys.argv[1], '-C', sys.argv[2], '--exec']
command_template = [args.build_dir + '/bin/fdbcli', '-C', args.cluster_file, '--exec']
# tests for fdbcli commands
# assertions will fail if fdbcli does not work as expected
process_number = int(sys.argv[3])
if process_number == 1:
if args.process_number == 1:
# TODO: disable for now, the change can cause the database unavailable
#advanceversion()
# advanceversion()
cache_range()
consistencycheck()
datadistribution()
@ -449,11 +517,6 @@ if __name__ == '__main__':
transaction()
throttle()
else:
assert process_number > 1, "Process number should be positive"
# the kill command which used to list processes seems to not work as expected sometime
# which makes the test flaky.
# We need to figure out the reason and then re-enable these tests
#coordinators()
#exclude()
assert args.process_number > 1, "Process number should be positive"
coordinators()
exclude()

View File

@ -352,7 +352,7 @@ function(package_bindingtester)
COMMENT "Copy Flow tester for bindingtester")
set(generated_binding_files python/fdb/fdboptions.py)
if(WITH_JAVA)
if(WITH_JAVA_BINDING)
if(NOT FDB_RELEASE)
set(prerelease_string "-PRERELEASE")
else()
@ -369,7 +369,7 @@ function(package_bindingtester)
set(generated_binding_files ${generated_binding_files} java/foundationdb-tests.jar)
endif()
if(WITH_GO AND NOT OPEN_FOR_IDE)
if(WITH_GO_BINDING AND NOT OPEN_FOR_IDE)
add_dependencies(copy_binding_output_files fdb_go_tester fdb_go)
add_custom_command(
TARGET copy_binding_output_files

View File

@ -42,27 +42,11 @@ else()
set(WITH_TLS OFF)
endif()
if(WIN32)
message(STATUS "TLS is temporarilty disabled on macOS while libressl -> openssl transition happens")
message(STATUS "TLS is temporarilty disabled on Windows while libressl -> openssl transition happens")
set(WITH_TLS OFF)
endif()
endif()
################################################################################
# Java Bindings
################################################################################
set(WITH_JAVA OFF)
find_package(JNI 1.8)
find_package(Java 1.8 COMPONENTS Development)
# leave FreeBSD JVM compat for later
if(JNI_FOUND AND Java_FOUND AND Java_Development_FOUND AND NOT (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD"))
set(WITH_JAVA ON)
include(UseJava)
enable_language(Java)
else()
set(WITH_JAVA OFF)
endif()
################################################################################
# Python Bindings
################################################################################
@ -75,12 +59,57 @@ else()
set(WITH_PYTHON OFF)
endif()
option(BUILD_PYTHON_BINDING "build python binding" ON)
if(NOT BUILD_PYTHON_BINDING OR NOT WITH_PYTHON)
set(WITH_PYTHON_BINDING OFF)
else()
if(WITH_PYTHON)
set(WITH_PYTHON_BINDING ON)
else()
#message(FATAL_ERROR "Could not found a suitable python interpreter")
set(WITH_PYTHON_BINDING OFF)
endif()
endif()
################################################################################
# C Bindings
################################################################################
option(BUILD_C_BINDING "build C binding" ON)
if(BUILD_C_BINDING AND WITH_PYTHON)
set(WITH_C_BINDING ON)
else()
set(WITH_C_BINDING OFF)
endif()
################################################################################
# Java Bindings
################################################################################
option(BUILD_JAVA_BINDING "build java binding" ON)
if(NOT BUILD_JAVA_BINDING OR NOT WITH_C_BINDING)
set(WITH_JAVA_BINDING OFF)
else()
set(WITH_JAVA_BINDING OFF)
find_package(JNI 1.8)
find_package(Java 1.8 COMPONENTS Development)
# leave FreeBSD JVM compat for later
if(JNI_FOUND AND Java_FOUND AND Java_Development_FOUND AND NOT (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") AND WITH_C_BINDING)
set(WITH_JAVA_BINDING ON)
include(UseJava)
enable_language(Java)
else()
set(WITH_JAVA_BINDING OFF)
endif()
endif()
################################################################################
# Pip
################################################################################
option(BUILD_DOCUMENTATION "build documentation" ON)
find_package(Python3 COMPONENTS Interpreter)
if (Python3_Interpreter_FOUND)
if (WITH_PYTHON AND Python3_Interpreter_FOUND AND BUILD_DOCUMENTATION)
set(WITH_DOCUMENTATION ON)
else()
set(WITH_DOCUMENTATION OFF)
@ -90,27 +119,37 @@ endif()
# GO
################################################################################
find_program(GO_EXECUTABLE go)
# building the go binaries is currently not supported on Windows
if(GO_EXECUTABLE AND NOT WIN32)
set(WITH_GO ON)
option(BUILD_GO_BINDING "build go binding" ON)
if(NOT BUILD_GO_BINDING OR NOT BUILD_C_BINDING)
set(WITH_GO_BINDING OFF)
else()
set(WITH_GO OFF)
endif()
if (USE_SANITIZER)
# Disable building go for sanitizers, since _stacktester doesn't link properly
set(WITH_GO OFF)
find_program(GO_EXECUTABLE go)
# building the go binaries is currently not supported on Windows
if(GO_EXECUTABLE AND NOT WIN32 AND WITH_C_BINDING)
set(WITH_GO_BINDING ON)
else()
set(WITH_GO_BINDING OFF)
endif()
if (USE_SANITIZER)
# Disable building go for sanitizers, since _stacktester doesn't link properly
set(WITH_GO_BINDING OFF)
endif()
endif()
################################################################################
# Ruby
################################################################################
find_program(GEM_EXECUTABLE gem)
set(WITH_RUBY OFF)
if(GEM_EXECUTABLE)
set(GEM_COMMAND ${RUBY_EXECUTABLE} ${GEM_EXECUTABLE})
set(WITH_RUBY ON)
option(BUILD_RUBY_BINDING "build ruby binding" ON)
if(NOT BUILD_RUBY_BINDING OR NOT BUILD_C_BINDING)
set(WITH_RUBY_BINDING OFF)
else()
find_program(GEM_EXECUTABLE gem)
set(WITH_RUBY_BINDING OFF)
if(GEM_EXECUTABLE AND WITH_C_BINDING)
set(GEM_COMMAND ${RUBY_EXECUTABLE} ${GEM_EXECUTABLE})
set(WITH_RUBY_BINDING ON)
endif()
endif()
################################################################################
@ -160,20 +199,22 @@ function(print_components)
message(STATUS "=========================================")
message(STATUS " Components Build Overview ")
message(STATUS "=========================================")
message(STATUS "Build Java Bindings: ${WITH_JAVA}")
message(STATUS "Build with TLS support: ${WITH_TLS}")
message(STATUS "Build Go bindings: ${WITH_GO}")
message(STATUS "Build Ruby bindings: ${WITH_RUBY}")
message(STATUS "Build Python sdist (make package): ${WITH_PYTHON}")
message(STATUS "Build Documentation (make html): ${WITH_DOCUMENTATION}")
message(STATUS "Build Bindings (depends on Python): ${WITH_PYTHON}")
message(STATUS "Build C Bindings: ${WITH_C_BINDING}")
message(STATUS "Build Python Bindings: ${WITH_PYTHON_BINDING}")
message(STATUS "Build Java Bindings: ${WITH_JAVA_BINDING}")
message(STATUS "Build Go bindings: ${WITH_GO_BINDING}")
message(STATUS "Build Ruby bindings: ${WITH_RUBY_BINDING}")
message(STATUS "Build with TLS support: ${WITH_TLS}")
message(STATUS "Build Documentation (make html): ${WITH_DOCUMENTATION}")
message(STATUS "Build Python sdist (make package): ${WITH_PYTHON_BINDING}")
message(STATUS "Configure CTest (depends on Python): ${WITH_PYTHON}")
message(STATUS "Build with RocksDB: ${WITH_ROCKSDB_EXPERIMENTAL}")
message(STATUS "=========================================")
endfunction()
if(FORCE_ALL_COMPONENTS)
if(NOT WITH_JAVA OR NOT WITH_TLS OR NOT WITH_GO OR NOT WITH_RUBY OR NOT WITH_PYTHON OR NOT WITH_DOCUMENTATION)
if(NOT WITH_C_BINDING OR NOT WITH_JAVA_BINDING OR NOT WITH_TLS OR NOT WITH_GO_BINDING OR NOT WITH_RUBY_BINDING OR NOT WITH_PYTHON_BINDING OR NOT WITH_DOCUMENTATION)
print_components()
message(FATAL_ERROR "FORCE_ALL_COMPONENTS is set but not all dependencies could be found")
endif()

20
cmake/GetMsgpack.cmake Normal file
View File

@ -0,0 +1,20 @@
find_package(msgpack 3.3.0 EXACT QUIET CONFIG)
add_library(msgpack INTERFACE)
if(msgpack_FOUND)
target_link_libraries(msgpack INTERFACE msgpackc-cxx)
else()
include(ExternalProject)
ExternalProject_add(msgpackProject
URL "https://github.com/msgpack/msgpack-c/releases/download/cpp-3.3.0/msgpack-3.3.0.tar.gz"
URL_HASH SHA256=6e114d12a5ddb8cb11f669f83f32246e484a8addd0ce93f274996f1941c1f07b
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
)
ExternalProject_Get_property(msgpackProject SOURCE_DIR)
target_include_directories(msgpack SYSTEM INTERFACE "${SOURCE_DIR}/include")
add_dependencies(msgpack msgpackProject)
endif()

View File

@ -63,7 +63,7 @@ Source IP:port 0 string The IP and port of the machine where the s
Trace ID 1 uint64 The 64-bit identifier of the trace. All spans in a trace share the same trace ID.
Span ID 2 uint64 The 64-bit identifier of the span. All spans have a unique identifier.
Start timestamp 3 double The timestamp when the operation represented by the span began.
End timestamp 4 double The timestamp when the operation represented by the span ended.
Duration 4 double The duration in seconds of the operation represented by the span.
Operation name 5 string The name of the operation the span represents.
Tags 6 map User defined tags, added manually to specify additional information.
Parent span IDs 7 vector (Optional) A list of span IDs representing parents of this span.

View File

@ -506,7 +506,7 @@ ACTOR Future<Void> decode_logs(DecodeParams params) {
wait(process_file(container, logs[idx], uid, params));
idx++;
}
TraceEvent("DecodeDone", uid);
TraceEvent("DecodeDone", uid).log();
return Void();
}

View File

@ -1667,7 +1667,7 @@ ACTOR Future<Void> cleanupStatus(Reference<ReadYourWritesTransaction> tr,
readMore = true;
} catch (Error& e) {
// If doc can't be parsed or isn't alive, delete it.
TraceEvent(SevWarn, "RemovedDeadBackupLayerStatus").detail("Key", docs[i].key);
TraceEvent(SevWarn, "RemovedDeadBackupLayerStatus").detail("Key", docs[i].key).error(e, true);
tr->clear(docs[i].key);
// If limit is 1 then read more.
if (limit == 1)

View File

@ -21,345 +21,16 @@
#include "fdbcli/fdbcli.actor.h"
#include "fdbclient/IClientApi.h"
#include "fdbclient/TagThrottle.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/CommitTransaction.h"
#include "flow/Arena.h"
#include "flow/FastRef.h"
#include "flow/ThreadHelper.actor.h"
#include "flow/genericactors.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
namespace {
// Helper functions copied from TagThrottle.actor.cpp
// The only difference is transactions are changed to go through MultiversionTransaction,
// instead of the native Transaction(i.e., RYWTransaction)
ACTOR Future<bool> getValidAutoEnabled(Reference<ITransaction> tr) {
state bool result;
loop {
Optional<Value> value = wait(safeThreadFutureToFuture(tr->get(tagThrottleAutoEnabledKey)));
if (!value.present()) {
tr->reset();
wait(delay(CLIENT_KNOBS->DEFAULT_BACKOFF));
continue;
} else if (value.get() == LiteralStringRef("1")) {
result = true;
} else if (value.get() == LiteralStringRef("0")) {
result = false;
} else {
TraceEvent(SevWarnAlways, "InvalidAutoTagThrottlingValue").detail("Value", value.get());
tr->reset();
wait(delay(CLIENT_KNOBS->DEFAULT_BACKOFF));
continue;
}
return result;
};
}
ACTOR Future<std::vector<TagThrottleInfo>> getThrottledTags(Reference<IDatabase> db,
int limit,
bool containsRecommend = false) {
state Reference<ITransaction> tr = db->createTransaction();
state bool reportAuto = containsRecommend;
loop {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
try {
if (!containsRecommend) {
wait(store(reportAuto, getValidAutoEnabled(tr)));
}
state ThreadFuture<RangeResult> f = tr->getRange(
reportAuto ? tagThrottleKeys : KeyRangeRef(tagThrottleKeysPrefix, tagThrottleAutoKeysPrefix), limit);
RangeResult throttles = wait(safeThreadFutureToFuture(f));
std::vector<TagThrottleInfo> results;
for (auto throttle : throttles) {
results.push_back(TagThrottleInfo(TagThrottleKey::fromKey(throttle.key),
TagThrottleValue::fromValue(throttle.value)));
}
return results;
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR Future<std::vector<TagThrottleInfo>> getRecommendedTags(Reference<IDatabase> db, int limit) {
state Reference<ITransaction> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
try {
bool enableAuto = wait(getValidAutoEnabled(tr));
if (enableAuto) {
return std::vector<TagThrottleInfo>();
}
state ThreadFuture<RangeResult> f =
tr->getRange(KeyRangeRef(tagThrottleAutoKeysPrefix, tagThrottleKeys.end), limit);
RangeResult throttles = wait(safeThreadFutureToFuture(f));
std::vector<TagThrottleInfo> results;
for (auto throttle : throttles) {
results.push_back(TagThrottleInfo(TagThrottleKey::fromKey(throttle.key),
TagThrottleValue::fromValue(throttle.value)));
}
return results;
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR Future<Void> updateThrottleCount(Reference<ITransaction> tr, int64_t delta) {
state ThreadFuture<Optional<Value>> countVal = tr->get(tagThrottleCountKey);
state ThreadFuture<Optional<Value>> limitVal = tr->get(tagThrottleLimitKey);
wait(success(safeThreadFutureToFuture(countVal)) && success(safeThreadFutureToFuture(limitVal)));
int64_t count = 0;
int64_t limit = 0;
if (countVal.get().present()) {
BinaryReader reader(countVal.get().get(), Unversioned());
reader >> count;
}
if (limitVal.get().present()) {
BinaryReader reader(limitVal.get().get(), Unversioned());
reader >> limit;
}
count += delta;
if (count > limit) {
throw too_many_tag_throttles();
}
BinaryWriter writer(Unversioned());
writer << count;
tr->set(tagThrottleCountKey, writer.toValue());
return Void();
}
void signalThrottleChange(Reference<ITransaction> tr) {
tr->atomicOp(
tagThrottleSignalKey, LiteralStringRef("XXXXXXXXXX\x00\x00\x00\x00"), MutationRef::SetVersionstampedValue);
}
ACTOR Future<Void> throttleTags(Reference<IDatabase> db,
TagSet tags,
double tpsRate,
double initialDuration,
TagThrottleType throttleType,
TransactionPriority priority,
Optional<double> expirationTime = Optional<double>(),
Optional<TagThrottledReason> reason = Optional<TagThrottledReason>()) {
state Reference<ITransaction> tr = db->createTransaction();
state Key key = TagThrottleKey(tags, throttleType, priority).toKey();
ASSERT(initialDuration > 0);
if (throttleType == TagThrottleType::MANUAL) {
reason = TagThrottledReason::MANUAL;
}
TagThrottleValue throttle(tpsRate,
expirationTime.present() ? expirationTime.get() : 0,
initialDuration,
reason.present() ? reason.get() : TagThrottledReason::UNSET);
BinaryWriter wr(IncludeVersion(ProtocolVersion::withTagThrottleValueReason()));
wr << throttle;
state Value value = wr.toValue();
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
if (throttleType == TagThrottleType::MANUAL) {
Optional<Value> oldThrottle = wait(safeThreadFutureToFuture(tr->get(key)));
if (!oldThrottle.present()) {
wait(updateThrottleCount(tr, 1));
}
}
tr->set(key, value);
if (throttleType == TagThrottleType::MANUAL) {
signalThrottleChange(tr);
}
wait(safeThreadFutureToFuture(tr->commit()));
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR Future<bool> unthrottleTags(Reference<IDatabase> db,
TagSet tags,
Optional<TagThrottleType> throttleType,
Optional<TransactionPriority> priority) {
state Reference<ITransaction> tr = db->createTransaction();
state std::vector<Key> keys;
for (auto p : allTransactionPriorities) {
if (!priority.present() || priority.get() == p) {
if (!throttleType.present() || throttleType.get() == TagThrottleType::AUTO) {
keys.push_back(TagThrottleKey(tags, TagThrottleType::AUTO, p).toKey());
}
if (!throttleType.present() || throttleType.get() == TagThrottleType::MANUAL) {
keys.push_back(TagThrottleKey(tags, TagThrottleType::MANUAL, p).toKey());
}
}
}
state bool removed = false;
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
state std::vector<Future<Optional<Value>>> values;
values.reserve(keys.size());
for (auto key : keys) {
values.push_back(safeThreadFutureToFuture(tr->get(key)));
}
wait(waitForAll(values));
int delta = 0;
for (int i = 0; i < values.size(); ++i) {
if (values[i].get().present()) {
if (TagThrottleKey::fromKey(keys[i]).throttleType == TagThrottleType::MANUAL) {
delta -= 1;
}
tr->clear(keys[i]);
// Report that we are removing this tag if we ever see it present.
// This protects us from getting confused if the transaction is maybe committed.
// It's ok if someone else actually ends up removing this tag at the same time
// and we aren't the ones to actually do it.
removed = true;
}
}
if (delta != 0) {
wait(updateThrottleCount(tr, delta));
}
if (removed) {
signalThrottleChange(tr);
wait(safeThreadFutureToFuture(tr->commit()));
}
return removed;
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR Future<Void> enableAuto(Reference<IDatabase> db, bool enabled) {
state Reference<ITransaction> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
Optional<Value> value = wait(safeThreadFutureToFuture(tr->get(tagThrottleAutoEnabledKey)));
if (!value.present() || (enabled && value.get() != LiteralStringRef("1")) ||
(!enabled && value.get() != LiteralStringRef("0"))) {
tr->set(tagThrottleAutoEnabledKey, LiteralStringRef(enabled ? "1" : "0"));
signalThrottleChange(tr);
wait(safeThreadFutureToFuture(tr->commit()));
}
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR Future<bool> unthrottleMatchingThrottles(Reference<IDatabase> db,
KeyRef beginKey,
KeyRef endKey,
Optional<TransactionPriority> priority,
bool onlyExpiredThrottles) {
state Reference<ITransaction> tr = db->createTransaction();
state KeySelector begin = firstGreaterOrEqual(beginKey);
state KeySelector end = firstGreaterOrEqual(endKey);
state bool removed = false;
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
// holds memory of the RangeResult
state ThreadFuture<RangeResult> f = tr->getRange(begin, end, 1000);
state RangeResult tags = wait(safeThreadFutureToFuture(f));
state uint64_t unthrottledTags = 0;
uint64_t manualUnthrottledTags = 0;
for (auto tag : tags) {
if (onlyExpiredThrottles) {
double expirationTime = TagThrottleValue::fromValue(tag.value).expirationTime;
if (expirationTime == 0 || expirationTime > now()) {
continue;
}
}
TagThrottleKey key = TagThrottleKey::fromKey(tag.key);
if (priority.present() && key.priority != priority.get()) {
continue;
}
if (key.throttleType == TagThrottleType::MANUAL) {
++manualUnthrottledTags;
}
removed = true;
tr->clear(tag.key);
unthrottledTags++;
}
if (manualUnthrottledTags > 0) {
wait(updateThrottleCount(tr, -manualUnthrottledTags));
}
if (unthrottledTags > 0) {
signalThrottleChange(tr);
}
wait(safeThreadFutureToFuture(tr->commit()));
if (!tags.more) {
return removed;
}
ASSERT(tags.size() > 0);
begin = KeySelector(firstGreaterThan(tags[tags.size() - 1].key), tags.arena());
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
Future<bool> unthrottleAll(Reference<IDatabase> db,
Optional<TagThrottleType> tagThrottleType,
Optional<TransactionPriority> priority) {
KeyRef begin = tagThrottleKeys.begin;
KeyRef end = tagThrottleKeys.end;
if (tagThrottleType.present() && tagThrottleType == TagThrottleType::AUTO) {
begin = tagThrottleAutoKeysPrefix;
} else if (tagThrottleType.present() && tagThrottleType == TagThrottleType::MANUAL) {
end = tagThrottleAutoKeysPrefix;
}
return unthrottleMatchingThrottles(db, begin, end, priority, false);
}
} // namespace
namespace fdb_cli {
ACTOR Future<bool> throttleCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
@ -403,11 +74,11 @@ ACTOR Future<bool> throttleCommandActor(Reference<IDatabase> db, std::vector<Str
state std::vector<TagThrottleInfo> tags;
if (reportThrottled && reportRecommended) {
wait(store(tags, getThrottledTags(db, throttleListLimit, true)));
wait(store(tags, ThrottleApi::getThrottledTags(db, throttleListLimit, true)));
} else if (reportThrottled) {
wait(store(tags, getThrottledTags(db, throttleListLimit)));
wait(store(tags, ThrottleApi::getThrottledTags(db, throttleListLimit)));
} else if (reportRecommended) {
wait(store(tags, getRecommendedTags(db, throttleListLimit)));
wait(store(tags, ThrottleApi::getRecommendedTags(db, throttleListLimit)));
}
bool anyLogged = false;
@ -509,7 +180,7 @@ ACTOR Future<bool> throttleCommandActor(Reference<IDatabase> db, std::vector<Str
TagSet tags;
tags.addTag(tokens[3]);
wait(throttleTags(db, tags, tpsRate, duration, TagThrottleType::MANUAL, priority));
wait(ThrottleApi::throttleTags(db, tags, tpsRate, duration, TagThrottleType::MANUAL, priority));
printf("Tag `%s' has been throttled\n", tokens[3].toString().c_str());
} else if (tokencmp(tokens[1], "off")) {
int nextIndex = 2;
@ -586,7 +257,7 @@ ACTOR Future<bool> throttleCommandActor(Reference<IDatabase> db, std::vector<Str
priority.present() ? format(" at %s priority", transactionPriorityToString(priority.get(), false)) : "";
if (tags.size() > 0) {
bool success = wait(unthrottleTags(db, tags, throttleType, priority));
bool success = wait(ThrottleApi::unthrottleTags(db, tags, throttleType, priority));
if (success) {
printf("Unthrottled tag `%s'%s\n", tokens[3].toString().c_str(), priorityString.c_str());
} else {
@ -596,7 +267,7 @@ ACTOR Future<bool> throttleCommandActor(Reference<IDatabase> db, std::vector<Str
priorityString.c_str());
}
} else {
bool unthrottled = wait(unthrottleAll(db, throttleType, priority));
bool unthrottled = wait(ThrottleApi::unthrottleAll(db, throttleType, priority));
if (unthrottled) {
printf("Unthrottled all %sthrottled tags%s\n", throttleTypeString, priorityString.c_str());
} else {
@ -626,7 +297,7 @@ ACTOR Future<bool> throttleCommandActor(Reference<IDatabase> db, std::vector<Str
return false;
}
state bool autoTagThrottlingEnabled = tokencmp(tokens[1], "enable");
wait(enableAuto(db, autoTagThrottlingEnabled));
wait(ThrottleApi::enableAuto(db, autoTagThrottlingEnabled));
printf("Automatic tag throttling has been %s\n", autoTagThrottlingEnabled ? "enabled" : "disabled");
} else {
printUsage(tokens[0]);

View File

@ -36,7 +36,7 @@
#include "fdbclient/Schemas.h"
#include "fdbclient/CoordinationInterface.h"
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/TagThrottle.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbclient/Tuple.h"
#include "fdbclient/ThreadSafeTransaction.h"
@ -3195,10 +3195,10 @@ ACTOR template <class T>
Future<T> stopNetworkAfter(Future<T> what) {
try {
T t = wait(what);
g_network->stop();
API->stopNetwork();
return t;
} catch (...) {
g_network->stop();
API->stopNetwork();
throw;
}
}
@ -4647,7 +4647,7 @@ int main(int argc, char** argv) {
Future<int> cliFuture = runCli(opt);
Future<Void> timeoutFuture = opt.exit_timeout ? timeExit(opt.exit_timeout) : Never();
auto f = stopNetworkAfter(success(cliFuture) || timeoutFuture);
runNetwork();
API->runNetwork();
if (cliFuture.isReady()) {
return cliFuture.get();

View File

@ -0,0 +1,394 @@
/*
* ActorLineageProfiler.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "flow/flow.h"
#include "flow/singleton.h"
#include "fdbrpc/IAsyncFile.h"
#include "fdbclient/ActorLineageProfiler.h"
#include "fdbclient/NameLineage.h"
#include <msgpack.hpp>
#include <memory>
#include <typeindex>
#include <boost/endian/conversion.hpp>
#include <boost/asio.hpp>
using namespace std::literals;
class Packer : public msgpack::packer<msgpack::sbuffer> {
struct visitor_t {
using VisitorMap = std::unordered_map<std::type_index, std::function<void(std::any const&, Packer& packer)>>;
VisitorMap visitorMap;
template <class T>
static void any_visitor(std::any const& val, Packer& packer) {
const T& v = std::any_cast<const T&>(val);
packer.pack(v);
}
template <class... Args>
struct populate_visitor_map;
template <class Head, class... Tail>
struct populate_visitor_map<Head, Tail...> {
static void populate(VisitorMap& map) {
map.emplace(std::type_index(typeid(Head)), any_visitor<Head>);
populate_visitor_map<Tail...>::populate(map);
}
};
template <class Head>
struct populate_visitor_map<Head> {
static void populate(VisitorMap&) {}
};
visitor_t() {
populate_visitor_map<int64_t,
uint64_t,
bool,
float,
double,
std::string,
std::string_view,
std::vector<std::any>,
std::vector<std::string>,
std::vector<std::string_view>,
std::map<std::string, std::any>,
std::map<std::string_view, std::any>,
std::vector<std::map<std::string_view, std::any>>>::populate(visitorMap);
}
void visit(const std::any& val, Packer& packer) {
auto iter = visitorMap.find(val.type());
if (iter == visitorMap.end()) {
TraceEvent(SevError, "PackerTypeNotFound").detail("Type", val.type().name());
} else {
iter->second(val, packer);
}
}
};
msgpack::sbuffer sbuffer;
// Initializing visitor_t involves building a type-map. As this is a relatively expensive operation, we don't want
// to do this each time we create a Packer object. So visitor_t is a stateless class and we only use it as a
// visitor.
crossbow::singleton<visitor_t> visitor;
public:
Packer() : msgpack::packer<msgpack::sbuffer>(sbuffer) {}
void pack(std::any const& val) { visitor->visit(val, *this); }
void pack(bool val) {
if (val) {
pack_true();
} else {
pack_false();
}
}
void pack(uint64_t val) {
if (val <= std::numeric_limits<uint8_t>::max()) {
pack_uint8(uint8_t(val));
} else if (val <= std::numeric_limits<uint16_t>::max()) {
pack_uint16(uint16_t(val));
} else if (val <= std::numeric_limits<uint32_t>::max()) {
pack_uint32(uint32_t(val));
} else {
pack_uint64(val);
}
}
void pack(int64_t val) {
if (val >= 0) {
this->pack(uint64_t(val));
} else if (val >= std::numeric_limits<uint8_t>::min()) {
pack_int8(int8_t(val));
} else if (val >= std::numeric_limits<int16_t>::min()) {
pack_int16(int16_t(val));
} else if (val >= std::numeric_limits<int32_t>::min()) {
pack_int32(int32_t(val));
} else if (val >= std::numeric_limits<int64_t>::min()) {
pack_int64(int64_t(val));
}
}
void pack(float val) { pack_float(val); }
void pack(double val) { pack_double(val); }
void pack(std::string const& str) {
pack_str(str.size());
pack_str_body(str.data(), str.size());
}
void pack(std::string_view val) {
pack_str(val.size());
pack_str_body(val.data(), val.size());
}
template <class K, class V>
void pack(std::map<K, V> const& map) {
pack_map(map.size());
for (const auto& p : map) {
pack(p.first);
pack(p.second);
}
}
template <class T>
void pack(std::vector<T> const& val) {
pack_array(val.size());
for (const auto& v : val) {
pack(v);
}
}
std::pair<char*, unsigned> getbuf() {
unsigned size = sbuffer.size();
return std::make_pair(sbuffer.release(), size);
}
};
IALPCollectorBase::IALPCollectorBase() {
SampleCollector::instance().addCollector(this);
}
std::map<std::string_view, std::any> SampleCollectorT::collect(ActorLineage* lineage) {
ASSERT(lineage != nullptr);
std::map<std::string_view, std::any> out;
for (auto& collector : collectors) {
auto val = collector->collect(lineage);
if (val.has_value()) {
out[collector->name()] = val.value();
}
}
return out;
}
std::shared_ptr<Sample> SampleCollectorT::collect() {
auto sample = std::make_shared<Sample>();
double time = g_network->now();
sample->time = time;
for (auto& p : getSamples) {
Packer packer;
std::vector<std::map<std::string_view, std::any>> samples;
auto sampleVec = p.second();
for (auto& val : sampleVec) {
auto m = collect(val.getPtr());
if (!m.empty()) {
samples.emplace_back(std::move(m));
}
}
if (!samples.empty()) {
packer.pack(samples);
sample->data[p.first] = packer.getbuf();
}
}
return sample;
}
void SampleCollection_t::collect(const Reference<ActorLineage>& lineage) {
ASSERT(lineage.isValid());
_currentLineage = lineage;
auto sample = _collector->collect();
ASSERT(sample);
{
Lock _{ mutex };
data.emplace_back(sample);
}
auto min = std::min(data.back()->time - windowSize, data.back()->time);
double oldest = data.front()->time;
// we don't need to check for data.empty() in this loop (or the inner loop) as we know that we will end
// up with at least one entry which is the most recent sample
while (oldest < min) {
Lock _{ mutex };
// we remove at most 10 elements at a time. This is so we don't block the main thread for too long.
for (int i = 0; i < 10 && oldest < min; ++i) {
data.pop_front();
oldest = data.front()->time;
}
}
// TODO: Should only call ingest when deleting from memory
config->ingest(sample);
}
std::vector<std::shared_ptr<Sample>> SampleCollection_t::get(double from /*= 0.0*/,
double to /*= std::numeric_limits<double>::max()*/) const {
Lock _{ mutex };
std::vector<std::shared_ptr<Sample>> res;
for (const auto& sample : data) {
if (sample->time > to) {
break;
} else if (sample->time >= from) {
res.push_back(sample);
}
}
return res;
}
void sample(LineageReference* lineagePtr) {
if (!lineagePtr->isValid()) { return; }
(*lineagePtr)->modify(&NameLineage::actorName) = lineagePtr->actorName();
boost::asio::post(ActorLineageProfiler::instance().context(), [lineage = LineageReference::addRef(lineagePtr->getPtr())]() {
SampleCollection::instance().collect(lineage);
});
}
struct ProfilerImpl {
boost::asio::io_context context;
boost::asio::executor_work_guard<decltype(context.get_executor())> workGuard;
boost::asio::steady_timer timer;
std::thread mainThread;
unsigned frequency;
SampleCollection collection;
ProfilerImpl() : workGuard(context.get_executor()), timer(context) {
mainThread = std::thread([this]() { context.run(); });
}
~ProfilerImpl() {
setFrequency(0);
workGuard.reset();
mainThread.join();
}
void profileHandler(boost::system::error_code const& ec) {
if (ec) {
return;
}
startSampling = true;
timer = boost::asio::steady_timer(context, std::chrono::microseconds(1000000 / frequency));
timer.async_wait([this](auto const& ec) { profileHandler(ec); });
}
void setFrequency(unsigned frequency) {
boost::asio::post(context, [this, frequency]() {
this->frequency = frequency;
timer.cancel();
if (frequency > 0) {
profileHandler(boost::system::error_code{});
}
});
}
};
ActorLineageProfilerT::ActorLineageProfilerT() : impl(new ProfilerImpl()) {
// collection->collector()->addGetter(WaitState::Network,
// std::bind(&ActorLineageSet::copy, std::ref(g_network->getActorLineageSet())));
// collection->collector()->addGetter(
// WaitState::Disk,
// std::bind(&ActorLineageSet::copy, std::ref(IAsyncFileSystem::filesystem()->getActorLineageSet())));
collection->collector()->addGetter(WaitState::Running, []() {
return std::vector<Reference<ActorLineage>>({ SampleCollection::instance().getLineage() });
});
}
ActorLineageProfilerT::~ActorLineageProfilerT() {
delete impl;
}
void ActorLineageProfilerT::setFrequency(unsigned frequency) {
impl->setFrequency(frequency);
}
boost::asio::io_context& ActorLineageProfilerT::context() {
return impl->context;
}
SampleIngestor::~SampleIngestor() {}
void ProfilerConfigT::reset(std::map<std::string, std::string> const& config) {
bool expectNoMore = false, useFluentD = false, useTCP = false;
std::string endpoint;
ConfigError err;
for (auto& kv : config) {
if (expectNoMore) {
err.description = format("Unexpected option %s", kv.first.c_str());
throw err;
}
if (kv.first == "ingestor") {
std::string val = kv.second;
std::for_each(val.begin(), val.end(), [](auto c) { return std::tolower(c); });
if (val == "none") {
setBackend(std::make_shared<NoneIngestor>());
} else if (val == "fluentd") {
useFluentD = true;
} else {
err.description = format("Unsupported ingestor: %s", val.c_str());
throw err;
}
} else if (kv.first == "ingestor_endpoint") {
endpoint = kv.second;
} else if (kv.first == "ingestor_protocol") {
auto val = kv.second;
std::for_each(val.begin(), val.end(), [](auto c) { return std::tolower(c); });
if (val == "tcp") {
useTCP = true;
} else if (val == "udp") {
useTCP = false;
} else {
err.description = format("Unsupported protocol for fluentd: %s", kv.second.c_str());
throw err;
}
} else {
err.description = format("Unknown option %s", kv.first.c_str());
throw err;
}
}
if (useFluentD) {
if (endpoint.empty()) {
err.description = "Endpoint is required for fluentd ingestor";
throw err;
}
NetworkAddress address;
try {
address = NetworkAddress::parse(endpoint);
} catch (Error& e) {
err.description = format("Can't parse address %s", endpoint.c_str());
throw err;
}
setBackend(std::make_shared<FluentDIngestor>(
useTCP ? FluentDIngestor::Protocol::TCP : FluentDIngestor::Protocol::UDP, address));
}
}
std::map<std::string, std::string> ProfilerConfigT::getConfig() const {
std::map<std::string, std::string> res;
if (ingestor) {
ingestor->getConfig(res);
}
return res;
}
// Callback used to update the sampling profilers run frequency whenever the
// frequency changes.
void samplingProfilerUpdateFrequency(std::optional<std::any> freq) {
double frequency = 0;
if (freq.has_value()) {
frequency = std::any_cast<double>(freq.value());
}
TraceEvent(SevInfo, "SamplingProfilerUpdateFrequency").detail("Frequency", frequency);
ActorLineageProfiler::instance().setFrequency(frequency);
}
// Callback used to update the sample collector window size.
void samplingProfilerUpdateWindow(std::optional<std::any> window) {
double duration = 0;
if (window.has_value()) {
duration = std::any_cast<double>(window.value());
}
TraceEvent(SevInfo, "SamplingProfilerUpdateWindow").detail("Duration", duration);
SampleCollection::instance().setWindowSize(duration);
}

View File

@ -0,0 +1,192 @@
/*
* ActorLineageProfiler.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbclient/AnnotateActor.h"
#include <optional>
#include <string>
#include <any>
#include <vector>
#include <mutex>
#include <condition_variable>
#include "flow/singleton.h"
#include "flow/flow.h"
void samplingProfilerUpdateFrequency(std::optional<std::any> freq);
void samplingProfilerUpdateWindow(std::optional<std::any> window);
struct IALPCollectorBase {
virtual std::optional<std::any> collect(ActorLineage*) = 0;
virtual const std::string_view& name() = 0;
IALPCollectorBase();
};
template <class T>
struct IALPCollector : IALPCollectorBase {
const std::string_view& name() override { return T::name; }
};
struct Sample : std::enable_shared_from_this<Sample> {
double time = 0.0;
Sample() {}
Sample(Sample const&) = delete;
Sample& operator=(Sample const&) = delete;
std::unordered_map<WaitState, std::pair<char*, unsigned>> data;
~Sample() {
std::for_each(data.begin(), data.end(), [](std::pair<WaitState, std::pair<char*, unsigned>> entry) {
::free(entry.second.first);
});
}
};
class SampleIngestor : std::enable_shared_from_this<SampleIngestor> {
public:
virtual ~SampleIngestor();
virtual void ingest(std::shared_ptr<Sample> const& sample) = 0;
virtual void getConfig(std::map<std::string, std::string>&) const = 0;
};
class NoneIngestor : public SampleIngestor {
public:
void ingest(std::shared_ptr<Sample> const& sample) override {}
void getConfig(std::map<std::string, std::string>& res) const override { res["ingestor"] = "none"; }
};
// The FluentD ingestor uses the pimpl idiom. This is to make compilation less heavy weight as this implementation has
// dependencies to boost::asio
struct FluentDIngestorImpl;
class FluentDIngestor : public SampleIngestor {
public: // Public Types
enum class Protocol { TCP, UDP };
private: // members
FluentDIngestorImpl* impl;
public: // interface
void ingest(std::shared_ptr<Sample> const& sample) override;
FluentDIngestor(Protocol protocol, NetworkAddress& endpoint);
void getConfig(std::map<std::string, std::string>& res) const override;
~FluentDIngestor();
};
struct ConfigError {
std::string description;
};
class ProfilerConfigT {
private: // private types
using Lock = std::unique_lock<std::mutex>;
friend class crossbow::create_static<ProfilerConfigT>;
private: // members
std::shared_ptr<SampleIngestor> ingestor = std::make_shared<NoneIngestor>();
private: // construction
ProfilerConfigT() {}
ProfilerConfigT(ProfilerConfigT const&) = delete;
ProfilerConfigT& operator=(ProfilerConfigT const&) = delete;
void setBackend(std::shared_ptr<SampleIngestor> ingestor) { this->ingestor = ingestor; }
public:
void ingest(std::shared_ptr<Sample> sample) { ingestor->ingest(sample); }
void reset(std::map<std::string, std::string> const& config);
std::map<std::string, std::string> getConfig() const;
};
using ProfilerConfig = crossbow::singleton<ProfilerConfigT>;
class SampleCollectorT {
public: // Types
friend struct crossbow::create_static<SampleCollectorT>;
using Getter = std::function<std::vector<Reference<ActorLineage>>()>;
private:
std::vector<IALPCollectorBase*> collectors;
std::map<WaitState, Getter> getSamples;
SampleCollectorT() {}
std::map<std::string_view, std::any> collect(ActorLineage* lineage);
public:
void addCollector(IALPCollectorBase* collector) { collectors.push_back(collector); }
std::shared_ptr<Sample> collect();
void addGetter(WaitState waitState, Getter const& getter) { getSamples[waitState] = getter; };
};
using SampleCollector = crossbow::singleton<SampleCollectorT>;
class SampleCollection_t {
friend struct crossbow::create_static<SampleCollection_t>;
using Lock = std::unique_lock<std::mutex>;
SampleCollection_t() {}
SampleCollector _collector;
mutable std::mutex mutex;
std::atomic<double> windowSize = 0.0;
std::deque<std::shared_ptr<Sample>> data;
ProfilerConfig config;
Reference<ActorLineage> _currentLineage;
public:
/**
* Define how many samples the collection shoul keep. The window size is defined by time dimension.
*
* \param duration How long a sample should be kept in the collection.
*/
void setWindowSize(double duration) { windowSize.store(duration); }
/**
* By default returns reference counted pointers of all samples. A window can be defined in terms of absolute time.
*
* \param from The minimal age of all returned samples.
* \param to The max age of all returned samples.
*/
std::vector<std::shared_ptr<Sample>> get(double from = 0.0, double to = std::numeric_limits<double>::max()) const;
void collect(const Reference<ActorLineage>& lineage);
const SampleCollector& collector() const { return _collector; }
SampleCollector& collector() { return _collector; }
Reference<ActorLineage> getLineage() { return _currentLineage; }
};
using SampleCollection = crossbow::singleton<SampleCollection_t>;
struct ProfilerImpl;
namespace boost {
namespace asio {
// forward declare io_context because including boost asio is super expensive
class io_context;
} // namespace asio
} // namespace boost
class ActorLineageProfilerT {
friend struct crossbow::create_static<ActorLineageProfilerT>;
ProfilerImpl* impl;
SampleCollection collection;
ActorLineageProfilerT();
public:
~ActorLineageProfilerT();
void setFrequency(unsigned frequency);
boost::asio::io_context& context();
};
using ActorLineageProfiler = crossbow::singleton<ActorLineageProfilerT>;

View File

@ -0,0 +1,23 @@
/*
* AnnotateActor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/AnnotateActor.h"
std::map<WaitState, std::function<std::vector<Reference<ActorLineage>>()>> samples;

91
fdbclient/AnnotateActor.h Normal file
View File

@ -0,0 +1,91 @@
/*
* AnnotateActor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "flow/flow.h"
#include "flow/network.h"
#include <string_view>
// Used to manually instrument waiting actors to collect samples for the
// sampling profiler.
struct AnnotateActor {
unsigned index;
bool set;
AnnotateActor() : set(false) {}
AnnotateActor(LineageReference* lineage) : set(false) {
#ifdef ENABLE_SAMPLING
if (lineage->getPtr() != 0) {
index = g_network->getActorLineageSet().insert(*lineage);
set = (index != ActorLineageSet::npos);
}
#endif
}
AnnotateActor(const AnnotateActor& other) = delete;
AnnotateActor(AnnotateActor&& other) = delete;
AnnotateActor& operator=(const AnnotateActor& other) = delete;
AnnotateActor& operator=(AnnotateActor&& other) {
if (this == &other) {
return *this;
}
this->index = other.index;
this->set = other.set;
other.set = false;
return *this;
}
~AnnotateActor() {
#ifdef ENABLE_SAMPLING
if (set) {
g_network->getActorLineageSet().erase(index);
}
#endif
}
};
enum class WaitState { Disk, Network, Running };
// usually we shouldn't use `using namespace` in a header file, but literals should be safe as user defined literals
// need to be prefixed with `_`
using namespace std::literals;
constexpr std::string_view to_string(WaitState st) {
switch (st) {
case WaitState::Disk:
return "Disk"sv;
case WaitState::Network:
return "Network"sv;
case WaitState::Running:
return "Running"sv;
default:
return ""sv;
}
}
#ifdef ENABLE_SAMPLING
extern std::map<WaitState, std::function<std::vector<Reference<ActorLineage>>()>> samples;
#endif

View File

@ -330,7 +330,7 @@ void decodeBackupLogValue(Arena& arena,
}
} else {
Version ver = key_version->rangeContaining(logValue.param1).value();
//TraceEvent("ApplyMutation").detail("LogValue", logValue.toString()).detail("Version", version).detail("Ver", ver).detail("Apply", version > ver && ver != invalidVersion);
//TraceEvent("ApplyMutation").detail("LogValue", logValue).detail("Version", version).detail("Ver", ver).detail("Apply", version > ver && ver != invalidVersion);
if (version > ver && ver != invalidVersion) {
if (removePrefix.size()) {
logValue.param1 = logValue.param1.removePrefix(removePrefix);

View File

@ -1,4 +1,7 @@
set(FDBCLIENT_SRCS
ActorLineageProfiler.h
ActorLineageProfiler.cpp
AnnotateActor.cpp
AsyncFileS3BlobStore.actor.cpp
AsyncFileS3BlobStore.actor.h
AsyncTaskThread.actor.cpp
@ -39,6 +42,7 @@ set(FDBCLIENT_SRCS
FDBOptions.h
FDBTypes.cpp
FDBTypes.h
FluentDSampleIngestor.cpp
FileBackupAgent.actor.cpp
GlobalConfig.h
GlobalConfig.actor.h
@ -66,6 +70,8 @@ set(FDBCLIENT_SRCS
MultiVersionTransaction.actor.cpp
MultiVersionTransaction.h
MutationList.h
NameLineage.h
NameLineage.cpp
NativeAPI.actor.cpp
NativeAPI.actor.h
Notified.h
@ -73,6 +79,7 @@ set(FDBCLIENT_SRCS
ParallelStream.actor.h
PaxosConfigTransaction.actor.cpp
PaxosConfigTransaction.h
PImpl.h
SimpleConfigTransaction.actor.cpp
SpecialKeySpace.actor.cpp
SpecialKeySpace.actor.h
@ -101,10 +108,12 @@ set(FDBCLIENT_SRCS
StorageServerInterface.h
Subspace.cpp
Subspace.h
StackLineage.h
StackLineage.cpp
SystemData.cpp
SystemData.h
TagThrottle.actor.cpp
TagThrottle.h
TagThrottle.actor.h
TaskBucket.actor.cpp
TaskBucket.h
TestKnobCollection.cpp
@ -172,8 +181,17 @@ endif()
add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs})
add_dependencies(fdbclient fdboptions fdb_c_options)
target_link_libraries(fdbclient PUBLIC fdbrpc msgpack)
# Create a separate fdbclient library with sampling enabled. This lets
# fdbserver retain sampling functionality in client code while disabling
# sampling for pure clients.
add_flow_target(STATIC_LIBRARY NAME fdbclient_sampling SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs})
add_dependencies(fdbclient_sampling fdboptions fdb_c_options)
target_link_libraries(fdbclient_sampling PUBLIC fdbrpc_sampling msgpack)
target_compile_definitions(fdbclient_sampling PRIVATE -DENABLE_SAMPLING)
if(BUILD_AZURE_BACKUP)
target_link_libraries(fdbclient PUBLIC fdbrpc PRIVATE curl uuid azure-storage-lite)
else()
target_link_libraries(fdbclient PUBLIC fdbrpc)
target_link_libraries(fdbclient PRIVATE curl uuid azure-storage-lite)
target_link_libraries(fdbclient_sampling PRIVATE curl uuid azure-storage-lite)
endif()

View File

@ -22,6 +22,9 @@
#ifndef FDBCLIENT_CLIENTLOGEVENTS_H
#define FDBCLIENT_CLIENTLOGEVENTS_H
#include "fdbclient/FDBTypes.h"
#include "fdbclient/CommitProxyInterface.h"
namespace FdbClientLogEvents {
enum class EventType {
GET_VERSION_LATENCY = 0,
@ -252,7 +255,7 @@ struct EventCommit : public Event {
.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Mutation", mutation.toString());
.detail("Mutation", mutation);
}
TraceEvent("TransactionTrace_Commit")
@ -316,7 +319,7 @@ struct EventCommit_V2 : public Event {
.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Mutation", mutation.toString());
.detail("Mutation", mutation);
}
TraceEvent("TransactionTrace_Commit")
@ -430,7 +433,7 @@ struct EventCommitError : public Event {
.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Mutation", mutation.toString());
.detail("Mutation", mutation);
}
TraceEvent("TransactionTrace_CommitError").detail("TransactionID", id).detail("ErrCode", errCode);

View File

@ -28,7 +28,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/TagThrottle.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbclient/GlobalConfig.h"
#include "fdbrpc/Stats.h"

View File

@ -116,6 +116,33 @@ std::string KnobValueRef::toString() const {
return std::visit(ToStringFunc{}, value);
}
ConfigDBType configDBTypeFromString(std::string const& str) {
if (str == "disabled") {
return ConfigDBType::DISABLED;
} else if (str == "simple") {
return ConfigDBType::SIMPLE;
} else if (str == "paxos") {
return ConfigDBType::PAXOS;
} else {
TraceEvent(SevWarnAlways, "InvalidConfigDBString");
return ConfigDBType::DISABLED;
}
}
std::string configDBTypeToString(ConfigDBType configDBType) {
switch (configDBType) {
case ConfigDBType::DISABLED:
return "disabled";
case ConfigDBType::SIMPLE:
return "simple";
case ConfigDBType::PAXOS:
return "paxos";
default:
ASSERT(false);
return "";
}
}
TEST_CASE("/fdbclient/ConfigDB/ConfigKey/EncodeDecode") {
Tuple tuple;
tuple << "class-A"_sr

View File

@ -199,8 +199,5 @@ struct ConfigCommitAnnotationRef {
};
using ConfigCommitAnnotation = Standalone<ConfigCommitAnnotationRef>;
enum class UseConfigDB {
DISABLED,
SIMPLE,
PAXOS,
};
ConfigDBType configDBTypeFromString(std::string const&);
std::string configDBTypeToString(ConfigDBType);

View File

@ -55,6 +55,22 @@ bool ConfigGeneration::operator!=(ConfigGeneration const& rhs) const {
return !(*this == rhs);
}
bool ConfigGeneration::operator<(ConfigGeneration const& rhs) const {
if (committedVersion != rhs.committedVersion) {
return committedVersion < rhs.committedVersion;
} else {
return liveVersion < rhs.liveVersion;
}
}
bool ConfigGeneration::operator>(ConfigGeneration const& rhs) const {
if (committedVersion != rhs.committedVersion) {
return committedVersion > rhs.committedVersion;
} else {
return liveVersion > rhs.liveVersion;
}
}
void ConfigTransactionCommitRequest::set(KeyRef key, ValueRef value) {
if (key == configTransactionDescriptionKey) {
annotation.description = KeyRef(arena, value);

View File

@ -28,18 +28,20 @@
#include "flow/flow.h"
struct ConfigGeneration {
// The live version of each node is monotonically increasing
Version liveVersion{ 0 };
// The committedVersion of each node is the version of the last commit made durable.
// Each committedVersion was previously given to clients as a liveVersion, prior to commit.
Version committedVersion{ 0 };
// The live version of each node is monotonically increasing
Version liveVersion{ 0 };
bool operator==(ConfigGeneration const&) const;
bool operator!=(ConfigGeneration const&) const;
bool operator<(ConfigGeneration const&) const;
bool operator>(ConfigGeneration const&) const;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, liveVersion, committedVersion);
serializer(ar, committedVersion, liveVersion);
}
};
@ -57,12 +59,16 @@ struct ConfigTransactionGetGenerationReply {
struct ConfigTransactionGetGenerationRequest {
static constexpr FileIdentifier file_identifier = 138941;
// A hint to catch up lagging nodes:
Optional<Version> lastSeenLiveVersion;
ReplyPromise<ConfigTransactionGetGenerationReply> reply;
ConfigTransactionGetGenerationRequest() = default;
explicit ConfigTransactionGetGenerationRequest(Optional<Version> const& lastSeenLiveVersion)
: lastSeenLiveVersion(lastSeenLiveVersion) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, reply);
serializer(ar, lastSeenLiveVersion, reply);
}
};

View File

@ -2355,7 +2355,7 @@ std::string getDRMutationStreamId(StatusObjectReader statusObj, const char* cont
}
}
}
TraceEvent(SevWarn, "DBA_TagNotPresentInStatus").detail("Tag", tagName.toString()).detail("Context", context);
TraceEvent(SevWarn, "DBA_TagNotPresentInStatus").detail("Tag", tagName).detail("Context", context);
throw backup_error();
} catch (std::runtime_error& e) {
TraceEvent(SevWarn, "DBA_GetDRMutationStreamIdFail").detail("Error", e.what());

View File

@ -437,6 +437,10 @@ public:
// Requests to the storage server will no longer be duplicated to its pair TSS.
void removeTssMapping(StorageServerInterface const& ssi);
// used in template functions to create a transaction
using TransactionT = ReadYourWritesTransaction;
Reference<TransactionT> createTransaction();
private:
std::unordered_map<KeyRef, Reference<WatchMetadata>> watchMap;
};

View File

@ -41,8 +41,8 @@ typedef UID SpanID;
enum {
tagLocalitySpecial = -1, // tag with this locality means it is invalidTag (id=0), txsTag (id=1), or cacheTag (id=2)
tagLocalityLogRouter = -2,
tagLocalityRemoteLog = -3, // tag created by log router for remote tLogs
tagLocalityUpgraded = -4,
tagLocalityRemoteLog = -3, // tag created by log router for remote (aka. not in Primary DC) tLogs
tagLocalityUpgraded = -4, // tlogs with old log format
tagLocalitySatellite = -5,
tagLocalityLogRouterMapped = -6, // The pseudo tag used by log routers to pop the real LogRouter tag (i.e., -2)
tagLocalityTxs = -7,

View File

@ -0,0 +1,265 @@
/*
* FluentDSampleIngestor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/ActorLineageProfiler.h"
#include <boost/asio.hpp>
#include <boost/asio/co_spawn.hpp>
#include <msgpack.hpp>
namespace {
boost::asio::ip::address ipAddress(IPAddress const& n) {
if (n.isV6()) {
return boost::asio::ip::address_v6(n.toV6());
} else {
return boost::asio::ip::address_v4(n.toV4());
}
}
template <class Protocol>
boost::asio::ip::basic_endpoint<Protocol> toEndpoint(NetworkAddress const n) {
return boost::asio::ip::basic_endpoint<Protocol>(ipAddress(n.ip), n.port);
}
struct FluentDSocket {
virtual ~FluentDSocket() {}
virtual void connect(NetworkAddress const& endpoint) = 0;
virtual void send(std::shared_ptr<Sample> const& sample) = 0;
virtual const boost::system::error_code& failed() const = 0;
};
template <class Protocol, class Callback>
class SampleSender : public std::enable_shared_from_this<SampleSender<Protocol, Callback>> {
using Socket = typename Protocol::socket;
using Iter = typename decltype(Sample::data)::iterator;
Socket& socket;
Callback callback;
Iter iter, end;
std::shared_ptr<Sample> sample_; // to keep from being deallocated
struct Buf {
const char* data;
const unsigned size;
Buf(const char* data, unsigned size) : data(data), size(size) {}
Buf(Buf const&) = delete;
Buf& operator=(Buf const&) = delete;
~Buf() { delete[] data; }
};
void sendCompletionHandler(boost::system::error_code const& ec) {
if (ec) {
callback(ec);
} else {
++iter;
sendNext();
}
}
void send(boost::asio::ip::tcp::socket& socket, std::shared_ptr<Buf> const& buf) {
boost::asio::async_write(
socket,
boost::asio::const_buffer(buf->data, buf->size),
[buf, this](auto const& ec, size_t) {
this->sendCompletionHandler(ec);
});
}
void send(boost::asio::ip::udp::socket& socket, std::shared_ptr<Buf> const& buf) {
socket.async_send(
boost::asio::const_buffer(buf->data, buf->size),
[buf, this](auto const& ec, size_t) { this->sendCompletionHandler(ec); });
}
void sendNext() {
if (iter == end) {
callback(boost::system::error_code());
return;
}
// 1. calculate size of buffer
unsigned size = 1; // 1 for fixmap identifier byte
auto waitState = to_string(iter->first);
if (waitState.size() < 32) {
size += waitState.size() + 1;
} else {
size += waitState.size() + 2;
}
size += iter->second.second;
// 2. allocate the buffer
std::unique_ptr<char[]> buf(new char[size]);
unsigned off = 0;
// 3. serialize fixmap
buf[off++] = 0x81; // map of size 1
// 3.1 serialize key
if (waitState.size() < 32) {
buf[off++] = 0xa0 + waitState.size(); // fixstr
} else {
buf[off++] = 0xd9;
buf[off++] = char(waitState.size());
}
memcpy(buf.get() + off, waitState.data(), waitState.size());
off += waitState.size();
// 3.2 append serialized value
memcpy(buf.get() + off, iter->second.first, iter->second.second);
// 4. send the result to fluentd
send(socket, std::make_shared<Buf>(buf.release(), size));
}
public:
SampleSender(Socket& socket, Callback const& callback, std::shared_ptr<Sample> const& sample)
: socket(socket),
callback(callback),
iter(sample->data.begin()),
end(sample->data.end()),
sample_(sample) {
sendNext();
}
};
// Sample function to make instanciation of SampleSender easier
template <class Protocol, class Callback>
std::shared_ptr<SampleSender<Protocol, Callback>> makeSampleSender(typename Protocol::socket& socket, Callback const& callback, std::shared_ptr<Sample> const& sample) {
return std::make_shared<SampleSender<Protocol, Callback>>(socket, callback, sample);
}
template <class Protocol>
struct FluentDSocketImpl : FluentDSocket, std::enable_shared_from_this<FluentDSocketImpl<Protocol>> {
static constexpr unsigned MAX_QUEUE_SIZE = 100;
boost::asio::io_context& context;
typename Protocol::socket socket;
FluentDSocketImpl(boost::asio::io_context& context) : context(context), socket(context) {}
bool ready = false;
std::deque<std::shared_ptr<Sample>> queue;
boost::system::error_code _failed;
const boost::system::error_code& failed() const override { return _failed; }
void sendCompletionHandler(boost::system::error_code const& ec) {
if (ec) {
// TODO: trace error
_failed = ec;
return;
}
if (queue.empty()) {
ready = true;
} else {
auto sample = queue.front();
queue.pop_front();
sendImpl(sample);
}
}
void sendImpl(std::shared_ptr<Sample> const& sample) {
makeSampleSender<Protocol>(socket, [self = this->shared_from_this()](boost::system::error_code const& ec){
self->sendCompletionHandler(ec);
}, sample);
}
void send(std::shared_ptr<Sample> const& sample) override {
if (_failed) {
return;
}
if (ready) {
ready = false;
sendImpl(sample);
} else {
if (queue.size() < MAX_QUEUE_SIZE) {
queue.push_back(sample);
} // TODO: else trace a warning
}
}
void connect(NetworkAddress const& endpoint) override {
auto to = toEndpoint<Protocol>(endpoint);
socket.async_connect(to, [self = this->shared_from_this()](boost::system::error_code const& ec) {
if (ec) {
// TODO: error handling
self->_failed = ec;
return;
}
self->ready = true;
});
}
};
} // namespace
struct FluentDIngestorImpl {
using Protocol = FluentDIngestor::Protocol;
Protocol protocol;
NetworkAddress endpoint;
boost::asio::io_context& io_context;
std::shared_ptr<FluentDSocket> socket;
boost::asio::steady_timer retryTimer;
FluentDIngestorImpl(Protocol protocol, NetworkAddress const& endpoint)
: protocol(protocol), endpoint(endpoint), io_context(ActorLineageProfiler::instance().context()),
retryTimer(io_context) {
connect();
}
~FluentDIngestorImpl() { retryTimer.cancel(); }
void connect() {
switch (protocol) {
case Protocol::TCP:
socket.reset(new FluentDSocketImpl<boost::asio::ip::tcp>(io_context));
break;
case Protocol::UDP:
socket.reset(new FluentDSocketImpl<boost::asio::ip::udp>(io_context));
break;
}
socket->connect(endpoint);
}
void retry() {
retryTimer = boost::asio::steady_timer(io_context, std::chrono::seconds(1));
retryTimer.async_wait([this](auto const& ec) {
if (ec) {
return;
}
connect();
});
socket.reset();
}
};
FluentDIngestor::~FluentDIngestor() {
delete impl;
}
FluentDIngestor::FluentDIngestor(Protocol protocol, NetworkAddress& endpoint)
: impl(new FluentDIngestorImpl(protocol, endpoint)) {}
void FluentDIngestor::ingest(const std::shared_ptr<Sample>& sample) {
if (!impl->socket) {
// the connection failed in the past and we wait for a timeout before we retry
return;
} else if (impl->socket->failed()) {
impl->retry();
return;
} else {
impl->socket->send(sample);
}
}
void FluentDIngestor::getConfig(std::map<std::string, std::string>& res) const {
res["ingestor"] = "fluentd";
res["collector_endpoint"] = impl->endpoint.toString();
res["collector_protocol"] = impl->protocol == Protocol::TCP ? "tcp" : "udp";
}

View File

@ -34,6 +34,9 @@ const KeyRef fdbClientInfoTxnSizeLimit = LiteralStringRef("config/fdb_client_inf
const KeyRef transactionTagSampleRate = LiteralStringRef("config/transaction_tag_sample_rate");
const KeyRef transactionTagSampleCost = LiteralStringRef("config/transaction_tag_sample_cost");
const KeyRef samplingFrequency = LiteralStringRef("visibility/sampling/frequency");
const KeyRef samplingWindow = LiteralStringRef("visibility/sampling/window");
GlobalConfig::GlobalConfig(Database& cx) : cx(cx), lastUpdate(0) {}
GlobalConfig& GlobalConfig::globalConfig() {

View File

@ -28,6 +28,7 @@
#include <any>
#include <map>
#include <optional>
#include <type_traits>
#include <unordered_map>
@ -49,6 +50,9 @@ extern const KeyRef fdbClientInfoTxnSizeLimit;
extern const KeyRef transactionTagSampleRate;
extern const KeyRef transactionTagSampleCost;
extern const KeyRef samplingFrequency;
extern const KeyRef samplingWindow;
// Structure used to hold the values stored by global configuration. The arena
// is used as memory to store both the key and the value (the value is only
// stored in the arena if it is an object; primitives are just copied).
@ -78,6 +82,7 @@ public:
g_network->setGlobal(INetwork::enGlobalConfig, config);
config->_updater = updater(config, dbInfo);
// Bind changes in `db` to the `dbInfoChanged` AsyncTrigger.
// TODO: Change AsyncTrigger to a Reference
forward(db, std::addressof(config->dbInfoChanged));
} else {
GlobalConfig* config = reinterpret_cast<GlobalConfig*>(g_network->global(INetwork::enGlobalConfig));
@ -137,9 +142,11 @@ public:
Future<Void> onChange();
// Calls \ref fn when the value associated with \ref key is changed. \ref
// fn is passed the updated value for the key, or an empty optional if the
// key has been cleared. If the value is an allocated object, its memory
// remains in the control of the global configuration.
// key should be one of the string literals defined at the top of
// GlobalConfig.actor.cpp, to ensure memory validity. \ref fn is passed the
// updated value for the key, or an empty optional if the key has been
// cleared. If the value is an allocated object, its memory remains in the
// control of global configuration.
void trigger(KeyRef key, std::function<void(std::optional<std::any>)> fn);
private:
@ -171,6 +178,7 @@ private:
AsyncTrigger configChanged;
std::unordered_map<StringRef, Reference<ConfigValue>> data;
Version lastUpdate;
// The key should be a global config string literal key (see the top of this class).
std::unordered_map<KeyRef, std::function<void(std::optional<std::any>)>> callbacks;
};

View File

@ -88,6 +88,9 @@ public:
virtual void addref() = 0;
virtual void delref() = 0;
// used in template functions as returned Future type
template<class Type> using FutureT = ThreadFuture<Type>;
};
// An interface that represents a connection to a cluster made by a client
@ -115,6 +118,9 @@ public:
virtual ThreadFuture<Void> forceRecoveryWithDataLoss(const StringRef& dcid) = 0;
// Management API, create snapshot
virtual ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) = 0;
// used in template functions as the Transaction type that can be created through createTransaction()
using TransactionT = ITransaction;
};
// An interface that presents the top-level FDB client API as exposed through the C bindings

View File

@ -38,3 +38,16 @@ ISingleThreadTransaction* ISingleThreadTransaction::allocateOnForeignThread(Type
ASSERT(false);
return nullptr;
}
Reference<ISingleThreadTransaction> ISingleThreadTransaction::create(Type type, Database const& cx) {
Reference<ISingleThreadTransaction> result;
if (type == Type::RYW) {
result = makeReference<ReadYourWritesTransaction>();
} else if (type == Type::SIMPLE_CONFIG) {
result = makeReference<SimpleConfigTransaction>();
} else {
result = makeReference<PaxosConfigTransaction>();
}
result->setDatabase(cx);
return result;
}

View File

@ -44,7 +44,8 @@ public:
PAXOS_CONFIG,
};
static ISingleThreadTransaction* allocateOnForeignThread(Type type);
static ISingleThreadTransaction* allocateOnForeignThread(Type);
static Reference<ISingleThreadTransaction> create(Type, Database const&);
virtual void setDatabase(Database const&) = 0;
virtual void setVersion(Version v) = 0;

25
fdbclient/NameLineage.cpp Normal file
View File

@ -0,0 +1,25 @@
/*
* NameLineage.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/NameLineage.h"
namespace {
NameLineageCollector nameLineageCollector;
}

42
fdbclient/NameLineage.h Normal file
View File

@ -0,0 +1,42 @@
/*
* NameLineage.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <string_view>
#include "fdbclient/ActorLineageProfiler.h"
struct NameLineage : LineageProperties<NameLineage> {
static constexpr std::string_view name = "Actor"sv;
const char* actorName;
};
struct NameLineageCollector : IALPCollector<NameLineage> {
NameLineageCollector() : IALPCollector() {}
std::optional<std::any> collect(ActorLineage* lineage) override {
auto str = lineage->get(&NameLineage::actorName);
if (str.has_value()) {
return std::string_view(*str, std::strlen(*str));
} else {
return {};
}
}
};

View File

@ -32,6 +32,8 @@
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/MultiInterface.h"
#include "fdbclient/ActorLineageProfiler.h"
#include "fdbclient/AnnotateActor.h"
#include "fdbclient/Atomic.h"
#include "fdbclient/ClusterInterface.h"
#include "fdbclient/CoordinationInterface.h"
@ -42,6 +44,7 @@
#include "fdbclient/KeyBackedTypes.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/NameLineage.h"
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/MonitorLeader.h"
#include "fdbclient/MutationList.h"
@ -50,6 +53,7 @@
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/TransactionLineage.h"
#include "fdbclient/versions.h"
#include "fdbrpc/LoadBalance.h"
#include "fdbrpc/Net2FileSystem.h"
@ -87,6 +91,9 @@ using std::pair;
namespace {
TransactionLineageCollector transactionLineageCollector;
NameLineageCollector nameLineageCollector;
template <class Interface, class Request>
Future<REPLY_TYPE(Request)> loadBalance(
DatabaseContext* ctx,
@ -147,16 +154,25 @@ void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageSe
result->second = tssi;
}
// data requests duplicated for load and data comparison
queueModel.updateTssEndpoint(ssi.getValue.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.getValue.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.getKey.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.getKey.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.getKeyValues.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.getKeyValues.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.watchValue.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.watchValue.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.getKeyValuesStream.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.getKeyValuesStream.getEndpoint(), metrics));
// non-data requests duplicated for load
queueModel.updateTssEndpoint(ssi.watchValue.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.watchValue.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.splitMetrics.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.splitMetrics.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.getReadHotRanges.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.getReadHotRanges.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.getRangeSplitPoints.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.getRangeSplitPoints.getEndpoint(), metrics));
}
}
@ -168,8 +184,12 @@ void DatabaseContext::removeTssMapping(StorageServerInterface const& ssi) {
queueModel.removeTssEndpoint(ssi.getValue.getEndpoint().token.first());
queueModel.removeTssEndpoint(ssi.getKey.getEndpoint().token.first());
queueModel.removeTssEndpoint(ssi.getKeyValues.getEndpoint().token.first());
queueModel.removeTssEndpoint(ssi.watchValue.getEndpoint().token.first());
queueModel.removeTssEndpoint(ssi.getKeyValuesStream.getEndpoint().token.first());
queueModel.removeTssEndpoint(ssi.watchValue.getEndpoint().token.first());
queueModel.removeTssEndpoint(ssi.splitMetrics.getEndpoint().token.first());
queueModel.removeTssEndpoint(ssi.getReadHotRanges.getEndpoint().token.first());
queueModel.removeTssEndpoint(ssi.getRangeSplitPoints.getEndpoint().token.first());
}
}
@ -353,7 +373,8 @@ void traceTSSErrors(const char* name, UID tssId, const std::unordered_map<int, u
ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
state double lastLogged = 0;
loop {
wait(delay(CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace));
wait(delay(CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace));
TraceEvent ev("TransactionMetrics", cx->dbId);
ev.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged)
@ -364,6 +385,7 @@ ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
cx->cc.logToTraceEvent(ev);
ev.detail("LocationCacheEntryCount", cx->locationCache.size());
ev.detail("MeanLatency", cx->latencies.mean())
.detail("MedianLatency", cx->latencies.median())
.detail("Latency90", cx->latencies.percentile(0.90))
@ -1260,6 +1282,14 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
std::make_unique<DataDistributionImpl>(
KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::ACTORLINEAGE,
SpecialKeySpace::IMPLTYPE::READONLY,
std::make_unique<ActorLineageImpl>(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTORLINEAGE)));
registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF,
SpecialKeySpace::IMPLTYPE::READWRITE,
std::make_unique<ActorProfilerConf>(SpecialKeySpace::getModuleRange(
SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF)));
}
if (apiVersionAtLeast(630)) {
registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::TRANSACTION,
@ -1761,6 +1791,8 @@ Database Database::createDatabase(Reference<ClusterConnectionFile> connFile,
auto database = Database(db);
GlobalConfig::create(
database, Reference<AsyncVar<ClientDBInfo> const>(clientInfo), std::addressof(clientInfo->get()));
GlobalConfig::globalConfig().trigger(samplingFrequency, samplingProfilerUpdateFrequency);
GlobalConfig::globalConfig().trigger(samplingWindow, samplingProfilerUpdateWindow);
return database;
}
@ -2744,8 +2776,10 @@ ACTOR Future<Version> watchValue(Future<Version> version,
cx->invalidateCache(key);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, info.taskID));
} else if (e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind) {
TEST(e.code() == error_code_watch_cancelled); // Too many watches on storage server, poll for changes
// clang-format off
TEST(e.code() == error_code_watch_cancelled); // Too many watches on the storage server, poll for changes instead
TEST(e.code() == error_code_process_behind); // The storage servers are all behind
// clang-format on
wait(delay(CLIENT_KNOBS->WATCH_POLLING_TIME, info.taskID));
} else if (e.code() == error_code_timed_out) { // The storage server occasionally times out watches in case
// it was cancelled
@ -3333,6 +3367,7 @@ ACTOR Future<RangeResult> getRange(Database cx,
throw deterministicRandom()->randomChoice(
std::vector<Error>{ transaction_too_old(), future_version() });
}
// state AnnotateActor annotation(currentLineage);
GetKeyValuesReply _rep =
wait(loadBalance(cx.getPtr(),
beginServer.second,
@ -4093,8 +4128,7 @@ SpanID generateSpanID(int transactionTracingEnabled) {
}
}
Transaction::Transaction()
: info(TaskPriority::DefaultEndpoint, generateSpanID(true)), span(info.spanID, "Transaction"_loc) {}
Transaction::Transaction() : info(TaskPriority::DefaultEndpoint, generateSpanID(true)) {}
Transaction::Transaction(Database const& cx)
: info(cx->taskID, generateSpanID(cx->transactionTracingEnabled)), numErrors(0), options(cx),
@ -6340,7 +6374,7 @@ void enableClientInfoLogging() {
}
ACTOR Future<Void> snapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID) {
TraceEvent("SnapCreateEnter").detail("SnapCmd", snapCmd.toString()).detail("UID", snapUID);
TraceEvent("SnapCreateEnter").detail("SnapCmd", snapCmd).detail("UID", snapUID);
try {
loop {
choose {
@ -6350,7 +6384,7 @@ ACTOR Future<Void> snapCreate(Database cx, Standalone<StringRef> snapCmd, UID sn
ProxySnapRequest(snapCmd, snapUID, snapUID),
cx->taskID,
AtMostOnce::True))) {
TraceEvent("SnapCreateExit").detail("SnapCmd", snapCmd.toString()).detail("UID", snapUID);
TraceEvent("SnapCreateExit").detail("SnapCmd", snapCmd).detail("UID", snapUID);
return Void();
}
}
@ -6530,3 +6564,7 @@ ACTOR Future<Void> setPerpetualStorageWiggle(Database cx, bool enable, LockAware
}
return Void();
}
Reference<DatabaseContext::TransactionT> DatabaseContext::createTransaction() {
return makeReference<ReadYourWritesTransaction>(Database(Reference<DatabaseContext>::addRef(this)));
}

View File

@ -106,6 +106,7 @@ public:
inline DatabaseContext* getPtr() const { return db.getPtr(); }
inline DatabaseContext* extractPtr() { return db.extractPtr(); }
DatabaseContext* operator->() const { return db.getPtr(); }
Reference<DatabaseContext> getReference() const { return db; }
const UniqueOrderedOptionList<FDBTransactionOptions>& getTransactionDefaults() const;
@ -180,6 +181,9 @@ struct TransactionInfo {
// prefix/<key2> : '0' - any keys equal or larger than this key are (definitely) not conflicting keys
std::shared_ptr<CoalescedKeyRangeMap<Value>> conflictingKeys;
// Only available so that Transaction can have a default constructor, for use in state variables
TransactionInfo() : taskID(), spanID(), useProvisionalProxies() {}
explicit TransactionInfo(TaskPriority taskID, SpanID spanID)
: taskID(taskID), spanID(spanID), useProvisionalProxies(false) {}
};

42
fdbclient/PImpl.h Normal file
View File

@ -0,0 +1,42 @@
/*
* PImpl.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <memory>
template <class T>
class PImpl {
std::unique_ptr<T> impl;
struct ConstructorTag {};
template <class... Args>
PImpl(ConstructorTag, Args&&... args) : impl(std::make_unique<T>(std::forward<Args>(args)...)) {}
public:
PImpl() = default;
template <class... Args>
static PImpl create(Args&&... args) {
return PImpl(ConstructorTag{}, std::forward<Args>(args)...);
}
T& operator*() { return *impl; }
T const& operator*() const { return *impl; }
T* operator->() { return impl.get(); }
T const* operator->() const { return impl.get(); }
};

View File

@ -22,24 +22,87 @@
#include "fdbclient/PaxosConfigTransaction.h"
#include "flow/actorcompiler.h" // must be last include
// TODO: Some replicas may reply after quorum has already been achieved, and we may want to add them to the readReplicas
// list
class GetGenerationQuorum {
public:
struct Result {
ConfigGeneration generation;
std::vector<ConfigTransactionInterface> readReplicas;
Result(ConfigGeneration const& generation, std::vector<ConfigTransactionInterface> const& readReplicas)
: generation(generation), readReplicas(readReplicas) {}
Result() = default;
};
private:
std::vector<Future<Void>> futures;
std::map<ConfigGeneration, std::vector<ConfigTransactionInterface>> seenGenerations;
Promise<Result> result;
size_t totalRepliesReceived{ 0 };
size_t maxAgreement{ 0 };
size_t size{ 0 };
Optional<Version> lastSeenLiveVersion;
ACTOR static Future<Void> addRequestActor(GetGenerationQuorum* self, ConfigTransactionInterface cti) {
ConfigTransactionGetGenerationReply reply =
wait(cti.getGeneration.getReply(ConfigTransactionGetGenerationRequest{ self->lastSeenLiveVersion }));
++self->totalRepliesReceived;
auto gen = reply.generation;
self->lastSeenLiveVersion = std::max(gen.liveVersion, self->lastSeenLiveVersion.orDefault(::invalidVersion));
auto& replicas = self->seenGenerations[gen];
replicas.push_back(cti);
self->maxAgreement = std::max(replicas.size(), self->maxAgreement);
if (replicas.size() == self->size / 2 + 1) {
self->result.send(Result{ gen, replicas });
} else if (self->maxAgreement + (self->size - self->totalRepliesReceived) < (self->size / 2 + 1)) {
self->result.sendError(failed_to_reach_quorum());
}
return Void();
}
public:
GetGenerationQuorum(size_t size, Optional<Version> const& lastSeenLiveVersion)
: size(size), lastSeenLiveVersion(lastSeenLiveVersion) {
futures.reserve(size);
}
void addRequest(ConfigTransactionInterface cti) { futures.push_back(addRequestActor(this, cti)); }
Future<Result> getResult() const { return result.getFuture(); }
Optional<Version> getLastSeenLiveVersion() const { return lastSeenLiveVersion; }
};
class PaxosConfigTransactionImpl {
ConfigTransactionCommitRequest toCommit;
Future<ConfigGeneration> getGenerationFuture;
Future<GetGenerationQuorum::Result> getGenerationFuture;
std::vector<ConfigTransactionInterface> ctis;
int numRetries{ 0 };
bool committed{ false };
Optional<Version> lastSeenLiveVersion;
Optional<UID> dID;
Database cx;
std::vector<ConfigTransactionInterface> readReplicas;
ACTOR static Future<ConfigGeneration> getGeneration(PaxosConfigTransactionImpl* self) {
state std::vector<Future<ConfigTransactionGetGenerationReply>> getGenerationFutures;
getGenerationFutures.reserve(self->ctis.size());
for (auto const& cti : self->ctis) {
getGenerationFutures.push_back(cti.getGeneration.getReply(ConfigTransactionGetGenerationRequest{}));
ACTOR static Future<GetGenerationQuorum::Result> getGeneration(PaxosConfigTransactionImpl* self) {
state GetGenerationQuorum quorum(self->ctis.size(), self->lastSeenLiveVersion);
state int retries = 0;
loop {
for (auto const& cti : self->ctis) {
quorum.addRequest(cti);
}
try {
state GetGenerationQuorum::Result result = wait(quorum.getResult());
wait(delay(0.0)); // Let reply callback actors finish before destructing quorum
return result;
} catch (Error& e) {
if (e.code() == error_code_failed_to_reach_quorum) {
TEST(true); // Failed to reach quorum getting generation
wait(delayJittered(0.01 * (1 << retries)));
++retries;
} else {
throw e;
}
}
self->lastSeenLiveVersion = quorum.getLastSeenLiveVersion();
}
// FIXME: Must tolerate failures and disagreement
wait(waitForAll(getGenerationFutures));
return getGenerationFutures[0].get().generation;
}
ACTOR static Future<Optional<Value>> get(PaxosConfigTransactionImpl* self, Key key) {
@ -47,10 +110,10 @@ class PaxosConfigTransactionImpl {
self->getGenerationFuture = getGeneration(self);
}
state ConfigKey configKey = ConfigKey::decodeKey(key);
ConfigGeneration generation = wait(self->getGenerationFuture);
GetGenerationQuorum::Result genResult = wait(self->getGenerationFuture);
// TODO: Load balance
ConfigTransactionGetReply reply =
wait(self->ctis[0].get.getReply(ConfigTransactionGetRequest{ generation, configKey }));
ConfigTransactionGetReply reply = wait(
genResult.readReplicas[0].get.getReply(ConfigTransactionGetRequest{ genResult.generation, configKey }));
if (reply.value.present()) {
return reply.value.get().toValue();
} else {
@ -62,10 +125,10 @@ class PaxosConfigTransactionImpl {
if (!self->getGenerationFuture.isValid()) {
self->getGenerationFuture = getGeneration(self);
}
ConfigGeneration generation = wait(self->getGenerationFuture);
GetGenerationQuorum::Result genResult = wait(self->getGenerationFuture);
// TODO: Load balance
ConfigTransactionGetConfigClassesReply reply =
wait(self->ctis[0].getClasses.getReply(ConfigTransactionGetConfigClassesRequest{ generation }));
ConfigTransactionGetConfigClassesReply reply = wait(genResult.readReplicas[0].getClasses.getReply(
ConfigTransactionGetConfigClassesRequest{ genResult.generation }));
RangeResult result;
result.reserve(result.arena(), reply.configClasses.size());
for (const auto& configClass : reply.configClasses) {
@ -78,10 +141,10 @@ class PaxosConfigTransactionImpl {
if (!self->getGenerationFuture.isValid()) {
self->getGenerationFuture = getGeneration(self);
}
ConfigGeneration generation = wait(self->getGenerationFuture);
GetGenerationQuorum::Result genResult = wait(self->getGenerationFuture);
// TODO: Load balance
ConfigTransactionGetKnobsReply reply =
wait(self->ctis[0].getKnobs.getReply(ConfigTransactionGetKnobsRequest{ generation, configClass }));
ConfigTransactionGetKnobsReply reply = wait(genResult.readReplicas[0].getKnobs.getReply(
ConfigTransactionGetKnobsRequest{ genResult.generation, configClass }));
RangeResult result;
result.reserve(result.arena(), reply.knobNames.size());
for (const auto& knobName : reply.knobNames) {
@ -94,10 +157,13 @@ class PaxosConfigTransactionImpl {
if (!self->getGenerationFuture.isValid()) {
self->getGenerationFuture = getGeneration(self);
}
wait(store(self->toCommit.generation, self->getGenerationFuture));
GetGenerationQuorum::Result genResult = wait(self->getGenerationFuture);
self->toCommit.generation = genResult.generation;
self->toCommit.annotation.timestamp = now();
std::vector<Future<Void>> commitFutures;
commitFutures.reserve(self->ctis.size());
// Send commit message to all replicas, even those that did not return the used replica.
// This way, slow replicas are kept up date.
for (const auto& cti : self->ctis) {
commitFutures.push_back(cti.commit.getReply(self->toCommit));
}
@ -112,18 +178,20 @@ public:
if (!getGenerationFuture.isValid()) {
getGenerationFuture = getGeneration(this);
}
return map(getGenerationFuture, [](auto const& gen) { return gen.committedVersion; });
return map(getGenerationFuture, [](auto const& genResult) { return genResult.generation.committedVersion; });
}
Optional<Version> getCachedReadVersion() const {
if (getGenerationFuture.isValid() && getGenerationFuture.isReady() && !getGenerationFuture.isError()) {
return getGenerationFuture.get().committedVersion;
return getGenerationFuture.get().generation.committedVersion;
} else {
return {};
}
}
Version getCommittedVersion() const { return committed ? getGenerationFuture.get().liveVersion : ::invalidVersion; }
Version getCommittedVersion() const {
return committed ? getGenerationFuture.get().generation.liveVersion : ::invalidVersion;
}
int64_t getApproximateSize() const { return toCommit.expectedSize(); }
@ -158,7 +226,7 @@ public:
void debugTransaction(UID dID) { this->dID = dID; }
void reset() {
getGenerationFuture = Future<ConfigGeneration>{};
getGenerationFuture = Future<GetGenerationQuorum::Result>{};
toCommit = {};
committed = false;
}
@ -192,15 +260,15 @@ public:
};
Future<Version> PaxosConfigTransaction::getReadVersion() {
return impl().getReadVersion();
return impl->getReadVersion();
}
Optional<Version> PaxosConfigTransaction::getCachedReadVersion() const {
return impl().getCachedReadVersion();
return impl->getCachedReadVersion();
}
Future<Optional<Value>> PaxosConfigTransaction::get(Key const& key, Snapshot) {
return impl().get(key);
return impl->get(key);
}
Future<RangeResult> PaxosConfigTransaction::getRange(KeySelector const& begin,
@ -211,7 +279,7 @@ Future<RangeResult> PaxosConfigTransaction::getRange(KeySelector const& begin,
if (reverse) {
throw client_invalid_operation();
}
return impl().getRange(KeyRangeRef(begin.getKey(), end.getKey()));
return impl->getRange(KeyRangeRef(begin.getKey(), end.getKey()));
}
Future<RangeResult> PaxosConfigTransaction::getRange(KeySelector begin,
@ -222,27 +290,27 @@ Future<RangeResult> PaxosConfigTransaction::getRange(KeySelector begin,
if (reverse) {
throw client_invalid_operation();
}
return impl().getRange(KeyRangeRef(begin.getKey(), end.getKey()));
return impl->getRange(KeyRangeRef(begin.getKey(), end.getKey()));
}
void PaxosConfigTransaction::set(KeyRef const& key, ValueRef const& value) {
return impl().set(key, value);
return impl->set(key, value);
}
void PaxosConfigTransaction::clear(KeyRef const& key) {
return impl().clear(key);
return impl->clear(key);
}
Future<Void> PaxosConfigTransaction::commit() {
return impl().commit();
return impl->commit();
}
Version PaxosConfigTransaction::getCommittedVersion() const {
return impl().getCommittedVersion();
return impl->getCommittedVersion();
}
int64_t PaxosConfigTransaction::getApproximateSize() const {
return impl().getApproximateSize();
return impl->getApproximateSize();
}
void PaxosConfigTransaction::setOption(FDBTransactionOptions::Option option, Optional<StringRef> value) {
@ -250,7 +318,7 @@ void PaxosConfigTransaction::setOption(FDBTransactionOptions::Option option, Opt
}
Future<Void> PaxosConfigTransaction::onError(Error const& e) {
return impl().onError(e);
return impl->onError(e);
}
void PaxosConfigTransaction::cancel() {
@ -259,28 +327,28 @@ void PaxosConfigTransaction::cancel() {
}
void PaxosConfigTransaction::reset() {
impl().reset();
impl->reset();
}
void PaxosConfigTransaction::fullReset() {
impl().fullReset();
impl->fullReset();
}
void PaxosConfigTransaction::debugTransaction(UID dID) {
impl().debugTransaction(dID);
impl->debugTransaction(dID);
}
void PaxosConfigTransaction::checkDeferredError() const {
impl().checkDeferredError(deferredError);
impl->checkDeferredError(deferredError);
}
PaxosConfigTransaction::PaxosConfigTransaction(std::vector<ConfigTransactionInterface> const& ctis)
: _impl(std::make_unique<PaxosConfigTransactionImpl>(ctis)) {}
: impl(PImpl<PaxosConfigTransactionImpl>::create(ctis)) {}
PaxosConfigTransaction::PaxosConfigTransaction() = default;
PaxosConfigTransaction::~PaxosConfigTransaction() = default;
void PaxosConfigTransaction::setDatabase(Database const& cx) {
_impl = std::make_unique<PaxosConfigTransactionImpl>(cx);
impl = PImpl<PaxosConfigTransactionImpl>::create(cx);
}

View File

@ -23,14 +23,13 @@
#include <memory>
#include "fdbclient/IConfigTransaction.h"
#include "fdbclient/PImpl.h"
/*
* Fault-tolerant configuration transaction implementation
*/
class PaxosConfigTransaction final : public IConfigTransaction, public FastAllocated<PaxosConfigTransaction> {
std::unique_ptr<class PaxosConfigTransactionImpl> _impl;
PaxosConfigTransactionImpl const& impl() const { return *_impl; }
PaxosConfigTransactionImpl& impl() { return *_impl; }
PImpl<class PaxosConfigTransactionImpl> impl;
public:
PaxosConfigTransaction(std::vector<ConfigTransactionInterface> const&);

View File

@ -0,0 +1,81 @@
/*
* ProcessInterface.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/AnnotateActor.h"
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/fdbrpc.h"
constexpr UID WLTOKEN_PROCESS(-1, 12);
struct ProcessInterface {
constexpr static FileIdentifier file_identifier = 985636;
RequestStream<struct GetProcessInterfaceRequest> getInterface;
RequestStream<struct ActorLineageRequest> actorLineage;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, actorLineage);
}
};
struct GetProcessInterfaceRequest {
constexpr static FileIdentifier file_identifier = 7632546;
ReplyPromise<ProcessInterface> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, reply);
}
};
// This type is used to send serialized sample data over the network.
struct SerializedSample {
constexpr static FileIdentifier file_identifier = 15785634;
double time;
std::unordered_map<WaitState, std::string> data;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, time, data);
}
};
struct ActorLineageReply {
constexpr static FileIdentifier file_identifier = 1887656;
std::vector<SerializedSample> samples;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, samples);
}
};
struct ActorLineageRequest {
constexpr static FileIdentifier file_identifier = 11654765;
WaitState waitStateStart, waitStateEnd;
time_t timeStart, timeEnd;
ReplyPromise<ActorLineageReply> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, waitStateStart, waitStateEnd, timeStart, timeEnd, reply);
}
};

View File

@ -175,6 +175,9 @@ public:
void setSpecialKeySpaceErrorMsg(const std::string& msg) { specialKeySpaceErrorMsg = msg; }
Transaction& getTransaction() { return tr; }
// used in template functions as returned Future type
template<typename Type> using FutureT = Future<Type>;
private:
friend class RYWImpl;

View File

@ -64,6 +64,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( TLOG_MESSAGE_BLOCK_BYTES, 10e6 );
init( TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR, double(TLOG_MESSAGE_BLOCK_BYTES) / (TLOG_MESSAGE_BLOCK_BYTES - MAX_MESSAGE_SIZE) ); //1.0121466709838096006362758832473
init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = deterministicRandom()->coinflip() ? 0.1 : 120;
init( PEEK_USING_STREAMING, true );
init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2;
init( MULTI_CURSOR_PRE_FETCH_LIMIT, 10 );
init( MAX_QUEUE_COMMIT_BYTES, 15e6 ); if( randomize && BUGGIFY ) MAX_QUEUE_COMMIT_BYTES = 5000;
@ -627,6 +628,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( FETCH_KEYS_TOO_LONG_TIME_CRITERIA, 300.0 );
init( MAX_STORAGE_COMMIT_TIME, 120.0 ); //The max fsync stall time on the storage server and tlog before marking a disk as failed
init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1;
init( ENABLE_CLEAR_RANGE_EAGER_READS, true );
//Wait Failure
init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;

View File

@ -41,6 +41,7 @@ public:
// often, so that versions always advance smoothly
// TLogs
bool PEEK_USING_STREAMING;
double TLOG_TIMEOUT; // tlog OR commit proxy failure - master's reaction time
double TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS; // Warns if a tlog takes too long to rejoin
double RECOVERY_TLOG_SMART_QUORUM_DELAY; // smaller might be better for bug amplification
@ -563,6 +564,7 @@ public:
double FETCH_KEYS_TOO_LONG_TIME_CRITERIA;
double MAX_STORAGE_COMMIT_TIME;
int64_t RANGESTREAM_LIMIT_BYTES;
bool ENABLE_CLEAR_RANGE_EAGER_READS;
// Wait Failure
int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;

View File

@ -198,15 +198,15 @@ public:
}; // SimpleConfigTransactionImpl
Future<Version> SimpleConfigTransaction::getReadVersion() {
return impl().getReadVersion();
return impl->getReadVersion();
}
Optional<Version> SimpleConfigTransaction::getCachedReadVersion() const {
return impl().getCachedReadVersion();
return impl->getCachedReadVersion();
}
Future<Optional<Value>> SimpleConfigTransaction::get(Key const& key, Snapshot snapshot) {
return impl().get(key);
return impl->get(key);
}
Future<RangeResult> SimpleConfigTransaction::getRange(KeySelector const& begin,
@ -217,7 +217,7 @@ Future<RangeResult> SimpleConfigTransaction::getRange(KeySelector const& begin,
if (reverse) {
throw client_invalid_operation();
}
return impl().getRange(KeyRangeRef(begin.getKey(), end.getKey()));
return impl->getRange(KeyRangeRef(begin.getKey(), end.getKey()));
}
Future<RangeResult> SimpleConfigTransaction::getRange(KeySelector begin,
@ -228,27 +228,27 @@ Future<RangeResult> SimpleConfigTransaction::getRange(KeySelector begin,
if (reverse) {
throw client_invalid_operation();
}
return impl().getRange(KeyRangeRef(begin.getKey(), end.getKey()));
return impl->getRange(KeyRangeRef(begin.getKey(), end.getKey()));
}
void SimpleConfigTransaction::set(KeyRef const& key, ValueRef const& value) {
impl().set(key, value);
impl->set(key, value);
}
void SimpleConfigTransaction::clear(KeyRef const& key) {
impl().clear(key);
impl->clear(key);
}
Future<Void> SimpleConfigTransaction::commit() {
return impl().commit();
return impl->commit();
}
Version SimpleConfigTransaction::getCommittedVersion() const {
return impl().getCommittedVersion();
return impl->getCommittedVersion();
}
int64_t SimpleConfigTransaction::getApproximateSize() const {
return impl().getApproximateSize();
return impl->getApproximateSize();
}
void SimpleConfigTransaction::setOption(FDBTransactionOptions::Option option, Optional<StringRef> value) {
@ -256,7 +256,7 @@ void SimpleConfigTransaction::setOption(FDBTransactionOptions::Option option, Op
}
Future<Void> SimpleConfigTransaction::onError(Error const& e) {
return impl().onError(e);
return impl->onError(e);
}
void SimpleConfigTransaction::cancel() {
@ -265,27 +265,27 @@ void SimpleConfigTransaction::cancel() {
}
void SimpleConfigTransaction::reset() {
return impl().reset();
return impl->reset();
}
void SimpleConfigTransaction::fullReset() {
return impl().fullReset();
return impl->fullReset();
}
void SimpleConfigTransaction::debugTransaction(UID dID) {
impl().debugTransaction(dID);
impl->debugTransaction(dID);
}
void SimpleConfigTransaction::checkDeferredError() const {
impl().checkDeferredError(deferredError);
impl->checkDeferredError(deferredError);
}
void SimpleConfigTransaction::setDatabase(Database const& cx) {
_impl = std::make_unique<SimpleConfigTransactionImpl>(cx);
impl = PImpl<SimpleConfigTransactionImpl>::create(cx);
}
SimpleConfigTransaction::SimpleConfigTransaction(ConfigTransactionInterface const& cti)
: _impl(std::make_unique<SimpleConfigTransactionImpl>(cti)) {}
: impl(PImpl<SimpleConfigTransactionImpl>::create(cti)) {}
SimpleConfigTransaction::SimpleConfigTransaction() = default;

View File

@ -27,6 +27,7 @@
#include "fdbclient/CoordinationInterface.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/IConfigTransaction.h"
#include "fdbclient/PImpl.h"
#include "flow/Error.h"
#include "flow/flow.h"
@ -36,9 +37,7 @@
* (the lowest coordinator by IP address), so there is no fault tolerance.
*/
class SimpleConfigTransaction final : public IConfigTransaction, public FastAllocated<SimpleConfigTransaction> {
std::unique_ptr<class SimpleConfigTransactionImpl> _impl;
SimpleConfigTransactionImpl const& impl() const { return *_impl; }
SimpleConfigTransactionImpl& impl() { return *_impl; }
PImpl<class SimpleConfigTransactionImpl> impl;
public:
SimpleConfigTransaction(ConfigTransactionInterface const&);

View File

@ -21,6 +21,14 @@
#include "boost/lexical_cast.hpp"
#include "boost/algorithm/string.hpp"
#include <time.h>
#include <msgpack.hpp>
#include <exception>
#include "fdbclient/ActorLineageProfiler.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/ProcessInterface.h"
#include "fdbclient/GlobalConfig.actor.h"
#include "fdbclient/SpecialKeySpace.actor.h"
#include "flow/Arena.h"
@ -67,7 +75,12 @@ std::unordered_map<SpecialKeySpace::MODULE, KeyRange> SpecialKeySpace::moduleToB
{ SpecialKeySpace::MODULE::GLOBALCONFIG,
KeyRangeRef(LiteralStringRef("\xff\xff/global_config/"), LiteralStringRef("\xff\xff/global_config0")) },
{ SpecialKeySpace::MODULE::TRACING,
KeyRangeRef(LiteralStringRef("\xff\xff/tracing/"), LiteralStringRef("\xff\xff/tracing0")) }
KeyRangeRef(LiteralStringRef("\xff\xff/tracing/"), LiteralStringRef("\xff\xff/tracing0")) },
{ SpecialKeySpace::MODULE::ACTORLINEAGE,
KeyRangeRef(LiteralStringRef("\xff\xff/actor_lineage/"), LiteralStringRef("\xff\xff/actor_lineage0")) },
{ SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF,
KeyRangeRef(LiteralStringRef("\xff\xff/actor_profiler_conf/"),
LiteralStringRef("\xff\xff/actor_profiler_conf0")) }
};
std::unordered_map<std::string, KeyRange> SpecialKeySpace::managementApiCommandToRange = {
@ -104,6 +117,15 @@ std::unordered_map<std::string, KeyRange> SpecialKeySpace::managementApiCommandT
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }
};
std::unordered_map<std::string, KeyRange> SpecialKeySpace::actorLineageApiCommandToRange = {
{ "state",
KeyRangeRef(LiteralStringRef("state/"), LiteralStringRef("state0"))
.withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) },
{ "time",
KeyRangeRef(LiteralStringRef("time/"), LiteralStringRef("time0"))
.withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) }
};
std::set<std::string> SpecialKeySpace::options = { "excluded/force",
"failed/force",
"excluded_locality/force",
@ -476,10 +498,10 @@ void SpecialKeySpace::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& r
auto begin = writeImpls[range.begin];
auto end = writeImpls.rangeContainingKeyBefore(range.end)->value();
if (begin != end) {
TraceEvent(SevDebug, "SpecialKeySpaceCrossModuleClear").detail("Range", range.toString());
TraceEvent(SevDebug, "SpecialKeySpaceCrossModuleClear").detail("Range", range);
throw special_keys_cross_module_clear(); // ban cross module clear
} else if (begin == nullptr) {
TraceEvent(SevDebug, "SpecialKeySpaceNoWriteModuleFound").detail("Range", range.toString());
TraceEvent(SevDebug, "SpecialKeySpaceNoWriteModuleFound").detail("Range", range);
throw special_keys_no_write_module_found();
}
return begin->clear(ryw, range);
@ -1918,6 +1940,300 @@ void ClientProfilingImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& ke
"Clear operation is forbidden for profile client. You can set it to default to disable profiling.");
}
ActorLineageImpl::ActorLineageImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {}
void parse(StringRef& val, int& i) {
i = std::stoi(val.toString());
}
void parse(StringRef& val, double& d) {
d = std::stod(val.toString());
}
void parse(StringRef& val, WaitState& w) {
if (val == LiteralStringRef("disk") || val == LiteralStringRef("Disk")) {
w = WaitState::Disk;
} else if (val == LiteralStringRef("network") || val == LiteralStringRef("Network")) {
w = WaitState::Network;
} else if (val == LiteralStringRef("running") || val == LiteralStringRef("Running")) {
w = WaitState::Running;
} else {
throw std::range_error("failed to parse run state");
}
}
void parse(StringRef& val, time_t& t) {
struct tm tm = { 0 };
#ifdef _WIN32
std::istringstream s(val.toString());
s.imbue(std::locale(setlocale(LC_TIME, nullptr)));
s >> std::get_time(&tm, "%FT%T%z");
if (s.fail()) {
throw std::invalid_argument("failed to parse ISO 8601 datetime");
}
long timezone;
if (_get_timezone(&timezone) != 0) {
throw std::runtime_error("failed to convert ISO 8601 datetime");
}
timezone = -timezone;
#else
if (strptime(val.toString().c_str(), "%FT%T%z", &tm) == nullptr) {
throw std::invalid_argument("failed to parse ISO 8601 datetime");
}
long timezone = tm.tm_gmtoff;
t = timegm(&tm);
if (t == -1) {
throw std::runtime_error("failed to convert ISO 8601 datetime");
}
t -= timezone;
#endif
}
void parse(StringRef& val, NetworkAddress& a) {
auto address = NetworkAddress::parse(val.toString());
if (!address.isValid()) {
throw std::invalid_argument("invalid host");
}
a = address;
}
// Base case function for parsing function below.
template <typename T>
void parse(std::vector<StringRef>::iterator it, std::vector<StringRef>::iterator end, T& t1) {
if (it == end) {
return;
}
parse(*it, t1);
}
// Given an iterator into a vector of string tokens, an iterator to the end of
// the search space in the vector (exclusive), and a list of references to
// types, parses each token in the vector into the associated type according to
// the order of the arguments.
//
// For example, given the vector ["1", "1.5", "127.0.0.1:4000"] and the
// argument list int a, double b, NetworkAddress c, after this function returns
// each parameter passed in will hold the parsed value from the token list.
//
// The appropriate parsing function must be implemented for the type you wish
// to parse. See the existing parsing functions above, and add your own if
// necessary.
template <typename T, typename... Types>
void parse(std::vector<StringRef>::iterator it, std::vector<StringRef>::iterator end, T& t1, Types&... remaining) {
// Return as soon as all tokens have been parsed. This allows parameters
// passed at the end to act as optional parameters -- they will only be set
// if the value exists.
if (it == end) {
return;
}
try {
parse(*it, t1);
parse(++it, end, remaining...);
} catch (Error& e) {
throw e;
} catch (std::exception& e) {
throw e;
}
}
ACTOR static Future<RangeResult> actorLineageGetRangeActor(ReadYourWritesTransaction* ryw,
KeyRef prefix,
KeyRangeRef kr) {
state RangeResult result;
// Set default values for all fields. The default will be used if the field
// is missing in the key.
state NetworkAddress host;
state WaitState waitStateStart = WaitState{ 0 };
state WaitState waitStateEnd = WaitState{ 2 };
state time_t timeStart = 0;
state time_t timeEnd = std::numeric_limits<time_t>::max();
state int seqStart = 0;
state int seqEnd = std::numeric_limits<int>::max();
state std::vector<StringRef> beginValues = kr.begin.removePrefix(prefix).splitAny("/"_sr);
state std::vector<StringRef> endValues = kr.end.removePrefix(prefix).splitAny("/"_sr);
// Require index (either "state" or "time") and address:port.
if (beginValues.size() < 2 || endValues.size() < 2) {
ryw->setSpecialKeySpaceErrorMsg("missing required parameters (index, host)");
throw special_keys_api_failure();
}
state NetworkAddress endRangeHost;
try {
if (SpecialKeySpace::getActorLineageApiCommandRange("state").contains(kr)) {
// For the range \xff\xff/actor_lineage/state/ip:port/wait-state/time/seq
parse(beginValues.begin() + 1, beginValues.end(), host, waitStateStart, timeStart, seqStart);
if (kr.begin != kr.end) {
parse(endValues.begin() + 1, endValues.end(), endRangeHost, waitStateEnd, timeEnd, seqEnd);
}
} else if (SpecialKeySpace::getActorLineageApiCommandRange("time").contains(kr)) {
// For the range \xff\xff/actor_lineage/time/ip:port/time/wait-state/seq
parse(beginValues.begin() + 1, beginValues.end(), host, timeStart, waitStateStart, seqStart);
if (kr.begin != kr.end) {
parse(endValues.begin() + 1, endValues.end(), endRangeHost, timeEnd, waitStateEnd, seqEnd);
}
} else {
ryw->setSpecialKeySpaceErrorMsg("invalid index in actor_lineage");
throw special_keys_api_failure();
}
} catch (Error& e) {
if (e.code() != special_keys_api_failure().code()) {
ryw->setSpecialKeySpaceErrorMsg("failed to parse key");
throw special_keys_api_failure();
} else {
throw e;
}
}
if (kr.begin != kr.end && host != endRangeHost) {
// The client doesn't know about all the hosts, so a get range covering
// multiple hosts has no way of knowing which IP:port combos to use.
ryw->setSpecialKeySpaceErrorMsg("the host must remain the same on both ends of the range");
throw special_keys_api_failure();
}
// Open endpoint to target process on each call. This can be optimized at
// some point...
state ProcessInterface process;
process.getInterface = RequestStream<GetProcessInterfaceRequest>(Endpoint({ host }, WLTOKEN_PROCESS));
ProcessInterface p = wait(retryBrokenPromise(process.getInterface, GetProcessInterfaceRequest{}));
process = p;
ActorLineageRequest actorLineageRequest;
actorLineageRequest.waitStateStart = waitStateStart;
actorLineageRequest.waitStateEnd = waitStateEnd;
actorLineageRequest.timeStart = timeStart;
actorLineageRequest.timeEnd = timeEnd;
ActorLineageReply reply = wait(process.actorLineage.getReply(actorLineageRequest));
time_t dt = 0;
int seq = -1;
for (const auto& sample : reply.samples) {
time_t datetime = (time_t)sample.time;
char buf[50];
struct tm* tm;
tm = localtime(&datetime);
size_t size = strftime(buf, 50, "%FT%T%z", tm);
std::string date(buf, size);
seq = dt == datetime ? seq + 1 : 0;
dt = datetime;
for (const auto& [waitState, data] : sample.data) {
if (seq < seqStart) { continue; }
else if (seq >= seqEnd) { break; }
std::ostringstream streamKey;
if (SpecialKeySpace::getActorLineageApiCommandRange("state").contains(kr)) {
streamKey << SpecialKeySpace::getActorLineageApiCommandPrefix("state").toString() << host.toString()
<< "/" << to_string(waitState) << "/" << date;
} else if (SpecialKeySpace::getActorLineageApiCommandRange("time").contains(kr)) {
streamKey << SpecialKeySpace::getActorLineageApiCommandPrefix("time").toString() << host.toString()
<< "/" << date << "/" << to_string(waitState);
} else {
ASSERT(false);
}
streamKey << "/" << seq;
msgpack::object_handle oh = msgpack::unpack(data.data(), data.size());
msgpack::object deserialized = oh.get();
std::ostringstream stream;
stream << deserialized;
result.push_back_deep(result.arena(), KeyValueRef(streamKey.str(), stream.str()));
}
if (sample.data.size() == 0) {
std::ostringstream streamKey;
if (SpecialKeySpace::getActorLineageApiCommandRange("state").contains(kr)) {
streamKey << SpecialKeySpace::getActorLineageApiCommandPrefix("state").toString() << host.toString()
<< "/Running/" << date;
} else if (SpecialKeySpace::getActorLineageApiCommandRange("time").contains(kr)) {
streamKey << SpecialKeySpace::getActorLineageApiCommandPrefix("time").toString() << host.toString()
<< "/" << date << "/Running";
} else {
ASSERT(false);
}
streamKey << "/" << seq;
result.push_back_deep(result.arena(), KeyValueRef(streamKey.str(), "{}"_sr));
}
}
return result;
}
Future<RangeResult> ActorLineageImpl::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const {
return actorLineageGetRangeActor(ryw, getKeyRange().begin, kr);
}
namespace {
std::string_view to_string_view(StringRef sr) {
return std::string_view(reinterpret_cast<const char*>(sr.begin()), sr.size());
}
} // namespace
ActorProfilerConf::ActorProfilerConf(KeyRangeRef kr)
: SpecialKeyRangeRWImpl(kr), config(ProfilerConfig::instance().getConfig()) {}
Future<RangeResult> ActorProfilerConf::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const {
RangeResult res;
std::string_view begin(to_string_view(kr.begin.removePrefix(range.begin))),
end(to_string_view(kr.end.removePrefix(range.begin)));
for (auto& p : config) {
if (p.first > end) {
break;
} else if (p.first > begin) {
KeyValueRef kv;
kv.key = StringRef(res.arena(), p.first).withPrefix(kr.begin, res.arena());
kv.value = StringRef(res.arena(), p.second);
res.push_back(res.arena(), kv);
}
}
return res;
}
void ActorProfilerConf::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) {
config[key.removePrefix(range.begin).toString()] = value.toString();
ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional<Value>(value)));
didWrite = true;
}
void ActorProfilerConf::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& kr) {
std::string begin(kr.begin.removePrefix(range.begin).toString()), end(kr.end.removePrefix(range.begin).toString());
auto first = config.lower_bound(begin);
if (first == config.end()) {
// nothing to clear
return;
}
didWrite = true;
auto last = config.upper_bound(end);
config.erase(first, last);
}
void ActorProfilerConf::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) {
std::string k = key.removePrefix(range.begin).toString();
auto iter = config.find(k);
if (iter != config.end()) {
config.erase(iter);
}
didWrite = true;
}
Future<Optional<std::string>> ActorProfilerConf::commit(ReadYourWritesTransaction* ryw) {
Optional<std::string> res{};
try {
if (didWrite) {
ProfilerConfig::instance().reset(config);
}
return res;
} catch (ConfigError& err) {
return Optional<std::string>{ err.description };
}
}
MaintenanceImpl::MaintenanceImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
// Used to read the healthZoneKey

View File

@ -140,6 +140,8 @@ public:
class SpecialKeySpace {
public:
enum class MODULE {
ACTORLINEAGE, // Sampling data
ACTOR_PROFILER_CONF, // profiler configuration
CLUSTERFILEPATH,
CONFIGURATION, // Configuration of the cluster
CONNECTIONSTRING,
@ -197,6 +199,12 @@ public:
static KeyRef getManagementApiCommandPrefix(const std::string& command) {
return managementApiCommandToRange.at(command).begin;
}
static KeyRangeRef getActorLineageApiCommandRange(const std::string& command) {
return actorLineageApiCommandToRange.at(command);
}
static KeyRef getActorLineageApiCommandPrefix(const std::string& command) {
return actorLineageApiCommandToRange.at(command).begin;
}
static Key getManagementApiCommandOptionSpecialKey(const std::string& command, const std::string& option);
static const std::set<std::string>& getManagementApiOptionsSet() { return options; }
static const std::set<std::string>& getTracingOptions() { return tracingOptions; }
@ -225,6 +233,7 @@ private:
static std::unordered_map<SpecialKeySpace::MODULE, KeyRange> moduleToBoundary;
static std::unordered_map<std::string, KeyRange>
managementApiCommandToRange; // management command to its special keys' range
static std::unordered_map<std::string, KeyRange> actorLineageApiCommandToRange;
static std::set<std::string> options; // "<command>/<option>"
static std::set<std::string> tracingOptions;
@ -408,12 +417,32 @@ public:
void clear(ReadYourWritesTransaction* ryw, const KeyRef& key) override;
};
class ActorLineageImpl : public SpecialKeyRangeReadImpl {
public:
explicit ActorLineageImpl(KeyRangeRef kr);
Future<RangeResult> getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const override;
};
class ActorProfilerConf : public SpecialKeyRangeRWImpl {
bool didWrite = false;
std::map<std::string, std::string> config;
public:
explicit ActorProfilerConf(KeyRangeRef kr);
Future<RangeResult> getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const override;
void set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) override;
void clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) override;
void clear(ReadYourWritesTransaction* ryw, const KeyRef& key) override;
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
class MaintenanceImpl : public SpecialKeyRangeRWImpl {
public:
explicit MaintenanceImpl(KeyRangeRef kr);
Future<RangeResult> getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const override;
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
class DataDistributionImpl : public SpecialKeyRangeRWImpl {
public:
explicit DataDistributionImpl(KeyRangeRef kr);

View File

@ -0,0 +1,29 @@
/*
* StackLineage.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/StackLineage.h"
std::vector<StringRef> getActorStackTrace() {
return (*currentLineage)->stack(&StackLineage::actorName);
}
namespace {
// StackLineageCollector stackLineageCollector;
}

41
fdbclient/StackLineage.h Normal file
View File

@ -0,0 +1,41 @@
/*
* StackLineage.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <string_view>
#include "flow/flow.h"
#include "fdbclient/ActorLineageProfiler.h"
extern std::vector<StringRef> getActorStackTrace();
struct StackLineageCollector : IALPCollector<StackLineage> {
StackLineageCollector() : IALPCollector() {}
std::optional<std::any> collect(ActorLineage* lineage) override {
auto vec = lineage->stack(&StackLineage::actorName);
std::vector<std::string_view> res;
for (const auto& str : vec) {
res.push_back(std::string_view(reinterpret_cast<const char*>(str.begin()), str.size()));
}
return res;
}
};

View File

@ -210,6 +210,66 @@ void TSS_traceMismatch(TraceEvent& event,
ASSERT(false);
}
template <>
bool TSS_doCompare(const SplitMetricsReply& src, const SplitMetricsReply& tss) {
// We duplicate split metrics just for load, no need to validate replies.
return true;
}
template <>
const char* TSS_mismatchTraceName(const SplitMetricsRequest& req) {
ASSERT(false);
return "";
}
template <>
void TSS_traceMismatch(TraceEvent& event,
const SplitMetricsRequest& req,
const SplitMetricsReply& src,
const SplitMetricsReply& tss) {
ASSERT(false);
}
template <>
bool TSS_doCompare(const ReadHotSubRangeReply& src, const ReadHotSubRangeReply& tss) {
// We duplicate read hot sub range metrics just for load, no need to validate replies.
return true;
}
template <>
const char* TSS_mismatchTraceName(const ReadHotSubRangeRequest& req) {
ASSERT(false);
return "";
}
template <>
void TSS_traceMismatch(TraceEvent& event,
const ReadHotSubRangeRequest& req,
const ReadHotSubRangeReply& src,
const ReadHotSubRangeReply& tss) {
ASSERT(false);
}
template <>
bool TSS_doCompare(const SplitRangeReply& src, const SplitRangeReply& tss) {
// We duplicate read hot sub range metrics just for load, no need to validate replies.
return true;
}
template <>
const char* TSS_mismatchTraceName(const SplitRangeRequest& req) {
ASSERT(false);
return "";
}
template <>
void TSS_traceMismatch(TraceEvent& event,
const SplitRangeRequest& req,
const SplitRangeReply& src,
const SplitRangeReply& tss) {
ASSERT(false);
}
// template specializations for metrics replies that should never be called because these requests aren't duplicated
// storage metrics
@ -233,69 +293,6 @@ void TSS_traceMismatch(TraceEvent& event,
ASSERT(false);
}
// split metrics
template <>
bool TSS_doCompare(const SplitMetricsReply& src, const SplitMetricsReply& tss) {
ASSERT(false);
return true;
}
template <>
const char* TSS_mismatchTraceName(const SplitMetricsRequest& req) {
ASSERT(false);
return "";
}
template <>
void TSS_traceMismatch(TraceEvent& event,
const SplitMetricsRequest& req,
const SplitMetricsReply& src,
const SplitMetricsReply& tss) {
ASSERT(false);
}
// read hot sub range
template <>
bool TSS_doCompare(const ReadHotSubRangeReply& src, const ReadHotSubRangeReply& tss) {
ASSERT(false);
return true;
}
template <>
const char* TSS_mismatchTraceName(const ReadHotSubRangeRequest& req) {
ASSERT(false);
return "";
}
template <>
void TSS_traceMismatch(TraceEvent& event,
const ReadHotSubRangeRequest& req,
const ReadHotSubRangeReply& src,
const ReadHotSubRangeReply& tss) {
ASSERT(false);
}
// split range
template <>
bool TSS_doCompare(const SplitRangeReply& src, const SplitRangeReply& tss) {
ASSERT(false);
return true;
}
template <>
const char* TSS_mismatchTraceName(const SplitRangeRequest& req) {
ASSERT(false);
return "";
}
template <>
void TSS_traceMismatch(TraceEvent& event,
const SplitRangeRequest& req,
const SplitRangeReply& src,
const SplitRangeReply& tss) {
ASSERT(false);
}
// only record metrics for data reads
template <>

View File

@ -30,7 +30,7 @@
#include "fdbrpc/Stats.h"
#include "fdbrpc/TimedRequest.h"
#include "fdbrpc/TSSComparison.h"
#include "fdbclient/TagThrottle.h"
#include "fdbclient/TagThrottle.actor.h"
#include "flow/UnitTest.h"
// Dead code, removed in the next protocol version

View File

@ -18,7 +18,7 @@
* limitations under the License.
*/
#include "fdbclient/TagThrottle.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/DatabaseContext.h"
@ -110,319 +110,3 @@ TagThrottleValue TagThrottleValue::fromValue(const ValueRef& value) {
reader >> throttleValue;
return throttleValue;
}
namespace ThrottleApi {
ACTOR Future<bool> getValidAutoEnabled(Transaction* tr, Database db) {
state bool result;
loop {
Optional<Value> value = wait(tr->get(tagThrottleAutoEnabledKey));
if (!value.present()) {
tr->reset();
wait(delay(CLIENT_KNOBS->DEFAULT_BACKOFF));
continue;
} else if (value.get() == LiteralStringRef("1")) {
result = true;
} else if (value.get() == LiteralStringRef("0")) {
result = false;
} else {
TraceEvent(SevWarnAlways, "InvalidAutoTagThrottlingValue", db->dbId).detail("Value", value.get());
tr->reset();
wait(delay(CLIENT_KNOBS->DEFAULT_BACKOFF));
continue;
}
return result;
};
}
void signalThrottleChange(Transaction& tr) {
tr.atomicOp(
tagThrottleSignalKey, LiteralStringRef("XXXXXXXXXX\x00\x00\x00\x00"), MutationRef::SetVersionstampedValue);
}
ACTOR Future<Void> updateThrottleCount(Transaction* tr, int64_t delta) {
state Future<Optional<Value>> countVal = tr->get(tagThrottleCountKey);
state Future<Optional<Value>> limitVal = tr->get(tagThrottleLimitKey);
wait(success(countVal) && success(limitVal));
int64_t count = 0;
int64_t limit = 0;
if (countVal.get().present()) {
BinaryReader reader(countVal.get().get(), Unversioned());
reader >> count;
}
if (limitVal.get().present()) {
BinaryReader reader(limitVal.get().get(), Unversioned());
reader >> limit;
}
count += delta;
if (count > limit) {
throw too_many_tag_throttles();
}
BinaryWriter writer(Unversioned());
writer << count;
tr->set(tagThrottleCountKey, writer.toValue());
return Void();
}
ACTOR Future<std::vector<TagThrottleInfo>> getThrottledTags(Database db, int limit, bool containsRecommend) {
state Transaction tr(db);
state bool reportAuto = containsRecommend;
loop {
try {
if (!containsRecommend) {
wait(store(reportAuto, getValidAutoEnabled(&tr, db)));
}
RangeResult throttles = wait(tr.getRange(
reportAuto ? tagThrottleKeys : KeyRangeRef(tagThrottleKeysPrefix, tagThrottleAutoKeysPrefix), limit));
std::vector<TagThrottleInfo> results;
for (auto throttle : throttles) {
results.push_back(TagThrottleInfo(TagThrottleKey::fromKey(throttle.key),
TagThrottleValue::fromValue(throttle.value)));
}
return results;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR Future<std::vector<TagThrottleInfo>> getRecommendedTags(Database db, int limit) {
state Transaction tr(db);
loop {
try {
bool enableAuto = wait(getValidAutoEnabled(&tr, db));
if (enableAuto) {
return std::vector<TagThrottleInfo>();
}
RangeResult throttles =
wait(tr.getRange(KeyRangeRef(tagThrottleAutoKeysPrefix, tagThrottleKeys.end), limit));
std::vector<TagThrottleInfo> results;
for (auto throttle : throttles) {
results.push_back(TagThrottleInfo(TagThrottleKey::fromKey(throttle.key),
TagThrottleValue::fromValue(throttle.value)));
}
return results;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR Future<Void> throttleTags(Database db,
TagSet tags,
double tpsRate,
double initialDuration,
TagThrottleType throttleType,
TransactionPriority priority,
Optional<double> expirationTime,
Optional<TagThrottledReason> reason) {
state Transaction tr(db);
state Key key = TagThrottleKey(tags, throttleType, priority).toKey();
ASSERT(initialDuration > 0);
if (throttleType == TagThrottleType::MANUAL) {
reason = TagThrottledReason::MANUAL;
}
TagThrottleValue throttle(tpsRate,
expirationTime.present() ? expirationTime.get() : 0,
initialDuration,
reason.present() ? reason.get() : TagThrottledReason::UNSET);
BinaryWriter wr(IncludeVersion(ProtocolVersion::withTagThrottleValueReason()));
wr << throttle;
state Value value = wr.toValue();
loop {
try {
if (throttleType == TagThrottleType::MANUAL) {
Optional<Value> oldThrottle = wait(tr.get(key));
if (!oldThrottle.present()) {
wait(updateThrottleCount(&tr, 1));
}
}
tr.set(key, value);
if (throttleType == TagThrottleType::MANUAL) {
signalThrottleChange(tr);
}
wait(tr.commit());
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR Future<bool> unthrottleTags(Database db,
TagSet tags,
Optional<TagThrottleType> throttleType,
Optional<TransactionPriority> priority) {
state Transaction tr(db);
state std::vector<Key> keys;
for (auto p : allTransactionPriorities) {
if (!priority.present() || priority.get() == p) {
if (!throttleType.present() || throttleType.get() == TagThrottleType::AUTO) {
keys.push_back(TagThrottleKey(tags, TagThrottleType::AUTO, p).toKey());
}
if (!throttleType.present() || throttleType.get() == TagThrottleType::MANUAL) {
keys.push_back(TagThrottleKey(tags, TagThrottleType::MANUAL, p).toKey());
}
}
}
state bool removed = false;
loop {
try {
state std::vector<Future<Optional<Value>>> values;
values.reserve(keys.size());
for (auto key : keys) {
values.push_back(tr.get(key));
}
wait(waitForAll(values));
int delta = 0;
for (int i = 0; i < values.size(); ++i) {
if (values[i].get().present()) {
if (TagThrottleKey::fromKey(keys[i]).throttleType == TagThrottleType::MANUAL) {
delta -= 1;
}
tr.clear(keys[i]);
// Report that we are removing this tag if we ever see it present.
// This protects us from getting confused if the transaction is maybe committed.
// It's ok if someone else actually ends up removing this tag at the same time
// and we aren't the ones to actually do it.
removed = true;
}
}
if (delta != 0) {
wait(updateThrottleCount(&tr, delta));
}
if (removed) {
signalThrottleChange(tr);
wait(tr.commit());
}
return removed;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR Future<bool> unthrottleMatchingThrottles(Database db,
KeyRef beginKey,
KeyRef endKey,
Optional<TransactionPriority> priority,
bool onlyExpiredThrottles) {
state Transaction tr(db);
state KeySelector begin = firstGreaterOrEqual(beginKey);
state KeySelector end = firstGreaterOrEqual(endKey);
state bool removed = false;
loop {
try {
state RangeResult tags = wait(tr.getRange(begin, end, 1000));
state uint64_t unthrottledTags = 0;
uint64_t manualUnthrottledTags = 0;
for (auto tag : tags) {
if (onlyExpiredThrottles) {
double expirationTime = TagThrottleValue::fromValue(tag.value).expirationTime;
if (expirationTime == 0 || expirationTime > now()) {
continue;
}
}
TagThrottleKey key = TagThrottleKey::fromKey(tag.key);
if (priority.present() && key.priority != priority.get()) {
continue;
}
if (key.throttleType == TagThrottleType::MANUAL) {
++manualUnthrottledTags;
}
removed = true;
tr.clear(tag.key);
unthrottledTags++;
}
if (manualUnthrottledTags > 0) {
wait(updateThrottleCount(&tr, -manualUnthrottledTags));
}
if (unthrottledTags > 0) {
signalThrottleChange(tr);
}
wait(tr.commit());
if (!tags.more) {
return removed;
}
ASSERT(tags.size() > 0);
begin = KeySelector(firstGreaterThan(tags[tags.size() - 1].key), tags.arena());
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
Future<bool> unthrottleAll(Database db,
Optional<TagThrottleType> tagThrottleType,
Optional<TransactionPriority> priority) {
KeyRef begin = tagThrottleKeys.begin;
KeyRef end = tagThrottleKeys.end;
if (tagThrottleType.present() && tagThrottleType == TagThrottleType::AUTO) {
begin = tagThrottleAutoKeysPrefix;
} else if (tagThrottleType.present() && tagThrottleType == TagThrottleType::MANUAL) {
end = tagThrottleAutoKeysPrefix;
}
return unthrottleMatchingThrottles(db, begin, end, priority, false);
}
Future<bool> expire(Database db) {
return unthrottleMatchingThrottles(
db, tagThrottleKeys.begin, tagThrottleKeys.end, Optional<TransactionPriority>(), true);
}
ACTOR Future<Void> enableAuto(Database db, bool enabled) {
state Transaction tr(db);
loop {
try {
Optional<Value> value = wait(tr.get(tagThrottleAutoEnabledKey));
if (!value.present() || (enabled && value.get() != LiteralStringRef("1")) ||
(!enabled && value.get() != LiteralStringRef("0"))) {
tr.set(tagThrottleAutoEnabledKey, LiteralStringRef(enabled ? "1" : "0"));
signalThrottleChange(tr);
wait(tr.commit());
}
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
} // namespace ThrottleApi

View File

@ -0,0 +1,592 @@
/*
* TagThrottle.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_TAG_THROTTLE_ACTOR_G_H)
#define FDBCLIENT_TAG_THROTTLE_ACTOR_G_H
#include "fdbclient/TagThrottle.actor.g.h"
#elif !defined(FDBCLIENT_TAG_THROTTLE_ACTOR_H)
#define FDBCLIENT_TAG_THROTTLE_ACTOR_H
#pragma once
#include "flow/Error.h"
#include "flow/flow.h"
#include "flow/network.h"
#include "flow/ThreadHelper.actor.h"
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/CommitTransaction.h"
#include "flow/actorcompiler.h" // This must be the last #include.
typedef StringRef TransactionTagRef;
typedef Standalone<TransactionTagRef> TransactionTag;
class TagSet {
public:
typedef std::vector<TransactionTagRef>::const_iterator const_iterator;
TagSet() : bytes(0) {}
void addTag(TransactionTagRef tag);
size_t size() const;
const_iterator begin() const { return tags.begin(); }
const_iterator end() const { return tags.end(); }
void clear() {
tags.clear();
bytes = 0;
}
template <class Context>
void save(uint8_t* out, Context& c) const {
uint8_t* start = out;
for (const auto& tag : *this) {
*(out++) = (uint8_t)tag.size();
std::copy(tag.begin(), tag.end(), out);
out += tag.size();
}
ASSERT((size_t)(out - start) == size() + bytes);
}
template <class Context>
void load(const uint8_t* data, size_t size, Context& context) {
// const uint8_t *start = data;
const uint8_t* end = data + size;
while (data < end) {
uint8_t len = *(data++);
// Tags are already deduplicated
const auto& tag = tags.emplace_back(context.tryReadZeroCopy(data, len), len);
data += len;
bytes += tag.size();
}
ASSERT(data == end);
// Deserialized tag sets share the arena with the request that contained them
// For this reason, persisting a TagSet that shares memory with other request
// members should be done with caution.
arena = context.arena();
}
size_t getBytes() const { return bytes; }
const Arena& getArena() const { return arena; }
private:
size_t bytes;
Arena arena;
// Currently there are never >= 256 tags, so
// std::vector is faster than std::set. This may
// change if we allow more tags in the future.
std::vector<TransactionTagRef> tags;
};
template <>
struct dynamic_size_traits<TagSet> : std::true_type {
// May be called multiple times during one serialization
template <class Context>
static size_t size(const TagSet& t, Context&) {
return t.size() + t.getBytes();
}
// Guaranteed to be called only once during serialization
template <class Context>
static void save(uint8_t* out, const TagSet& t, Context& c) {
t.save(out, c);
}
// Context is an arbitrary type that is plumbed by reference throughout the
// load call tree.
template <class Context>
static void load(const uint8_t* data, size_t size, TagSet& t, Context& context) {
t.load(data, size, context);
}
};
enum class TagThrottleType : uint8_t { MANUAL, AUTO };
enum class TagThrottledReason : uint8_t { UNSET = 0, MANUAL, BUSY_READ, BUSY_WRITE };
struct TagThrottleKey {
TagSet tags;
TagThrottleType throttleType;
TransactionPriority priority;
TagThrottleKey() : throttleType(TagThrottleType::MANUAL), priority(TransactionPriority::DEFAULT) {}
TagThrottleKey(TagSet tags, TagThrottleType throttleType, TransactionPriority priority)
: tags(tags), throttleType(throttleType), priority(priority) {}
Key toKey() const;
static TagThrottleKey fromKey(const KeyRef& key);
};
struct TagThrottleValue {
double tpsRate;
double expirationTime;
double initialDuration;
TagThrottledReason reason;
TagThrottleValue() : tpsRate(0), expirationTime(0), initialDuration(0), reason(TagThrottledReason::UNSET) {}
TagThrottleValue(double tpsRate, double expirationTime, double initialDuration, TagThrottledReason reason)
: tpsRate(tpsRate), expirationTime(expirationTime), initialDuration(initialDuration), reason(reason) {}
static TagThrottleValue fromValue(const ValueRef& value);
// To change this serialization, ProtocolVersion::TagThrottleValue must be updated, and downgrades need to be
// considered
template <class Ar>
void serialize(Ar& ar) {
if (ar.protocolVersion().hasTagThrottleValueReason()) {
serializer(ar, tpsRate, expirationTime, initialDuration, reason);
} else if (ar.protocolVersion().hasTagThrottleValue()) {
serializer(ar, tpsRate, expirationTime, initialDuration);
if (ar.isDeserializing) {
reason = TagThrottledReason::UNSET;
}
}
}
};
struct TagThrottleInfo {
TransactionTag tag;
TagThrottleType throttleType;
TransactionPriority priority;
double tpsRate;
double expirationTime;
double initialDuration;
TagThrottledReason reason;
TagThrottleInfo(TransactionTag tag,
TagThrottleType throttleType,
TransactionPriority priority,
double tpsRate,
double expirationTime,
double initialDuration,
TagThrottledReason reason = TagThrottledReason::UNSET)
: tag(tag), throttleType(throttleType), priority(priority), tpsRate(tpsRate), expirationTime(expirationTime),
initialDuration(initialDuration), reason(reason) {}
TagThrottleInfo(TagThrottleKey key, TagThrottleValue value)
: throttleType(key.throttleType), priority(key.priority), tpsRate(value.tpsRate),
expirationTime(value.expirationTime), initialDuration(value.initialDuration), reason(value.reason) {
ASSERT(key.tags.size() == 1); // Multiple tags per throttle is not currently supported
tag = *key.tags.begin();
}
};
struct ClientTagThrottleLimits {
double tpsRate;
double expiration;
ClientTagThrottleLimits() : tpsRate(0), expiration(0) {}
ClientTagThrottleLimits(double tpsRate, double expiration) : tpsRate(tpsRate), expiration(expiration) {}
template <class Archive>
void serialize(Archive& ar) {
// Convert expiration time to a duration to avoid clock differences
double duration = 0;
if (!ar.isDeserializing) {
duration = expiration - now();
}
serializer(ar, tpsRate, duration);
if (ar.isDeserializing) {
expiration = now() + duration;
}
}
};
struct ClientTrCommitCostEstimation {
int opsCount = 0;
uint64_t writeCosts = 0;
std::deque<std::pair<int, uint64_t>> clearIdxCosts;
uint32_t expensiveCostEstCount = 0;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, opsCount, writeCosts, clearIdxCosts, expensiveCostEstCount);
}
};
// Keys to view and control tag throttling
extern const KeyRangeRef tagThrottleKeys;
extern const KeyRef tagThrottleKeysPrefix;
extern const KeyRef tagThrottleAutoKeysPrefix;
extern const KeyRef tagThrottleSignalKey;
extern const KeyRef tagThrottleAutoEnabledKey;
extern const KeyRef tagThrottleLimitKey;
extern const KeyRef tagThrottleCountKey;
namespace ThrottleApi {
// The template functions can be called with Native API like DatabaseContext, Transaction/ReadYourWritesTransaction
// or using IClientAPI like IDatabase, ITransaction
ACTOR template <class Tr>
Future<bool> getValidAutoEnabled(Reference<Tr> tr) {
state bool result;
loop {
Optional<Value> value = wait(safeThreadFutureToFuture(tr->get(tagThrottleAutoEnabledKey)));
if (!value.present()) {
tr->reset();
wait(delay(CLIENT_KNOBS->DEFAULT_BACKOFF));
continue;
} else if (value.get() == LiteralStringRef("1")) {
result = true;
} else if (value.get() == LiteralStringRef("0")) {
result = false;
} else {
TraceEvent(SevWarnAlways, "InvalidAutoTagThrottlingValue").detail("Value", value.get());
tr->reset();
wait(delay(CLIENT_KNOBS->DEFAULT_BACKOFF));
continue;
}
return result;
};
}
ACTOR template <class DB>
Future<std::vector<TagThrottleInfo>> getRecommendedTags(Reference<DB> db, int limit) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
try {
bool enableAuto = wait(getValidAutoEnabled(tr));
if (enableAuto) {
return std::vector<TagThrottleInfo>();
}
state typename DB::TransactionT::template FutureT<RangeResult> f =
tr->getRange(KeyRangeRef(tagThrottleAutoKeysPrefix, tagThrottleKeys.end), limit);
RangeResult throttles = wait(safeThreadFutureToFuture(f));
std::vector<TagThrottleInfo> results;
for (auto throttle : throttles) {
results.push_back(TagThrottleInfo(TagThrottleKey::fromKey(throttle.key),
TagThrottleValue::fromValue(throttle.value)));
}
return results;
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR template <class DB>
Future<std::vector<TagThrottleInfo>> getThrottledTags(Reference<DB> db, int limit, bool containsRecommend = false) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
state bool reportAuto = containsRecommend;
loop {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
try {
if (!containsRecommend) {
wait(store(reportAuto, getValidAutoEnabled(tr)));
}
state typename DB::TransactionT::template FutureT<RangeResult> f = tr->getRange(
reportAuto ? tagThrottleKeys : KeyRangeRef(tagThrottleKeysPrefix, tagThrottleAutoKeysPrefix), limit);
RangeResult throttles = wait(safeThreadFutureToFuture(f));
std::vector<TagThrottleInfo> results;
for (auto throttle : throttles) {
results.push_back(TagThrottleInfo(TagThrottleKey::fromKey(throttle.key),
TagThrottleValue::fromValue(throttle.value)));
}
return results;
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
template <class Tr>
void signalThrottleChange(Reference<Tr> tr) {
tr->atomicOp(
tagThrottleSignalKey, LiteralStringRef("XXXXXXXXXX\x00\x00\x00\x00"), MutationRef::SetVersionstampedValue);
}
ACTOR template <class Tr>
Future<Void> updateThrottleCount(Reference<Tr> tr, int64_t delta) {
state typename Tr::template FutureT<Optional<Value>> countVal = tr->get(tagThrottleCountKey);
state typename Tr::template FutureT<Optional<Value>> limitVal = tr->get(tagThrottleLimitKey);
wait(success(safeThreadFutureToFuture(countVal)) && success(safeThreadFutureToFuture(limitVal)));
int64_t count = 0;
int64_t limit = 0;
if (countVal.get().present()) {
BinaryReader reader(countVal.get().get(), Unversioned());
reader >> count;
}
if (limitVal.get().present()) {
BinaryReader reader(limitVal.get().get(), Unversioned());
reader >> limit;
}
count += delta;
if (count > limit) {
throw too_many_tag_throttles();
}
BinaryWriter writer(Unversioned());
writer << count;
tr->set(tagThrottleCountKey, writer.toValue());
return Void();
}
ACTOR template <class DB>
Future<bool> unthrottleMatchingThrottles(Reference<DB> db,
KeyRef beginKey,
KeyRef endKey,
Optional<TransactionPriority> priority,
bool onlyExpiredThrottles) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
state KeySelector begin = firstGreaterOrEqual(beginKey);
state KeySelector end = firstGreaterOrEqual(endKey);
state bool removed = false;
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
// holds memory of the RangeResult
state typename DB::TransactionT::template FutureT<RangeResult> f = tr->getRange(begin, end, 1000);
state RangeResult tags = wait(safeThreadFutureToFuture(f));
state uint64_t unthrottledTags = 0;
uint64_t manualUnthrottledTags = 0;
for (auto tag : tags) {
if (onlyExpiredThrottles) {
double expirationTime = TagThrottleValue::fromValue(tag.value).expirationTime;
if (expirationTime == 0 || expirationTime > now()) {
continue;
}
}
TagThrottleKey key = TagThrottleKey::fromKey(tag.key);
if (priority.present() && key.priority != priority.get()) {
continue;
}
if (key.throttleType == TagThrottleType::MANUAL) {
++manualUnthrottledTags;
}
removed = true;
tr->clear(tag.key);
unthrottledTags++;
}
if (manualUnthrottledTags > 0) {
wait(updateThrottleCount(tr, -manualUnthrottledTags));
}
if (unthrottledTags > 0) {
signalThrottleChange(tr);
}
wait(safeThreadFutureToFuture(tr->commit()));
if (!tags.more) {
return removed;
}
ASSERT(tags.size() > 0);
begin = KeySelector(firstGreaterThan(tags[tags.size() - 1].key), tags.arena());
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
template <class DB>
Future<bool> expire(DB db) {
return unthrottleMatchingThrottles(
db, tagThrottleKeys.begin, tagThrottleKeys.end, Optional<TransactionPriority>(), true);
}
template <class DB>
Future<bool> unthrottleAll(Reference<DB> db,
Optional<TagThrottleType> tagThrottleType,
Optional<TransactionPriority> priority) {
KeyRef begin = tagThrottleKeys.begin;
KeyRef end = tagThrottleKeys.end;
if (tagThrottleType.present() && tagThrottleType == TagThrottleType::AUTO) {
begin = tagThrottleAutoKeysPrefix;
} else if (tagThrottleType.present() && tagThrottleType == TagThrottleType::MANUAL) {
end = tagThrottleAutoKeysPrefix;
}
return unthrottleMatchingThrottles(db, begin, end, priority, false);
}
ACTOR template <class DB>
Future<bool> unthrottleTags(Reference<DB> db,
TagSet tags,
Optional<TagThrottleType> throttleType,
Optional<TransactionPriority> priority) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
state std::vector<Key> keys;
for (auto p : allTransactionPriorities) {
if (!priority.present() || priority.get() == p) {
if (!throttleType.present() || throttleType.get() == TagThrottleType::AUTO) {
keys.push_back(TagThrottleKey(tags, TagThrottleType::AUTO, p).toKey());
}
if (!throttleType.present() || throttleType.get() == TagThrottleType::MANUAL) {
keys.push_back(TagThrottleKey(tags, TagThrottleType::MANUAL, p).toKey());
}
}
}
state bool removed = false;
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
state std::vector<Future<Optional<Value>>> values;
values.reserve(keys.size());
for (auto key : keys) {
values.push_back(safeThreadFutureToFuture(tr->get(key)));
}
wait(waitForAll(values));
int delta = 0;
for (int i = 0; i < values.size(); ++i) {
if (values[i].get().present()) {
if (TagThrottleKey::fromKey(keys[i]).throttleType == TagThrottleType::MANUAL) {
delta -= 1;
}
tr->clear(keys[i]);
// Report that we are removing this tag if we ever see it present.
// This protects us from getting confused if the transaction is maybe committed.
// It's ok if someone else actually ends up removing this tag at the same time
// and we aren't the ones to actually do it.
removed = true;
}
}
if (delta != 0) {
wait(updateThrottleCount(tr, delta));
}
if (removed) {
signalThrottleChange(tr);
wait(safeThreadFutureToFuture(tr->commit()));
}
return removed;
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR template <class DB>
Future<Void> throttleTags(Reference<DB> db,
TagSet tags,
double tpsRate,
double initialDuration,
TagThrottleType throttleType,
TransactionPriority priority,
Optional<double> expirationTime = Optional<double>(),
Optional<TagThrottledReason> reason = Optional<TagThrottledReason>()) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
state Key key = TagThrottleKey(tags, throttleType, priority).toKey();
ASSERT(initialDuration > 0);
if (throttleType == TagThrottleType::MANUAL) {
reason = TagThrottledReason::MANUAL;
}
TagThrottleValue throttle(tpsRate,
expirationTime.present() ? expirationTime.get() : 0,
initialDuration,
reason.present() ? reason.get() : TagThrottledReason::UNSET);
BinaryWriter wr(IncludeVersion(ProtocolVersion::withTagThrottleValueReason()));
wr << throttle;
state Value value = wr.toValue();
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
if (throttleType == TagThrottleType::MANUAL) {
Optional<Value> oldThrottle = wait(safeThreadFutureToFuture(tr->get(key)));
if (!oldThrottle.present()) {
wait(updateThrottleCount(tr, 1));
}
}
tr->set(key, value);
if (throttleType == TagThrottleType::MANUAL) {
signalThrottleChange(tr);
}
wait(safeThreadFutureToFuture(tr->commit()));
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR template <class DB>
Future<Void> enableAuto(Reference<DB> db, bool enabled) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
Optional<Value> value = wait(safeThreadFutureToFuture(tr->get(tagThrottleAutoEnabledKey)));
if (!value.present() || (enabled && value.get() != LiteralStringRef("1")) ||
(!enabled && value.get() != LiteralStringRef("0"))) {
tr->set(tagThrottleAutoEnabledKey, LiteralStringRef(enabled ? "1" : "0"));
signalThrottleChange<typename DB::TransactionT>(tr);
wait(safeThreadFutureToFuture(tr->commit()));
}
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
}; // namespace ThrottleApi
template <class Value>
using TransactionTagMap = std::unordered_map<TransactionTag, Value, std::hash<TransactionTagRef>>;
template <class Value>
using PrioritizedTransactionTagMap = std::map<TransactionPriority, TransactionTagMap<Value>>;
template <class Value>
using UIDTransactionTagMap = std::unordered_map<UID, TransactionTagMap<Value>>;
#include "flow/unactorcompiler.h"
#endif

View File

@ -1,265 +0,0 @@
/*
* TagThrottle.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBCLIENT_TAG_THROTTLE_H
#define FDBCLIENT_TAG_THROTTLE_H
#pragma once
#include "flow/Error.h"
#include "flow/flow.h"
#include "flow/network.h"
#include "fdbclient/FDBTypes.h"
#include <set>
class Database;
namespace ThrottleApi {}
typedef StringRef TransactionTagRef;
typedef Standalone<TransactionTagRef> TransactionTag;
class TagSet {
public:
typedef std::vector<TransactionTagRef>::const_iterator const_iterator;
TagSet() : bytes(0) {}
void addTag(TransactionTagRef tag);
size_t size() const;
const_iterator begin() const { return tags.begin(); }
const_iterator end() const { return tags.end(); }
void clear() {
tags.clear();
bytes = 0;
}
template <class Context>
void save(uint8_t* out, Context& c) const {
uint8_t* start = out;
for (const auto& tag : *this) {
*(out++) = (uint8_t)tag.size();
std::copy(tag.begin(), tag.end(), out);
out += tag.size();
}
ASSERT((size_t)(out - start) == size() + bytes);
}
template <class Context>
void load(const uint8_t* data, size_t size, Context& context) {
// const uint8_t *start = data;
const uint8_t* end = data + size;
while (data < end) {
uint8_t len = *(data++);
// Tags are already deduplicated
const auto& tag = tags.emplace_back(context.tryReadZeroCopy(data, len), len);
data += len;
bytes += tag.size();
}
ASSERT(data == end);
// Deserialized tag sets share the arena with the request that contained them
// For this reason, persisting a TagSet that shares memory with other request
// members should be done with caution.
arena = context.arena();
}
size_t getBytes() const { return bytes; }
const Arena& getArena() const { return arena; }
private:
size_t bytes;
Arena arena;
// Currently there are never >= 256 tags, so
// std::vector is faster than std::set. This may
// change if we allow more tags in the future.
std::vector<TransactionTagRef> tags;
};
template <>
struct dynamic_size_traits<TagSet> : std::true_type {
// May be called multiple times during one serialization
template <class Context>
static size_t size(const TagSet& t, Context&) {
return t.size() + t.getBytes();
}
// Guaranteed to be called only once during serialization
template <class Context>
static void save(uint8_t* out, const TagSet& t, Context& c) {
t.save(out, c);
}
// Context is an arbitrary type that is plumbed by reference throughout the
// load call tree.
template <class Context>
static void load(const uint8_t* data, size_t size, TagSet& t, Context& context) {
t.load(data, size, context);
}
};
enum class TagThrottleType : uint8_t { MANUAL, AUTO };
enum class TagThrottledReason : uint8_t { UNSET = 0, MANUAL, BUSY_READ, BUSY_WRITE };
struct TagThrottleKey {
TagSet tags;
TagThrottleType throttleType;
TransactionPriority priority;
TagThrottleKey() : throttleType(TagThrottleType::MANUAL), priority(TransactionPriority::DEFAULT) {}
TagThrottleKey(TagSet tags, TagThrottleType throttleType, TransactionPriority priority)
: tags(tags), throttleType(throttleType), priority(priority) {}
Key toKey() const;
static TagThrottleKey fromKey(const KeyRef& key);
};
struct TagThrottleValue {
double tpsRate;
double expirationTime;
double initialDuration;
TagThrottledReason reason;
TagThrottleValue() : tpsRate(0), expirationTime(0), initialDuration(0), reason(TagThrottledReason::UNSET) {}
TagThrottleValue(double tpsRate, double expirationTime, double initialDuration, TagThrottledReason reason)
: tpsRate(tpsRate), expirationTime(expirationTime), initialDuration(initialDuration), reason(reason) {}
static TagThrottleValue fromValue(const ValueRef& value);
// To change this serialization, ProtocolVersion::TagThrottleValue must be updated, and downgrades need to be
// considered
template <class Ar>
void serialize(Ar& ar) {
if (ar.protocolVersion().hasTagThrottleValueReason()) {
serializer(ar, tpsRate, expirationTime, initialDuration, reason);
} else if (ar.protocolVersion().hasTagThrottleValue()) {
serializer(ar, tpsRate, expirationTime, initialDuration);
if (ar.isDeserializing) {
reason = TagThrottledReason::UNSET;
}
}
}
};
struct TagThrottleInfo {
TransactionTag tag;
TagThrottleType throttleType;
TransactionPriority priority;
double tpsRate;
double expirationTime;
double initialDuration;
TagThrottledReason reason;
TagThrottleInfo(TransactionTag tag,
TagThrottleType throttleType,
TransactionPriority priority,
double tpsRate,
double expirationTime,
double initialDuration,
TagThrottledReason reason = TagThrottledReason::UNSET)
: tag(tag), throttleType(throttleType), priority(priority), tpsRate(tpsRate), expirationTime(expirationTime),
initialDuration(initialDuration), reason(reason) {}
TagThrottleInfo(TagThrottleKey key, TagThrottleValue value)
: throttleType(key.throttleType), priority(key.priority), tpsRate(value.tpsRate),
expirationTime(value.expirationTime), initialDuration(value.initialDuration), reason(value.reason) {
ASSERT(key.tags.size() == 1); // Multiple tags per throttle is not currently supported
tag = *key.tags.begin();
}
};
struct ClientTagThrottleLimits {
double tpsRate;
double expiration;
ClientTagThrottleLimits() : tpsRate(0), expiration(0) {}
ClientTagThrottleLimits(double tpsRate, double expiration) : tpsRate(tpsRate), expiration(expiration) {}
template <class Archive>
void serialize(Archive& ar) {
// Convert expiration time to a duration to avoid clock differences
double duration = 0;
if (!ar.isDeserializing) {
duration = expiration - now();
}
serializer(ar, tpsRate, duration);
if (ar.isDeserializing) {
expiration = now() + duration;
}
}
};
struct ClientTrCommitCostEstimation {
int opsCount = 0;
uint64_t writeCosts = 0;
std::deque<std::pair<int, uint64_t>> clearIdxCosts;
uint32_t expensiveCostEstCount = 0;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, opsCount, writeCosts, clearIdxCosts, expensiveCostEstCount);
}
};
namespace ThrottleApi {
Future<std::vector<TagThrottleInfo>> getThrottledTags(Database const& db,
int const& limit,
bool const& containsRecommend = false);
Future<std::vector<TagThrottleInfo>> getRecommendedTags(Database const& db, int const& limit);
Future<Void> throttleTags(Database const& db,
TagSet const& tags,
double const& tpsRate,
double const& initialDuration,
TagThrottleType const& throttleType,
TransactionPriority const& priority,
Optional<double> const& expirationTime = Optional<double>(),
Optional<TagThrottledReason> const& reason = Optional<TagThrottledReason>());
Future<bool> unthrottleTags(Database const& db,
TagSet const& tags,
Optional<TagThrottleType> const& throttleType,
Optional<TransactionPriority> const& priority);
Future<bool> unthrottleAll(Database db, Optional<TagThrottleType> throttleType, Optional<TransactionPriority> priority);
Future<bool> expire(Database db);
Future<Void> enableAuto(Database const& db, bool const& enabled);
}; // namespace ThrottleApi
template <class Value>
using TransactionTagMap = std::unordered_map<TransactionTag, Value, std::hash<TransactionTagRef>>;
template <class Value>
using PrioritizedTransactionTagMap = std::map<TransactionPriority, TransactionTagMap<Value>>;
template <class Value>
using UIDTransactionTagMap = std::unordered_map<UID, TransactionTagMap<Value>>;
#endif

View File

@ -0,0 +1,25 @@
/*
* TransactionLineage.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/TransactionLineage.h"
namespace {
TransactionLineageCollector transactionLineageCollector;
}

View File

@ -0,0 +1,130 @@
/*
* TransactionLineage.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbclient/ActorLineageProfiler.h"
struct TransactionLineage : LineageProperties<TransactionLineage> {
enum class Operation {
Unset,
GetValue,
GetKey,
GetKeyValues,
WatchValue,
GetConsistentReadVersion,
Commit,
GetKeyServersLocations
};
static constexpr std::string_view name = "Transaction"sv;
uint64_t txID;
Operation operation = Operation::Unset;
bool isSet(uint64_t TransactionLineage::*member) const { return this->*member > 0; }
bool isSet(Operation TransactionLineage::*member) const { return this->*member != Operation::Unset; }
};
struct TransactionLineageCollector : IALPCollector<TransactionLineage> {
using Operation = TransactionLineage::Operation;
std::optional<std::any> collect(ActorLineage* lineage) {
std::map<std::string_view, std::any> res;
auto txID = lineage->get(&TransactionLineage::txID);
if (txID.has_value()) {
res["ID"sv] = txID.value();
}
auto operation = lineage->get(&TransactionLineage::operation);
if (operation.has_value()) {
switch (operation.value()) {
case Operation::Unset:
res["operation"sv] = "Unset"sv;
break;
case Operation::GetValue:
res["operation"sv] = "GetValue"sv;
break;
case Operation::GetKey:
res["operation"sv] = "GetKey"sv;
break;
case Operation::GetKeyValues:
res["operation"sv] = "GetKeyValues"sv;
break;
case Operation::WatchValue:
res["operation"sv] = "WatchValue"sv;
break;
case Operation::GetConsistentReadVersion:
res["operation"sv] = "GetConsistentReadVersion"sv;
break;
case Operation::Commit:
res["operation"sv] = "Commit"sv;
break;
case Operation::GetKeyServersLocations:
res["operation"sv] = "GetKeyServersLocations"sv;
break;
}
}
if (res.empty()) {
return std::optional<std::any>{};
} else {
return res;
}
}
};
#ifdef ENABLE_SAMPLING
template <class T, class V>
class ScopedLineage {
V before;
V T::*member;
bool valid = true;
public:
ScopedLineage(V T::*member, V const& value) : member(member) {
auto& val = getCurrentLineage()->modify(member);
before = val;
val = value;
}
~ScopedLineage() {
if (!valid) {
return;
}
getCurrentLineage()->modify(member) = before;
}
ScopedLineage(ScopedLineage<T, V>&& o) : before(std::move(o.before)), member(o.member), valid(o.valid) {
o.release();
}
ScopedLineage& operator=(ScopedLineage<T, V>&& o) {
if (valid) {
getCurrentLineage()->modify(member) = before;
}
before = std::move(o.before);
member = o.member;
valid = o.valid;
o.release();
return *this;
}
ScopedLineage(const ScopedLineage<T, V>&) = delete;
ScopedLineage& operator=(const ScopedLineage<T, V>&) = delete;
void release() { valid = false; }
};
template <class T, class V>
ScopedLineage<T, V> make_scoped_lineage(V T::*member, V const& value) {
return ScopedLineage<T, V>(member, value);
}
#endif

View File

@ -244,8 +244,8 @@ public:
TraceEvent("AsyncFileCachedDel")
.detail("Filename", filename)
.detail("Refcount", debugGetReferenceCount())
.detail("CanDie", f.isReady())
.backtrace();
.detail("CanDie", f.isReady());
// .backtrace();
if (f.isReady())
delete this;
else

View File

@ -242,7 +242,12 @@ public:
// result = map(result, [=](int r) mutable { KAIOLogBlockEvent(io, OpLogEntry::READY, r); return r; });
#endif
return success(result);
// auto& actorLineageSet = IAsyncFileSystem::filesystem()->getActorLineageSet();
// auto index = actorLineageSet.insert(*currentLineage);
// ASSERT(index != ActorLineageSet::npos);
Future<Void> res = success(result);
// actorLineageSet.erase(index);
return res;
}
// TODO(alexmiller): Remove when we upgrade the dev docker image to >14.10
#ifndef FALLOC_FL_ZERO_RANGE

View File

@ -63,6 +63,14 @@ add_flow_target(STATIC_LIBRARY NAME fdbrpc
DISABLE_ACTOR_DIAGNOSTICS ${FDBRPC_SRCS_DISABLE_ACTOR_DIAGNOSTICS})
target_include_directories(fdbrpc PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/libeio)
target_link_libraries(fdbrpc PUBLIC flow)
add_flow_target(STATIC_LIBRARY NAME fdbrpc_sampling
SRCS ${FDBRPC_SRCS}
DISABLE_ACTOR_DIAGNOSTICS ${FDBRPC_SRCS_DISABLE_ACTOR_DIAGNOSTICS})
target_include_directories(fdbrpc_sampling PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/libeio)
target_link_libraries(fdbrpc_sampling PUBLIC flow_sampling)
target_compile_definitions(fdbrpc_sampling PRIVATE -DENABLE_SAMPLING)
if(COMPILE_EIO)
add_library(eio STATIC libeio/eio.c)
if(USE_VALGRIND)
@ -71,8 +79,10 @@ if(COMPILE_EIO)
target_compile_definitions(eio PRIVATE USE_UCONTEXT)
target_compile_options(eio BEFORE PRIVATE -w) # disable warnings for eio
target_link_libraries(fdbrpc PRIVATE eio)
target_link_libraries(fdbrpc_sampling PRIVATE eio)
endif()
if(WIN32)
add_library(coro STATIC libcoroutine/Common.c libcoroutine/Coro.c)
target_link_libraries(fdbrpc PRIVATE coro)
target_link_libraries(fdbrpc_sampling PRIVATE coro)
endif()

View File

@ -24,6 +24,7 @@
#include "flow/UnitTest.h"
#include "flow/DeterministicRandom.h"
#include "flow/IThreadPool.h"
#include "flow/WriteOnlySet.h"
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/IAsyncFile.h"
#include "flow/TLSConfig.actor.h"
@ -285,6 +286,11 @@ struct YieldMockNetwork final : INetwork, ReferenceCounted<YieldMockNetwork> {
static TLSConfig emptyConfig;
return emptyConfig;
}
#ifdef ENABLE_SAMPLING
ActorLineageSet& getActorLineageSet() override {
throw std::exception();
}
#endif
ProtocolVersion protocolVersion() override { return baseNetwork->protocolVersion(); }
};

View File

@ -158,7 +158,10 @@ const Endpoint& EndpointMap::insert(NetworkAddressList localAddresses,
NetworkMessageReceiver* EndpointMap::get(Endpoint::Token const& token) {
uint32_t index = token.second();
if (index < wellKnownEndpointCount && data[index].receiver == nullptr) {
TraceEvent(SevWarnAlways, "WellKnownEndpointNotAdded").detail("Token", token).detail("Index", index).backtrace();
TraceEvent(SevWarnAlways, "WellKnownEndpointNotAdded")
.detail("Token", token)
.detail("Index", index)
.backtrace();
}
if (index < data.size() && data[index].token().first() == token.first() &&
((data[index].token().second() & 0xffffffff00000000LL) | index) == token.second())
@ -923,6 +926,7 @@ ACTOR static void deliver(TransportData* self,
// ReadSocket) we can just upgrade. Otherwise we'll context switch so that we don't block other tasks that might run
// with a higher priority. ReplyPromiseStream needs to guarentee that messages are recieved in the order they were
// sent, so we are using orderedDelay.
// NOTE: don't skip delay(0) when it's local deliver since it could cause out of order object deconstruction.
if (priority < TaskPriority::ReadSocket || !inReadSocket) {
wait(orderedDelay(0, priority));
} else {

View File

@ -25,6 +25,7 @@
#include <ctime>
#include "flow/flow.h"
#include "flow/WriteOnlySet.h"
#include "fdbrpc/IRateControl.h"
// All outstanding operations must be cancelled before the destructor of IAsyncFile is called.
@ -119,6 +120,11 @@ public:
// Returns the time of the last modification of the file.
virtual Future<std::time_t> lastWriteTime(const std::string& filename) = 0;
#ifdef ENABLE_SAMPLING
// Returns the shared memory data structure used to store actor lineages.
virtual ActorLineageSet& getActorLineageSet() = 0;
#endif
static IAsyncFileSystem* filesystem() { return filesystem(g_network); }
static runCycleFuncPtr runCycleFunc() {
return reinterpret_cast<runCycleFuncPtr>(

View File

@ -71,6 +71,7 @@ struct ProcessClass {
Ratekeeper,
StorageCache,
Backup,
Worker, // used for actor lineage tracking
NoRole
};
enum ClassSource { CommandLineSource, AutoSource, DBSource, InvalidSource = -1 };

View File

@ -98,6 +98,12 @@ Future<std::time_t> Net2FileSystem::lastWriteTime(const std::string& filename) {
return Net2AsyncFile::lastWriteTime(filename);
}
#ifdef ENABLE_SAMPLING
ActorLineageSet& Net2FileSystem::getActorLineageSet() {
return actorLineageSet;
}
#endif
void Net2FileSystem::newFileSystem(double ioTimeout, const std::string& fileSystemPath) {
g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Net2FileSystem(ioTimeout, fileSystemPath));
}

View File

@ -39,6 +39,10 @@ public:
Future<Void> renameFile(std::string const& from, std::string const& to) override;
#ifdef ENABLE_SAMPLING
ActorLineageSet& getActorLineageSet() override;
#endif
// void init();
static void stop();
@ -52,6 +56,9 @@ public:
dev_t fileSystemDeviceId;
bool checkFileSystem;
#endif
#ifdef ENABLE_SAMPLING
ActorLineageSet actorLineageSet;
#endif
};
#endif

View File

@ -300,7 +300,7 @@ struct AcknowledgementReceiver final : FlowReceiver, FastAllocated<Acknowledgeme
Promise<Void> hold = ready;
hold.sendError(message.getError());
} else {
ASSERT(message.get().bytes > bytesAcknowledged);
ASSERT(message.get().bytes > bytesAcknowledged || (message.get().bytes < 0 && bytesAcknowledged > 0));
bytesAcknowledged = message.get().bytes;
if (ready.isValid() && bytesSent - bytesAcknowledged < bytesLimit) {
Promise<Void> hold = ready;
@ -393,7 +393,8 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue<T>,
false);
}
if (isRemoteEndpoint() && !sentError && !acknowledgements.failures.isReady()) {
// The ReplyPromiseStream was cancelled before sending an error, so the storage server must have died
// Notify the client ReplyPromiseStream was cancelled before sending an error, so the storage server must
// have died
FlowTransport::transport().sendUnreliable(SerializeSource<ErrorOr<EnsureTable<T>>>(broken_promise()),
getEndpoint(TaskPriority::ReadSocket),
false);
@ -413,6 +414,7 @@ public:
void send(U&& value) const {
if (queue->isRemoteEndpoint()) {
if (!queue->acknowledgements.getRawEndpoint().isValid()) {
// register acknowledge receiver on sender and tell the receiver where to send acknowledge messages
value.acknowledgeToken = queue->acknowledgements.getEndpoint(TaskPriority::ReadSocket).token;
}
queue->acknowledgements.bytesSent += value.expectedSize();
@ -474,6 +476,8 @@ public:
errors->delPromiseRef();
}
// The endpoints of a ReplyPromiseStream must be initialized at Task::ReadSocket, because with lower priorities
// a delay(0) in FlowTransport deliver can cause out of order delivery.
const Endpoint& getEndpoint() const { return queue->getEndpoint(TaskPriority::ReadSocket); }
bool operator==(const ReplyPromiseStream<T>& rhs) const { return queue == rhs.queue; }

View File

@ -197,7 +197,7 @@ struct PeerHolder {
}
};
// Implements getRepyStream, this a void actor with the same lifetime as the input ReplyPromiseStream.
// Implements getReplyStream, this a void actor with the same lifetime as the input ReplyPromiseStream.
// Because this actor holds a reference to the stream, normally it would be impossible to know when there are no other
// references. To get around this, there is a SAV inside the stream that has one less promise reference than it should
// (caused by getErrorFutureAndDelPromiseRef()). When that SAV gets a broken promise because no one besides this void

View File

@ -31,6 +31,7 @@
#include "flow/IThreadPool.h"
#include "flow/ProtocolVersion.h"
#include "flow/Util.h"
#include "flow/WriteOnlySet.h"
#include "fdbrpc/IAsyncFile.h"
#include "fdbrpc/AsyncFileCached.actor.h"
#include "fdbrpc/AsyncFileEncrypted.h"
@ -984,6 +985,12 @@ public:
bool checkRunnable() override { return net2->checkRunnable(); }
#ifdef ENABLE_SAMPLING
ActorLineageSet& getActorLineageSet() override {
return actorLineageSet;
}
#endif
void stop() override { isStopped = true; }
void addStopCallback(std::function<void()> fn) override { stopCallbacks.emplace_back(std::move(fn)); }
bool isSimulated() const override { return true; }
@ -1893,7 +1900,7 @@ public:
KillType ktResult, ktMin = kt;
for (auto& datacenterMachine : datacenterMachines) {
if (deterministicRandom()->random01() < 0.99) {
if (deterministicRandom()->random01() < 0.99 || forceKill) {
killMachine(datacenterMachine.first, kt, true, &ktResult);
if (ktResult != kt) {
TraceEvent(SevWarn, "KillDCFail")
@ -2127,6 +2134,10 @@ public:
// Whether or not yield has returned true during the current iteration of the run loop
bool yielded;
int yield_limit; // how many more times yield may return false before next returning true
#ifdef ENABLE_SAMPLING
ActorLineageSet actorLineageSet;
#endif
};
class UDPSimSocket : public IUDPSocket, ReferenceCounted<UDPSimSocket> {
@ -2519,6 +2530,12 @@ Future<std::time_t> Sim2FileSystem::lastWriteTime(const std::string& filename) {
return fileWrites[filename];
}
#ifdef ENABLE_SAMPLING
ActorLineageSet& Sim2FileSystem::getActorLineageSet() {
return actorLineageSet;
}
#endif
void Sim2FileSystem::newFileSystem() {
g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Sim2FileSystem());
}

View File

@ -409,6 +409,7 @@ public:
std::vector<Optional<Standalone<StringRef>>> primarySatelliteDcIds;
std::vector<Optional<Standalone<StringRef>>> remoteSatelliteDcIds;
TSSMode tssMode;
ConfigDBType configDBType;
// Used by workloads that perform reconfigurations
int testerCount;
@ -480,6 +481,10 @@ public:
Future<std::time_t> lastWriteTime(const std::string& filename) override;
#ifdef ENABLE_SAMPLING
ActorLineageSet& getActorLineageSet() override;
#endif
Future<Void> renameFile(std::string const& from, std::string const& to) override;
Sim2FileSystem() {}
@ -487,6 +492,10 @@ public:
~Sim2FileSystem() override {}
static void newFileSystem();
#ifdef ENABLE_SAMPLING
ActorLineageSet actorLineageSet;
#endif
};
#endif

View File

@ -131,8 +131,8 @@ void applyMetadataMutations(SpanID const& spanContext,
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivateMutation", dbgid)
.detail("Original", m.toString())
.detail("Privatized", privatized.toString())
.detail("Original", m)
.detail("Privatized", privatized)
.detail("Server", serverKeysDecodeServer(m.param1))
.detail("TagKey", serverTagKeyFor(serverKeysDecodeServer(m.param1)))
.detail("Tag", tag.toString());
@ -218,8 +218,8 @@ void applyMetadataMutations(SpanID const& spanContext,
(!m.param1.startsWith(failedLocalityPrefix) && m.param1 != failedLocalityVersionKey)) {
auto t = txnStateStore->readValue(m.param1).get();
TraceEvent("MutationRequiresRestart", dbgid)
.detail("M", m.toString())
.detail("PrevValue", t.present() ? t.get() : LiteralStringRef("(none)"))
.detail("M", m)
.detail("PrevValue", t.orDefault("(none)"_sr))
.detail("ToCommit", toCommit != nullptr);
confChange = true;
}
@ -431,7 +431,7 @@ void applyMetadataMutations(SpanID const& spanContext,
txnStateStore->clear(range & configKeys);
if (!excludedServersKeys.contains(range) && !failedServersKeys.contains(range) &&
!excludedLocalityKeys.contains(range) && !failedLocalityKeys.contains(range)) {
TraceEvent("MutationRequiresRestart", dbgid).detail("M", m.toString());
TraceEvent("MutationRequiresRestart", dbgid).detail("M", m);
confChange = true;
}
}

View File

@ -744,7 +744,6 @@ ACTOR Future<Void> saveMutationsToFile(BackupData* self, Version popVersion, int
DEBUG_MUTATION("addMutation", message.version.version, m)
.detail("Version", message.version.toString())
.detail("Mutation", m)
.detail("KCV", self->minKnownCommittedVersion)
.detail("SavedVersion", self->savedVersion);

View File

@ -99,8 +99,11 @@ set(FDBSERVER_SRCS
RestoreWorkerInterface.actor.h
Resolver.actor.cpp
ResolverInterface.h
RoleLineage.actor.h
RoleLineage.actor.cpp
ServerDBInfo.actor.h
ServerDBInfo.h
SigStack.cpp
SimpleConfigConsumer.actor.cpp
SimpleConfigConsumer.h
SimulatedCluster.actor.cpp
@ -144,6 +147,7 @@ set(FDBSERVER_SRCS
workloads/BackupCorrectness.actor.cpp
workloads/BackupAndParallelRestoreCorrectness.actor.cpp
workloads/ClogSingleConnection.actor.cpp
workloads/ConfigIncrement.actor.cpp
workloads/BackupToBlob.actor.cpp
workloads/BackupToDBAbort.actor.cpp
workloads/BackupToDBCorrectness.actor.cpp
@ -304,6 +308,7 @@ else()
endif()
target_link_libraries(fdbserver PRIVATE toml11_target jemalloc)
# target_compile_definitions(fdbserver PRIVATE -DENABLE_SAMPLING)
if (GPERFTOOLS_FOUND)
target_link_libraries(fdbserver PRIVATE gperftools)

View File

@ -3148,7 +3148,6 @@ ACTOR Future<Void> clusterWatchDatabase(ClusterControllerData* cluster, ClusterC
dbInfo.distributor = db->serverInfo->get().distributor;
dbInfo.ratekeeper = db->serverInfo->get().ratekeeper;
dbInfo.latencyBandConfig = db->serverInfo->get().latencyBandConfig;
dbInfo.configBroadcaster = db->serverInfo->get().configBroadcaster;
TraceEvent("CCWDB", cluster->id)
.detail("Lifetime", dbInfo.masterLifetime.toString())
@ -3734,7 +3733,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
checkOutstandingRequests(self);
}
void registerWorker(RegisterWorkerRequest req, ClusterControllerData* self) {
void registerWorker(RegisterWorkerRequest req, ClusterControllerData* self, ConfigBroadcaster* configBroadcaster) {
const WorkerInterface& w = req.wi;
ProcessClass newProcessClass = req.processClass;
auto info = self->id_worker.find(w.locality.processId());
@ -3823,6 +3822,13 @@ void registerWorker(RegisterWorkerRequest req, ClusterControllerData* self) {
w.locality.processId() == self->db.serverInfo->get().master.locality.processId()) {
self->masterProcessId = w.locality.processId();
}
if (configBroadcaster != nullptr) {
self->addActor.send(configBroadcaster->registerWorker(
req.lastSeenKnobVersion,
req.knobConfigClassSet,
self->id_worker[w.locality.processId()].watcher,
self->id_worker[w.locality.processId()].details.interf.configBroadcastInterface));
}
checkOutstandingRequests(self);
} else if (info->second.details.interf.id() != w.id() || req.generation >= info->second.gen) {
if (!info->second.reply.isSet()) {
@ -3841,6 +3847,13 @@ void registerWorker(RegisterWorkerRequest req, ClusterControllerData* self) {
info->second.details.interf = w;
info->second.watcher = workerAvailabilityWatch(w, newProcessClass, self);
}
if (configBroadcaster != nullptr) {
self->addActor.send(
configBroadcaster->registerWorker(req.lastSeenKnobVersion,
req.knobConfigClassSet,
info->second.watcher,
info->second.details.interf.configBroadcastInterface));
}
checkOutstandingRequests(self);
} else {
TEST(true); // Received an old worker registration request.
@ -3925,13 +3938,11 @@ ACTOR Future<Void> timeKeeper(ClusterControllerData* self) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(self->cx);
loop {
try {
state UID debugID = deterministicRandom()->randomUniqueID();
if (!g_network->isSimulated()) {
// This is done to provide an arbitrary logged transaction every ~10s.
// FIXME: replace or augment this with logging on the proxy which tracks
// how long it is taking to hear responses from each other component.
UID debugID = deterministicRandom()->randomUniqueID();
TraceEvent("TimeKeeperCommit", debugID).log();
// how long it is taking to hear responses from each other component.
tr->debugTransaction(debugID);
}
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
@ -3946,7 +3957,9 @@ ACTOR Future<Void> timeKeeper(ClusterControllerData* self) {
Version v = tr->getReadVersion().get();
int64_t currentTime = (int64_t)now();
versionMap.set(tr, currentTime, v);
if (!g_network->isSimulated()) {
TraceEvent("TimeKeeperCommit", debugID).detail("Version", v);
}
int64_t ttl = currentTime - SERVER_KNOBS->TIME_KEEPER_DELAY * SERVER_KNOBS->TIME_KEEPER_MAX_ENTRIES;
if (ttl > 0) {
versionMap.erase(tr, 0, ttl);
@ -4775,22 +4788,19 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
Future<Void> leaderFail,
ServerCoordinators coordinators,
LocalityData locality,
UseConfigDB useConfigDB) {
ConfigDBType configDBType) {
state ClusterControllerData self(interf, locality, coordinators);
state ConfigBroadcaster configBroadcaster(coordinators, useConfigDB);
state ConfigBroadcaster configBroadcaster(coordinators, configDBType);
state Future<Void> coordinationPingDelay = delay(SERVER_KNOBS->WORKER_COORDINATION_PING_DELAY);
state uint64_t step = 0;
state Future<ErrorOr<Void>> error = errorOr(actorCollection(self.addActor.getFuture()));
if (useConfigDB != UseConfigDB::DISABLED) {
self.addActor.send(configBroadcaster.serve(self.db.serverInfo->get().configBroadcaster));
}
self.addActor.send(clusterWatchDatabase(&self, &self.db)); // Start the master database
self.addActor.send(self.updateWorkerList.init(self.db.db));
self.addActor.send(statusServer(interf.clientInterface.databaseStatus.getFuture(),
&self,
coordinators,
(useConfigDB == UseConfigDB::DISABLED) ? nullptr : &configBroadcaster));
(configDBType == ConfigDBType::DISABLED) ? nullptr : &configBroadcaster));
self.addActor.send(timeKeeper(&self));
self.addActor.send(monitorProcessClasses(&self));
self.addActor.send(monitorServerInfoConfig(&self.db));
@ -4842,24 +4852,24 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
}
when(RegisterWorkerRequest req = waitNext(interf.registerWorker.getFuture())) {
++self.registerWorkerRequests;
registerWorker(req, &self);
registerWorker(req, &self, (configDBType == ConfigDBType::DISABLED) ? nullptr : &configBroadcaster);
}
when(GetWorkersRequest req = waitNext(interf.getWorkers.getFuture())) {
++self.getWorkersRequests;
vector<WorkerDetails> workers;
for (auto& it : self.id_worker) {
for (auto const& [id, worker] : self.id_worker) {
if ((req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) &&
self.db.config.isExcludedServer(it.second.details.interf.addresses())) {
self.db.config.isExcludedServer(worker.details.interf.addresses())) {
continue;
}
if ((req.flags & GetWorkersRequest::TESTER_CLASS_ONLY) &&
it.second.details.processClass.classType() != ProcessClass::TesterClass) {
worker.details.processClass.classType() != ProcessClass::TesterClass) {
continue;
}
workers.push_back(it.second.details);
workers.push_back(worker.details);
}
req.reply.send(workers);
@ -4918,7 +4928,7 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
bool hasConnected,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
LocalityData locality,
UseConfigDB useConfigDB) {
ConfigDBType configDBType) {
loop {
state ClusterControllerFullInterface cci;
state bool inRole = false;
@ -4945,7 +4955,7 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
startRole(Role::CLUSTER_CONTROLLER, cci.id(), UID());
inRole = true;
wait(clusterControllerCore(cci, leaderFail, coordinators, locality, useConfigDB));
wait(clusterControllerCore(cci, leaderFail, coordinators, locality, configDBType));
}
} catch (Error& e) {
if (inRole)
@ -4969,13 +4979,13 @@ ACTOR Future<Void> clusterController(Reference<ClusterConnectionFile> connFile,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
Future<Void> recoveredDiskFiles,
LocalityData locality,
UseConfigDB useConfigDB) {
ConfigDBType configDBType) {
wait(recoveredDiskFiles);
state bool hasConnected = false;
loop {
try {
ServerCoordinators coordinators(connFile);
wait(clusterController(coordinators, currentCC, hasConnected, asyncPriorityInfo, locality, useConfigDB));
wait(clusterController(coordinators, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType));
} catch (Error& e) {
if (e.code() != error_code_coordinators_changed)
throw; // Expected to terminate fdbserver

View File

@ -28,6 +28,7 @@
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/TransactionLineage.h"
#include "fdbrpc/sim_validation.h"
#include "fdbserver/ApplyMetadataMutation.h"
#include "fdbserver/ConflictSet.h"
@ -953,10 +954,8 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
pProxyCommitData->singleKeyMutationEvent->log();
}
DEBUG_MUTATION("ProxyCommit", self->commitVersion, m)
.detail("Dbgid", pProxyCommitData->dbgid)
.detail("To", tags)
.detail("Mutation", m);
DEBUG_MUTATION("ProxyCommit", self->commitVersion, m, pProxyCommitData->dbgid)
.detail("To", tags);
self->toCommit.addTags(tags);
if (pProxyCommitData->cacheInfo[m.param1]) {
self->toCommit.addTag(cacheTag);
@ -969,10 +968,8 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
++firstRange;
if (firstRange == ranges.end()) {
// Fast path
DEBUG_MUTATION("ProxyCommit", self->commitVersion, m)
.detail("Dbgid", pProxyCommitData->dbgid)
.detail("To", ranges.begin().value().tags)
.detail("Mutation", m);
DEBUG_MUTATION("ProxyCommit", self->commitVersion, m, pProxyCommitData->dbgid)
.detail("To", ranges.begin().value().tags);
ranges.begin().value().populateTags();
self->toCommit.addTags(ranges.begin().value().tags);
@ -1007,10 +1004,8 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
trCost->get().clearIdxCosts.pop_front();
}
}
DEBUG_MUTATION("ProxyCommit", self->commitVersion, m)
.detail("Dbgid", pProxyCommitData->dbgid)
.detail("To", allSources)
.detail("Mutation", m);
DEBUG_MUTATION("ProxyCommit", self->commitVersion, m, pProxyCommitData->dbgid)
.detail("To", allSources);
self->toCommit.addTags(allSources);
}
@ -1418,6 +1413,7 @@ ACTOR Future<Void> commitBatch(ProxyCommitData* self,
// WARNING: this code is run at a high priority (until the first delay(0)), so it needs to do as little work as
// possible
state CommitBatch::CommitBatchContext context(self, trs, currentBatchMemBytesCount);
getCurrentLineage()->modify(&TransactionLineage::operation) = TransactionLineage::Operation::Commit;
// Active load balancing runs at a very high priority (to obtain accurate estimate of memory used by commit batches)
// so we need to downgrade here
@ -1468,6 +1464,8 @@ void maybeAddTssMapping(GetKeyServerLocationsReply& reply,
ACTOR static Future<Void> doKeyServerLocationRequest(GetKeyServerLocationsRequest req, ProxyCommitData* commitData) {
// We can't respond to these requests until we have valid txnStateStore
getCurrentLineage()->modify(&TransactionLineage::operation) = TransactionLineage::Operation::GetKeyServersLocations;
getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.first();
wait(commitData->validState.getFuture());
wait(delay(0, TaskPriority::DefaultEndpoint));
@ -1543,7 +1541,7 @@ ACTOR static Future<Void> rejoinServer(CommitProxyInterface proxy, ProxyCommitDa
// We can't respond to these requests until we have valid txnStateStore
wait(commitData->validState.getFuture());
TraceEvent("ProxyReadyForReads", proxy.id());
TraceEvent("ProxyReadyForReads", proxy.id()).log();
loop {
GetStorageServerRejoinInfoRequest req = waitNext(proxy.getStorageServerRejoinInfo.getFuture());
@ -1985,7 +1983,7 @@ ACTOR Future<Void> commitProxyServerCore(CommitProxyInterface proxy,
state KeyRange txnKeys = allKeys;
RangeResult UIDtoTagMap = commitData.txnStateStore->readRange(serverTagKeys).get();
state std::map<Tag, UID> tag_uid;
for (const KeyValueRef kv : UIDtoTagMap) {
for (const KeyValueRef& kv : UIDtoTagMap) {
tag_uid[decodeServerTagValue(kv.value)] = decodeServerTagKey(kv.key);
}
loop {

View File

@ -1,139 +0,0 @@
/*
* ConfigBroadcastFollowerInterface.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbserver/ConfigFollowerInterface.h"
class ConfigClassSet {
std::set<Key> classes;
public:
static constexpr FileIdentifier file_identifier = 9854021;
bool operator==(ConfigClassSet const& rhs) const { return classes == rhs.classes; }
bool operator!=(ConfigClassSet const& rhs) const { return !(*this == rhs); }
ConfigClassSet() = default;
ConfigClassSet(VectorRef<KeyRef> configClasses) {
for (const auto& configClass : configClasses) {
classes.insert(configClass);
}
}
bool contains(KeyRef configClass) const { return classes.count(configClass); }
std::set<Key> const& getClasses() const { return classes; }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, classes);
}
};
template <>
struct Traceable<ConfigClassSet> : std::true_type {
static std::string toString(ConfigClassSet const& value) { return describe(value.getClasses()); }
};
struct ConfigBroadcastFollowerGetSnapshotReply {
static constexpr FileIdentifier file_identifier = 8701983;
Version version{ 0 };
std::map<ConfigKey, KnobValue> snapshot;
ConfigBroadcastFollowerGetSnapshotReply() = default;
template <class Snapshot>
explicit ConfigBroadcastFollowerGetSnapshotReply(Version version, Snapshot&& snapshot)
: version(version), snapshot(std::forward<Snapshot>(snapshot)) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, snapshot);
}
};
struct ConfigBroadcastFollowerGetSnapshotRequest {
static constexpr FileIdentifier file_identifier = 10911924;
ConfigClassSet configClassSet;
ReplyPromise<ConfigBroadcastFollowerGetSnapshotReply> reply;
ConfigBroadcastFollowerGetSnapshotRequest() = default;
explicit ConfigBroadcastFollowerGetSnapshotRequest(ConfigClassSet const& configClassSet)
: configClassSet(configClassSet) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, configClassSet, reply);
}
};
struct ConfigBroadcastFollowerGetChangesReply {
static constexpr FileIdentifier file_identifier = 4014927;
Version mostRecentVersion;
Standalone<VectorRef<VersionedConfigMutationRef>> changes;
ConfigBroadcastFollowerGetChangesReply()=default;
explicit ConfigBroadcastFollowerGetChangesReply(Version mostRecentVersion, Standalone<VectorRef<VersionedConfigMutationRef>> const& changes)
: mostRecentVersion(mostRecentVersion), changes(changes) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, mostRecentVersion, changes);
}
};
struct ConfigBroadcastFollowerGetChangesRequest {
static constexpr FileIdentifier file_identifier = 601280;
Version lastSeenVersion;
ConfigClassSet configClassSet;
ReplyPromise<ConfigBroadcastFollowerGetChangesReply> reply;
ConfigBroadcastFollowerGetChangesRequest() = default;
explicit ConfigBroadcastFollowerGetChangesRequest(Version lastSeenVersion, ConfigClassSet const& configClassSet)
: lastSeenVersion(lastSeenVersion), configClassSet(configClassSet) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, lastSeenVersion, configClassSet, reply);
}
};
/*
* The ConfigBroadcaster serves a ConfigBroadcastFollowerInterface which all
* workers use to fetch updates.
*/
class ConfigBroadcastFollowerInterface {
UID _id;
public:
static constexpr FileIdentifier file_identifier = 1984391;
RequestStream<ConfigBroadcastFollowerGetSnapshotRequest> getSnapshot;
RequestStream<ConfigBroadcastFollowerGetChangesRequest> getChanges;
ConfigBroadcastFollowerInterface() : _id(deterministicRandom()->randomUniqueID()) {}
bool operator==(ConfigBroadcastFollowerInterface const& rhs) const { return (_id == rhs._id); }
bool operator!=(ConfigBroadcastFollowerInterface const& rhs) const { return !(*this == rhs); }
UID id() const { return _id; }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, _id, getSnapshot, getChanges);
}
};

View File

@ -0,0 +1,138 @@
/*
* ConfigBroadcastInterface.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbserver/ConfigFollowerInterface.h"
class ConfigClassSet {
std::set<Key> classes;
public:
static constexpr FileIdentifier file_identifier = 9854021;
bool operator==(ConfigClassSet const& rhs) const { return classes == rhs.classes; }
bool operator!=(ConfigClassSet const& rhs) const { return !(*this == rhs); }
ConfigClassSet() = default;
ConfigClassSet(VectorRef<KeyRef> configClasses) {
for (const auto& configClass : configClasses) {
classes.insert(configClass);
}
}
ConfigClassSet(std::vector<KeyRef> configClasses) {
for (const auto& configClass : configClasses) {
classes.insert(configClass);
}
}
bool contains(KeyRef configClass) const { return classes.count(configClass); }
std::set<Key> const& getClasses() const { return classes; }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, classes);
}
};
template <>
struct Traceable<ConfigClassSet> : std::true_type {
static std::string toString(ConfigClassSet const& value) { return describe(value.getClasses()); }
};
struct ConfigBroadcastSnapshotReply {
static constexpr FileIdentifier file_identifier = 8701984;
ConfigBroadcastSnapshotReply() = default;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar);
}
};
struct ConfigBroadcastSnapshotRequest {
static constexpr FileIdentifier file_identifier = 10911925;
Version version{ 0 };
std::map<ConfigKey, KnobValue> snapshot;
ReplyPromise<ConfigBroadcastSnapshotReply> reply;
ConfigBroadcastSnapshotRequest() = default;
template <class Snapshot>
explicit ConfigBroadcastSnapshotRequest(Version version, Snapshot&& snapshot)
: version(version), snapshot(std::forward<Snapshot>(snapshot)) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, snapshot, reply);
}
};
struct ConfigBroadcastChangesReply {
static constexpr FileIdentifier file_identifier = 4014928;
ConfigBroadcastChangesReply() = default;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar);
}
};
struct ConfigBroadcastChangesRequest {
static constexpr FileIdentifier file_identifier = 601281;
Version mostRecentVersion;
Standalone<VectorRef<VersionedConfigMutationRef>> changes;
ReplyPromise<ConfigBroadcastChangesReply> reply;
ConfigBroadcastChangesRequest() = default;
explicit ConfigBroadcastChangesRequest(Version mostRecentVersion,
Standalone<VectorRef<VersionedConfigMutationRef>> const& changes)
: mostRecentVersion(mostRecentVersion), changes(changes) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, mostRecentVersion, changes, reply);
}
};
/*
* The ConfigBroadcaster uses a ConfigBroadcastInterface from each worker to
* push updates made to the configuration database to the worker.
*/
class ConfigBroadcastInterface {
UID _id;
public:
static constexpr FileIdentifier file_identifier = 1676543;
RequestStream<ConfigBroadcastSnapshotRequest> snapshot;
RequestStream<ConfigBroadcastChangesRequest> changes;
ConfigBroadcastInterface() : _id(deterministicRandom()->randomUniqueID()) {}
bool operator==(ConfigBroadcastInterface const& rhs) const { return (_id == rhs._id); }
bool operator!=(ConfigBroadcastInterface const& rhs) const { return !(*this == rhs); }
UID id() const { return _id; }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, _id, snapshot, changes);
}
};

View File

@ -49,101 +49,74 @@ void remove(Container& container, K const& k) {
} // namespace
class ConfigBroadcasterImpl {
// PendingRequestStore stores a set of pending ConfigBroadcastFollowerGetChangesRequests,
// indexed by configuration class. When an update is received, replies are sent for all
// pending requests with affected configuration classes
class PendingRequestStore {
using Req = ConfigBroadcastFollowerGetChangesRequest;
std::map<Key, std::set<Endpoint::Token>> configClassToTokens;
std::map<Endpoint::Token, Req> tokenToRequest;
// Holds information about each client connected to the broadcaster.
struct BroadcastClientDetails {
// Triggered when the worker dies.
Future<Void> watcher;
ConfigClassSet configClassSet;
Version lastSeenVersion;
ConfigBroadcastInterface broadcastInterface;
public:
void addRequest(Req const& req) {
auto token = req.reply.getEndpoint().token;
tokenToRequest[token] = req;
for (const auto& configClass : req.configClassSet.getClasses()) {
configClassToTokens[configClass].insert(token);
}
bool operator==(BroadcastClientDetails const& rhs) const {
return configClassSet == rhs.configClassSet && lastSeenVersion == rhs.lastSeenVersion &&
broadcastInterface == rhs.broadcastInterface;
}
bool operator!=(BroadcastClientDetails const& rhs) const { return !(*this == rhs); }
};
std::vector<Req> getRequestsToNotify(Standalone<VectorRef<VersionedConfigMutationRef>> const& changes) const {
std::set<Endpoint::Token> tokenSet;
for (const auto& change : changes) {
if (!change.mutation.getConfigClass().present()) {
// Update everything
for (const auto& [token, req] : tokenToRequest) {
if (req.lastSeenVersion < change.version) {
tokenSet.insert(token);
}
}
} else {
Key configClass = change.mutation.getConfigClass().get();
if (configClassToTokens.count(configClass)) {
auto tokens = get(configClassToTokens, Key(change.mutation.getConfigClass().get()));
for (const auto& token : tokens) {
auto req = get(tokenToRequest, token);
if (req.lastSeenVersion < change.version) {
tokenSet.insert(token);
} else {
TEST(true); // Worker is ahead of config broadcaster
}
}
}
}
}
std::vector<Req> result;
for (const auto& token : tokenSet) {
result.push_back(get(tokenToRequest, token));
}
return result;
}
std::vector<Req> getOutdatedRequests(Version newSnapshotVersion) {
std::vector<Req> result;
for (const auto& [token, req] : tokenToRequest) {
if (req.lastSeenVersion < newSnapshotVersion) {
result.push_back(req);
}
}
return result;
}
void removeRequest(Req const& req) {
auto token = req.reply.getEndpoint().token;
for (const auto& configClass : req.configClassSet.getClasses()) {
remove(get(configClassToTokens, configClass), token);
// TODO: Don't leak config classes
}
remove(tokenToRequest, token);
}
} pending;
std::map<ConfigKey, KnobValue> snapshot;
std::deque<VersionedConfigMutation> mutationHistory;
std::deque<VersionedConfigCommitAnnotation> annotationHistory;
Version lastCompactedVersion;
Version mostRecentVersion;
std::unique_ptr<IConfigConsumer> consumer;
Future<Void> consumerFuture;
ActorCollection actors{ false };
std::vector<BroadcastClientDetails> clients;
UID id;
CounterCollection cc;
Counter compactRequest;
mutable Counter successfulChangeRequest;
Counter successfulChangeRequest;
Counter failedChangeRequest;
Counter snapshotRequest;
Future<Void> logger;
Future<Void> pushSnapshot(ConfigBroadcasterImpl* self,
Version snapshotVersion,
BroadcastClientDetails const& client) {
if (client.lastSeenVersion >= snapshotVersion) {
return Void();
}
++snapshotRequest;
ConfigBroadcastSnapshotRequest request;
for (const auto& [key, value] : self->snapshot) {
if (matchesConfigClass(client.configClassSet, key.configClass)) {
request.snapshot[key] = value;
}
}
request.version = snapshotVersion;
TraceEvent(SevDebug, "ConfigBroadcasterSnapshotRequest", id)
.detail("Size", request.snapshot.size())
.detail("Version", request.version);
return success(client.broadcastInterface.snapshot.getReply(request));
}
template <class Changes>
void sendChangesReply(ConfigBroadcastFollowerGetChangesRequest const& req, Changes const& changes) const {
ASSERT_LT(req.lastSeenVersion, mostRecentVersion);
ConfigBroadcastFollowerGetChangesReply reply;
reply.mostRecentVersion = mostRecentVersion;
Future<Void> pushChanges(BroadcastClientDetails& client, Changes const& changes) {
// Skip if client has already seen the latest version.
if (client.lastSeenVersion >= mostRecentVersion) {
return Void();
}
ConfigBroadcastChangesRequest req;
for (const auto& versionedMutation : changes) {
if (versionedMutation.version > req.lastSeenVersion &&
matchesConfigClass(req.configClassSet, versionedMutation.mutation.getConfigClass())) {
if (versionedMutation.version > client.lastSeenVersion &&
matchesConfigClass(client.configClassSet, versionedMutation.mutation.getConfigClass())) {
TraceEvent te(SevDebug, "ConfigBroadcasterSendingChangeMutation", id);
te.detail("Version", versionedMutation.version)
.detail("ReqLastSeenVersion", req.lastSeenVersion)
.detail("ReqLastSeenVersion", client.lastSeenVersion)
.detail("ConfigClass", versionedMutation.mutation.getConfigClass())
.detail("KnobName", versionedMutation.mutation.getKnobName());
if (versionedMutation.mutation.isSet()) {
@ -152,54 +125,18 @@ class ConfigBroadcasterImpl {
te.detail("Op", "Clear");
}
reply.changes.push_back_deep(reply.changes.arena(), versionedMutation);
req.changes.push_back_deep(req.changes.arena(), versionedMutation);
}
}
req.reply.send(reply);
++successfulChangeRequest;
}
ACTOR static Future<Void> serve(ConfigBroadcaster* self,
ConfigBroadcasterImpl* impl,
ConfigBroadcastFollowerInterface cbfi) {
impl->actors.add(impl->consumer->consume(*self));
loop {
choose {
when(ConfigBroadcastFollowerGetSnapshotRequest req = waitNext(cbfi.getSnapshot.getFuture())) {
++impl->snapshotRequest;
ConfigBroadcastFollowerGetSnapshotReply reply;
for (const auto& [key, value] : impl->snapshot) {
if (matchesConfigClass(req.configClassSet, key.configClass)) {
reply.snapshot[key] = value;
}
}
reply.version = impl->mostRecentVersion;
TraceEvent(SevDebug, "ConfigBroadcasterGotSnapshotRequest", impl->id)
.detail("Size", reply.snapshot.size())
.detail("Version", reply.version);
req.reply.send(reply);
}
when(ConfigBroadcastFollowerGetChangesRequest req = waitNext(cbfi.getChanges.getFuture())) {
if (req.lastSeenVersion < impl->lastCompactedVersion) {
req.reply.sendError(version_already_compacted());
++impl->failedChangeRequest;
continue;
}
if (req.lastSeenVersion < impl->mostRecentVersion) {
impl->sendChangesReply(req, impl->mutationHistory);
} else {
TEST(req.lastSeenVersion > impl->mostRecentVersion); // Worker is ahead of ConfigBroadcaster
TraceEvent(SevDebug, "ConfigBroadcasterRegisteringChangeRequest", impl->id)
.detail("Peer", req.reply.getEndpoint().getPrimaryAddress())
.detail("MostRecentVersion", impl->mostRecentVersion)
.detail("ReqLastSeenVersion", req.lastSeenVersion)
.detail("ConfigClass", req.configClassSet);
impl->pending.addRequest(req);
}
}
when(wait(impl->actors.getResult())) { ASSERT(false); }
}
if (req.changes.size() == 0) {
return Void();
}
client.lastSeenVersion = mostRecentVersion;
req.mostRecentVersion = mostRecentVersion;
++successfulChangeRequest;
return success(client.broadcastInterface.changes.getReply(req));
}
ConfigBroadcasterImpl()
@ -211,25 +148,6 @@ class ConfigBroadcasterImpl {
"ConfigBroadcasterMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigBroadcasterMetrics");
}
void notifyFollowers(Standalone<VectorRef<VersionedConfigMutationRef>> const& changes) {
auto toNotify = pending.getRequestsToNotify(changes);
TraceEvent(SevDebug, "ConfigBroadcasterNotifyingFollowers", id)
.detail("ChangesSize", changes.size())
.detail("ToNotify", toNotify.size());
for (auto& req : toNotify) {
sendChangesReply(req, changes);
pending.removeRequest(req);
}
}
void notifyOutdatedRequests() {
auto outdated = pending.getOutdatedRequests(mostRecentVersion);
for (auto& req : outdated) {
req.reply.sendError(version_already_compacted());
pending.removeRequest(req);
}
}
void addChanges(Standalone<VectorRef<VersionedConfigMutationRef>> const& changes,
Version mostRecentVersion,
Standalone<VectorRef<VersionedConfigCommitAnnotationRef>> const& annotations) {
@ -244,18 +162,64 @@ class ConfigBroadcasterImpl {
snapshot.erase(mutation.getKey());
}
}
for (auto& client : clients) {
actors.add(brokenPromiseToNever(pushChanges(client, changes)));
}
}
template <class Snapshot>
Future<Void> setSnapshot(Snapshot&& snapshot, Version snapshotVersion) {
this->snapshot = std::forward<Snapshot>(snapshot);
Future<Void> setSnapshot(Snapshot& snapshot, Version snapshotVersion) {
this->snapshot = snapshot;
this->lastCompactedVersion = snapshotVersion;
std::vector<Future<Void>> futures;
for (const auto& client : clients) {
futures.push_back(brokenPromiseToNever(pushSnapshot(this, snapshotVersion, client)));
}
return waitForAll(futures);
}
ACTOR template <class Snapshot>
static Future<Void> pushSnapshotAndChanges(ConfigBroadcasterImpl* self,
Snapshot snapshot,
Version snapshotVersion,
Standalone<VectorRef<VersionedConfigMutationRef>> changes,
Version changesVersion,
Standalone<VectorRef<VersionedConfigCommitAnnotationRef>> annotations) {
// Make sure all snapshot messages were received before sending changes.
wait(self->setSnapshot(snapshot, snapshotVersion));
self->addChanges(changes, changesVersion, annotations);
return Void();
}
ACTOR static Future<Void> waitForFailure(ConfigBroadcasterImpl* self,
Future<Void> watcher,
BroadcastClientDetails* client) {
wait(success(watcher));
self->clients.erase(std::remove(self->clients.begin(), self->clients.end(), *client));
return Void();
}
public:
Future<Void> serve(ConfigBroadcaster* self, ConfigBroadcastFollowerInterface const& cbfi) {
return serve(self, this, cbfi);
Future<Void> registerWorker(ConfigBroadcaster* self,
Version lastSeenVersion,
ConfigClassSet configClassSet,
Future<Void> watcher,
ConfigBroadcastInterface broadcastInterface) {
if (!consumerFuture.isValid()) {
consumerFuture = consumer->consume(*self);
}
auto client = BroadcastClientDetails{
watcher, std::move(configClassSet), lastSeenVersion, std::move(broadcastInterface)
};
// Push all dynamic knobs to worker if it isn't up to date.
Future<Void> result = pushSnapshot(this, mostRecentVersion, client);
clients.push_back(std::move(client));
actors.add(waitForFailure(this, watcher, &clients.back()));
return result;
}
void applyChanges(Standalone<VectorRef<VersionedConfigMutationRef>> const& changes,
@ -267,7 +231,6 @@ public:
.detail("NewMostRecentVersion", mostRecentVersion)
.detail("AnnotationsSize", annotations.size());
addChanges(changes, mostRecentVersion, annotations);
notifyFollowers(changes);
}
template <class Snapshot>
@ -283,9 +246,7 @@ public:
.detail("ChangesSize", changes.size())
.detail("ChangesVersion", changesVersion)
.detail("AnnotationsSize", annotations.size());
setSnapshot(std::forward<Snapshot>(snapshot), snapshotVersion);
addChanges(changes, changesVersion, annotations);
notifyOutdatedRequests();
actors.add(pushSnapshotAndChanges(this, snapshot, snapshotVersion, changes, changesVersion, annotations));
}
ConfigBroadcasterImpl(ConfigFollowerInterface const& cfi) : ConfigBroadcasterImpl() {
@ -293,16 +254,16 @@ public:
TraceEvent(SevDebug, "ConfigBroadcasterStartingConsumer", id).detail("Consumer", consumer->getID());
}
ConfigBroadcasterImpl(ServerCoordinators const& coordinators, UseConfigDB useConfigDB) : ConfigBroadcasterImpl() {
if (useConfigDB != UseConfigDB::DISABLED) {
if (useConfigDB == UseConfigDB::SIMPLE) {
ConfigBroadcasterImpl(ServerCoordinators const& coordinators, ConfigDBType configDBType) : ConfigBroadcasterImpl() {
if (configDBType != ConfigDBType::DISABLED) {
if (configDBType == ConfigDBType::SIMPLE) {
consumer = IConfigConsumer::createSimple(coordinators, 0.5, Optional<double>{});
} else {
consumer = IConfigConsumer::createPaxos(coordinators, 0.5, Optional<double>{});
}
TraceEvent(SevDebug, "BroadcasterStartingConsumer", id)
.detail("Consumer", consumer->getID())
.detail("UsingSimpleConsumer", useConfigDB == UseConfigDB::SIMPLE);
.detail("UsingSimpleConsumer", configDBType == ConfigDBType::SIMPLE);
}
}
@ -361,16 +322,18 @@ public:
}
}
Future<Void> getError() const { return consumerFuture || actors.getResult(); }
UID getID() const { return id; }
static void runPendingRequestStoreTest(bool includeGlobalMutation, int expectedMatches);
};
ConfigBroadcaster::ConfigBroadcaster(ConfigFollowerInterface const& cfi)
: _impl(std::make_unique<ConfigBroadcasterImpl>(cfi)) {}
: impl(PImpl<ConfigBroadcasterImpl>::create(cfi)) {}
ConfigBroadcaster::ConfigBroadcaster(ServerCoordinators const& coordinators, UseConfigDB useConfigDB)
: _impl(std::make_unique<ConfigBroadcasterImpl>(coordinators, useConfigDB)) {}
ConfigBroadcaster::ConfigBroadcaster(ServerCoordinators const& coordinators, ConfigDBType configDBType)
: impl(PImpl<ConfigBroadcasterImpl>::create(coordinators, configDBType)) {}
ConfigBroadcaster::ConfigBroadcaster(ConfigBroadcaster&&) = default;
@ -378,14 +341,17 @@ ConfigBroadcaster& ConfigBroadcaster::operator=(ConfigBroadcaster&&) = default;
ConfigBroadcaster::~ConfigBroadcaster() = default;
Future<Void> ConfigBroadcaster::serve(ConfigBroadcastFollowerInterface const& cbfi) {
return impl().serve(this, cbfi);
Future<Void> ConfigBroadcaster::registerWorker(Version lastSeenVersion,
ConfigClassSet configClassSet,
Future<Void> watcher,
ConfigBroadcastInterface broadcastInterface) {
return impl->registerWorker(this, lastSeenVersion, std::move(configClassSet), watcher, broadcastInterface);
}
void ConfigBroadcaster::applyChanges(Standalone<VectorRef<VersionedConfigMutationRef>> const& changes,
Version mostRecentVersion,
Standalone<VectorRef<VersionedConfigCommitAnnotationRef>> const& annotations) {
impl().applyChanges(changes, mostRecentVersion, annotations);
impl->applyChanges(changes, mostRecentVersion, annotations);
}
void ConfigBroadcaster::applySnapshotAndChanges(
@ -394,7 +360,7 @@ void ConfigBroadcaster::applySnapshotAndChanges(
Standalone<VectorRef<VersionedConfigMutationRef>> const& changes,
Version changesVersion,
Standalone<VectorRef<VersionedConfigCommitAnnotationRef>> const& annotations) {
impl().applySnapshotAndChanges(snapshot, snapshotVersion, changes, changesVersion, annotations);
impl->applySnapshotAndChanges(snapshot, snapshotVersion, changes, changesVersion, annotations);
}
void ConfigBroadcaster::applySnapshotAndChanges(
@ -403,76 +369,21 @@ void ConfigBroadcaster::applySnapshotAndChanges(
Standalone<VectorRef<VersionedConfigMutationRef>> const& changes,
Version changesVersion,
Standalone<VectorRef<VersionedConfigCommitAnnotationRef>> const& annotations) {
impl().applySnapshotAndChanges(std::move(snapshot), snapshotVersion, changes, changesVersion, annotations);
impl->applySnapshotAndChanges(std::move(snapshot), snapshotVersion, changes, changesVersion, annotations);
}
Future<Void> ConfigBroadcaster::getError() const {
return impl->getError();
}
UID ConfigBroadcaster::getID() const {
return impl().getID();
return impl->getID();
}
JsonBuilderObject ConfigBroadcaster::getStatus() const {
return impl().getStatus();
return impl->getStatus();
}
void ConfigBroadcaster::compact(Version compactionVersion) {
impl().compact(compactionVersion);
}
namespace {
Standalone<VectorRef<VersionedConfigMutationRef>> getTestChanges(Version version, bool includeGlobalMutation) {
Standalone<VectorRef<VersionedConfigMutationRef>> changes;
if (includeGlobalMutation) {
ConfigKey key = ConfigKeyRef({}, "test_long"_sr);
auto value = KnobValue::create(int64_t{ 5 });
ConfigMutation mutation = ConfigMutationRef(key, value.contents());
changes.emplace_back_deep(changes.arena(), version, mutation);
}
{
ConfigKey key = ConfigKeyRef("class-A"_sr, "test_long"_sr);
auto value = KnobValue::create(int64_t{ 5 });
ConfigMutation mutation = ConfigMutationRef(key, value.contents());
changes.emplace_back_deep(changes.arena(), version, mutation);
}
return changes;
}
ConfigBroadcastFollowerGetChangesRequest getTestRequest(Version lastSeenVersion,
std::vector<KeyRef> const& configClasses) {
Standalone<VectorRef<KeyRef>> configClassesVector;
for (const auto& configClass : configClasses) {
configClassesVector.push_back_deep(configClassesVector.arena(), configClass);
}
return ConfigBroadcastFollowerGetChangesRequest{ lastSeenVersion, ConfigClassSet{ configClassesVector } };
}
} // namespace
void ConfigBroadcasterImpl::runPendingRequestStoreTest(bool includeGlobalMutation, int expectedMatches) {
PendingRequestStore pending;
for (Version v = 0; v < 5; ++v) {
pending.addRequest(getTestRequest(v, {}));
pending.addRequest(getTestRequest(v, { "class-A"_sr }));
pending.addRequest(getTestRequest(v, { "class-B"_sr }));
pending.addRequest(getTestRequest(v, { "class-A"_sr, "class-B"_sr }));
}
auto toNotify = pending.getRequestsToNotify(getTestChanges(0, includeGlobalMutation));
ASSERT_EQ(toNotify.size(), 0);
for (Version v = 1; v <= 5; ++v) {
auto toNotify = pending.getRequestsToNotify(getTestChanges(v, includeGlobalMutation));
ASSERT_EQ(toNotify.size(), expectedMatches);
for (const auto& req : toNotify) {
pending.removeRequest(req);
}
}
}
TEST_CASE("/fdbserver/ConfigDB/ConfigBroadcaster/Internal/PendingRequestStore/Simple") {
ConfigBroadcasterImpl::runPendingRequestStoreTest(false, 2);
return Void();
}
TEST_CASE("/fdbserver/ConfigDB/ConfigBroadcaster/Internal/PendingRequestStore/GlobalMutation") {
ConfigBroadcasterImpl::runPendingRequestStoreTest(true, 4);
return Void();
impl->compact(compactionVersion);
}

View File

@ -22,28 +22,31 @@
#include "fdbclient/CoordinationInterface.h"
#include "fdbclient/JsonBuilder.h"
#include "fdbclient/PImpl.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbserver/ConfigBroadcastFollowerInterface.h"
#include "fdbserver/ConfigBroadcastInterface.h"
#include "fdbserver/ConfigFollowerInterface.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/flow.h"
#include <memory>
/*
* The configuration broadcaster runs on the cluster controller. The broadcaster listens uses
* an IConfigConsumer instantitation to consume updates from the configuration database, and broadcasts
* an IConfigConsumer instantiation to consume updates from the configuration database, and broadcasts
* these updates to all workers' local configurations
*/
class ConfigBroadcaster {
std::unique_ptr<class ConfigBroadcasterImpl> _impl;
ConfigBroadcasterImpl& impl() { return *_impl; }
ConfigBroadcasterImpl const& impl() const { return *_impl; }
PImpl<class ConfigBroadcasterImpl> impl;
public:
explicit ConfigBroadcaster(ServerCoordinators const&, UseConfigDB);
explicit ConfigBroadcaster(ServerCoordinators const&, ConfigDBType);
ConfigBroadcaster(ConfigBroadcaster&&);
ConfigBroadcaster& operator=(ConfigBroadcaster&&);
~ConfigBroadcaster();
Future<Void> serve(ConfigBroadcastFollowerInterface const&);
Future<Void> registerWorker(Version lastSeenVersion,
ConfigClassSet configClassSet,
Future<Void> watcher,
ConfigBroadcastInterface worker);
void applyChanges(Standalone<VectorRef<VersionedConfigMutationRef>> const& changes,
Version mostRecentVersion,
Standalone<VectorRef<VersionedConfigCommitAnnotationRef>> const& annotations);
@ -57,6 +60,7 @@ public:
Standalone<VectorRef<VersionedConfigMutationRef>> const& changes,
Version changesVersion,
Standalone<VectorRef<VersionedConfigCommitAnnotationRef>> const& annotations);
Future<Void> getError() const;
UID getID() const;
JsonBuilderObject getStatus() const;
void compact(Version compactionVersion);

View File

@ -123,11 +123,11 @@ public:
class ReadFromLocalConfigEnvironment {
UID id;
std::string dataDir;
LocalConfiguration localConfiguration;
Reference<IAsyncListener<ConfigBroadcastFollowerInterface> const> cbfi;
Reference<LocalConfiguration> localConfiguration;
Reference<AsyncVar<ConfigBroadcastInterface> const> cbi;
Future<Void> consumer;
ACTOR static Future<Void> checkEventually(LocalConfiguration const* localConfiguration,
ACTOR static Future<Void> checkEventually(Reference<LocalConfiguration const> localConfiguration,
Optional<int64_t> expected) {
state double lastMismatchTime = now();
loop {
@ -145,9 +145,17 @@ class ReadFromLocalConfigEnvironment {
}
ACTOR static Future<Void> setup(ReadFromLocalConfigEnvironment* self) {
wait(self->localConfiguration.initialize());
if (self->cbfi) {
self->consumer = self->localConfiguration.consume(self->cbfi);
wait(self->localConfiguration->initialize());
if (self->cbi) {
// LocalConfiguration runs in a loop waiting for messages from the
// broadcaster. These unit tests use the same
// ConfigBroadcastInterface across restarts, so when "killing" the
// old LocalConfiguration, it's necessary to make sure it is
// completely stopped before starting the second config. This
// prevents two actors trying to listen for the same message on the
// same interface, causing lots of issues!
self->consumer.cancel();
self->consumer = self->localConfiguration->consume(self->cbi->get());
}
return Void();
}
@ -156,37 +164,43 @@ public:
ReadFromLocalConfigEnvironment(std::string const& dataDir,
std::string const& configPath,
std::map<std::string, std::string> const& manualKnobOverrides)
: dataDir(dataDir), localConfiguration(dataDir, configPath, manualKnobOverrides, IsTest::True),
: dataDir(dataDir),
localConfiguration(makeReference<LocalConfiguration>(dataDir, configPath, manualKnobOverrides, IsTest::True)),
consumer(Never()) {}
Future<Void> setup() { return setup(this); }
Future<Void> restartLocalConfig(std::string const& newConfigPath) {
localConfiguration = LocalConfiguration(dataDir, newConfigPath, {}, IsTest::True);
std::map<std::string, std::string> manualKnobOverrides = {};
localConfiguration =
makeReference<LocalConfiguration>(dataDir, newConfigPath, manualKnobOverrides, IsTest::True);
return setup();
}
void connectToBroadcaster(Reference<IAsyncListener<ConfigBroadcastFollowerInterface> const> const& cbfi) {
ASSERT(!this->cbfi);
this->cbfi = cbfi;
consumer = localConfiguration.consume(cbfi);
void connectToBroadcaster(Reference<AsyncVar<ConfigBroadcastInterface> const> const& cbi) {
this->cbi = cbi;
consumer = localConfiguration->consume(cbi->get());
}
void checkImmediate(Optional<int64_t> expected) const {
if (expected.present()) {
ASSERT_EQ(localConfiguration.getTestKnobs().TEST_LONG, expected.get());
ASSERT_EQ(localConfiguration->getTestKnobs().TEST_LONG, expected.get());
} else {
ASSERT_EQ(localConfiguration.getTestKnobs().TEST_LONG, 0);
ASSERT_EQ(localConfiguration->getTestKnobs().TEST_LONG, 0);
}
}
Future<Void> checkEventually(Optional<int64_t> expected) const {
return checkEventually(&localConfiguration, expected);
return checkEventually(localConfiguration, expected);
}
LocalConfiguration& getMutableLocalConfiguration() { return localConfiguration; }
LocalConfiguration& getMutableLocalConfiguration() { return *localConfiguration; }
Future<Void> getError() const { return consumer; }
Version lastSeenVersion() { return localConfiguration->lastSeenVersion(); }
ConfigClassSet configClassSet() { return localConfiguration->configClassSet(); }
};
class LocalConfigEnvironment {
@ -204,7 +218,7 @@ public:
std::string const& configPath,
std::map<std::string, std::string> const& manualKnobOverrides = {})
: readFrom(dataDir, configPath, manualKnobOverrides) {}
Future<Void> setup() { return readFrom.setup(); }
Future<Void> setup(ConfigClassSet const& configClassSet) { return readFrom.setup(); }
Future<Void> restartLocalConfig(std::string const& newConfigPath) {
return readFrom.restartLocalConfig(newConfigPath);
}
@ -219,15 +233,16 @@ public:
class BroadcasterToLocalConfigEnvironment {
ReadFromLocalConfigEnvironment readFrom;
Reference<AsyncVar<ConfigBroadcastFollowerInterface>> cbfi;
Reference<AsyncVar<ConfigBroadcastInterface>> cbi;
ConfigBroadcaster broadcaster;
Version lastWrittenVersion{ 0 };
Future<Void> broadcastServer;
ACTOR static Future<Void> setup(BroadcasterToLocalConfigEnvironment* self) {
ACTOR static Future<Void> setup(BroadcasterToLocalConfigEnvironment* self, ConfigClassSet configClassSet) {
wait(self->readFrom.setup());
self->readFrom.connectToBroadcaster(IAsyncListener<ConfigBroadcastFollowerInterface>::create(self->cbfi));
self->broadcastServer = self->broadcaster.serve(self->cbfi->get());
self->cbi = makeReference<AsyncVar<ConfigBroadcastInterface>>();
self->readFrom.connectToBroadcaster(self->cbi);
self->broadcastServer = self->broadcaster.registerWorker(0, configClassSet, Never(), self->cbi->get());
return Void();
}
@ -239,10 +254,10 @@ class BroadcasterToLocalConfigEnvironment {
public:
BroadcasterToLocalConfigEnvironment(std::string const& dataDir, std::string const& configPath)
: readFrom(dataDir, configPath, {}), cbfi(makeReference<AsyncVar<ConfigBroadcastFollowerInterface>>()),
: readFrom(dataDir, configPath, {}), cbi(makeReference<AsyncVar<ConfigBroadcastInterface>>()),
broadcaster(ConfigFollowerInterface{}) {}
Future<Void> setup() { return setup(this); }
Future<Void> setup(ConfigClassSet const& configClassSet) { return setup(this, configClassSet); }
void set(Optional<KeyRef> configClass, int64_t value) {
auto knobValue = KnobValueRef::create(value);
@ -255,8 +270,10 @@ public:
void changeBroadcaster() {
broadcastServer.cancel();
cbfi->set(ConfigBroadcastFollowerInterface{});
broadcastServer = broadcaster.serve(cbfi->get());
cbi->set(ConfigBroadcastInterface{});
readFrom.connectToBroadcaster(cbi);
broadcastServer =
broadcaster.registerWorker(readFrom.lastSeenVersion(), readFrom.configClassSet(), Never(), cbi->get());
}
Future<Void> restartLocalConfig(std::string const& newConfigPath) {
@ -265,7 +282,7 @@ public:
void compact() { broadcaster.compact(lastWrittenVersion); }
Future<Void> getError() const { return readFrom.getError() || broadcastServer; }
Future<Void> getError() const { return readFrom.getError() || broadcaster.getError(); }
};
class TransactionEnvironment {
@ -356,31 +373,33 @@ public:
class TransactionToLocalConfigEnvironment {
WriteToTransactionEnvironment writeTo;
ReadFromLocalConfigEnvironment readFrom;
Reference<AsyncVar<ConfigBroadcastFollowerInterface>> cbfi;
Reference<AsyncVar<ConfigBroadcastInterface>> cbi;
ConfigBroadcaster broadcaster;
Future<Void> broadcastServer;
ACTOR static Future<Void> setup(TransactionToLocalConfigEnvironment* self) {
ACTOR static Future<Void> setup(TransactionToLocalConfigEnvironment* self, ConfigClassSet configClassSet) {
wait(self->readFrom.setup());
self->readFrom.connectToBroadcaster(IAsyncListener<ConfigBroadcastFollowerInterface>::create(self->cbfi));
self->broadcastServer = self->broadcaster.serve(self->cbfi->get());
self->cbi = makeReference<AsyncVar<ConfigBroadcastInterface>>();
self->readFrom.connectToBroadcaster(self->cbi);
self->broadcastServer = self->broadcaster.registerWorker(0, configClassSet, Never(), self->cbi->get());
return Void();
}
public:
TransactionToLocalConfigEnvironment(std::string const& dataDir, std::string const& configPath)
: writeTo(dataDir), readFrom(dataDir, configPath, {}),
cbfi(makeReference<AsyncVar<ConfigBroadcastFollowerInterface>>()), broadcaster(writeTo.getFollowerInterface()) {
}
: writeTo(dataDir), readFrom(dataDir, configPath, {}), cbi(makeReference<AsyncVar<ConfigBroadcastInterface>>()),
broadcaster(writeTo.getFollowerInterface()) {}
Future<Void> setup() { return setup(this); }
Future<Void> setup(ConfigClassSet const& configClassSet) { return setup(this, configClassSet); }
void restartNode() { writeTo.restartNode(); }
void changeBroadcaster() {
broadcastServer.cancel();
cbfi->set(ConfigBroadcastFollowerInterface{});
broadcastServer = broadcaster.serve(cbfi->get());
cbi->set(ConfigBroadcastInterface{});
readFrom.connectToBroadcaster(cbi);
broadcastServer =
broadcaster.registerWorker(readFrom.lastSeenVersion(), readFrom.configClassSet(), Never(), cbi->get());
}
Future<Void> restartLocalConfig(std::string const& newConfigPath) {
@ -395,7 +414,7 @@ public:
}
Future<Void> clear(Optional<KeyRef> configClass) { return writeTo.clear(configClass); }
Future<Void> check(Optional<int64_t> value) const { return readFrom.checkEventually(value); }
Future<Void> getError() const { return writeTo.getError() || readFrom.getError() || broadcastServer; }
Future<Void> getError() const { return writeTo.getError() || readFrom.getError() || broadcaster.getError(); }
};
// These functions give a common interface to all environments, to improve code reuse
@ -438,7 +457,7 @@ Future<Void> compact(BroadcasterToLocalConfigEnvironment& env) {
ACTOR template <class Env>
Future<Void> testRestartLocalConfig(UnitTestParameters params) {
state Env env(params.getDataDir(), "class-A");
wait(env.setup());
wait(env.setup(ConfigClassSet({ "class-A"_sr })));
wait(set(env, "class-A"_sr, int64_t{ 1 }));
wait(check(env, int64_t{ 1 }));
wait(env.restartLocalConfig("class-A"));
@ -451,7 +470,7 @@ Future<Void> testRestartLocalConfig(UnitTestParameters params) {
ACTOR template <class Env>
Future<Void> testRestartLocalConfigAndChangeClass(UnitTestParameters params) {
state Env env(params.getDataDir(), "class-A");
wait(env.setup());
wait(env.setup(ConfigClassSet({ "class-A"_sr, "class-B"_sr })));
wait(set(env, "class-A"_sr, int64_t{ 1 }));
wait(check(env, int64_t{ 1 }));
wait(env.restartLocalConfig("class-B"));
@ -461,10 +480,30 @@ Future<Void> testRestartLocalConfigAndChangeClass(UnitTestParameters params) {
return Void();
}
ACTOR template <class Env>
Future<Void> testNewLocalConfigAfterCompaction(UnitTestParameters params) {
state Env env(params.getDataDir(), "class-A");
wait(env.setup(ConfigClassSet({ "class-A"_sr })));
wait(set(env, "class-A"_sr, int64_t{ 1 }));
wait(check(env, int64_t{ 1 }));
wait(compact(env));
// Erase the data dir to simulate a new worker joining the system after
// compaction.
platform::eraseDirectoryRecursive(params.getDataDir());
platform::createDirectory(params.getDataDir());
wait(env.restartLocalConfig("class-A"));
// Reregister worker with broadcaster.
env.changeBroadcaster();
wait(check(env, int64_t{ 1 }));
wait(set(env, "class-A"_sr, 2));
wait(check(env, int64_t{ 2 }));
return Void();
}
ACTOR template <class Env>
Future<Void> testSet(UnitTestParameters params) {
state Env env(params.getDataDir(), "class-A");
wait(env.setup());
wait(env.setup(ConfigClassSet({ "class-A"_sr })));
wait(set(env, "class-A"_sr, int64_t{ 1 }));
wait(check(env, int64_t{ 1 }));
return Void();
@ -473,7 +512,7 @@ Future<Void> testSet(UnitTestParameters params) {
ACTOR template <class Env>
Future<Void> testClear(UnitTestParameters params) {
state Env env(params.getDataDir(), "class-A");
wait(env.setup());
wait(env.setup(ConfigClassSet({ "class-A"_sr })));
wait(set(env, "class-A"_sr, int64_t{ 1 }));
wait(clear(env, "class-A"_sr));
wait(check(env, Optional<int64_t>{}));
@ -483,7 +522,7 @@ Future<Void> testClear(UnitTestParameters params) {
ACTOR template <class Env>
Future<Void> testGlobalSet(UnitTestParameters params) {
state Env env(params.getDataDir(), "class-A");
wait(env.setup());
wait(env.setup(ConfigClassSet({ "class-A"_sr })));
wait(set(env, Optional<KeyRef>{}, int64_t{ 1 }));
wait(check(env, int64_t{ 1 }));
wait(set(env, "class-A"_sr, int64_t{ 10 }));
@ -494,7 +533,7 @@ Future<Void> testGlobalSet(UnitTestParameters params) {
ACTOR template <class Env>
Future<Void> testIgnore(UnitTestParameters params) {
state Env env(params.getDataDir(), "class-A");
wait(env.setup());
wait(env.setup(ConfigClassSet({ "class-A"_sr, "class-B"_sr })));
wait(set(env, "class-B"_sr, int64_t{ 1 }));
choose {
when(wait(delay(5))) {}
@ -506,7 +545,7 @@ Future<Void> testIgnore(UnitTestParameters params) {
ACTOR template <class Env>
Future<Void> testCompact(UnitTestParameters params) {
state Env env(params.getDataDir(), "class-A");
wait(env.setup());
wait(env.setup(ConfigClassSet({ "class-A"_sr })));
wait(set(env, "class-A"_sr, int64_t{ 1 }));
wait(compact(env));
wait(check(env, 1));
@ -518,7 +557,7 @@ Future<Void> testCompact(UnitTestParameters params) {
ACTOR template <class Env>
Future<Void> testChangeBroadcaster(UnitTestParameters params) {
state Env env(params.getDataDir(), "class-A");
wait(env.setup());
wait(env.setup(ConfigClassSet({ "class-A"_sr })));
wait(set(env, "class-A"_sr, int64_t{ 1 }));
wait(check(env, int64_t{ 1 }));
env.changeBroadcaster();
@ -594,7 +633,7 @@ TEST_CASE("/fdbserver/ConfigDB/LocalConfiguration/GlobalSet") {
TEST_CASE("/fdbserver/ConfigDB/LocalConfiguration/ConflictingOverrides") {
state LocalConfigEnvironment env(params.getDataDir(), "class-A/class-B", {});
wait(env.setup());
wait(env.setup(ConfigClassSet({ "class-A"_sr, "class-B"_sr })));
wait(set(env, "class-A"_sr, int64_t{ 1 }));
wait(set(env, "class-B"_sr, int64_t{ 10 }));
env.check(10);
@ -603,7 +642,7 @@ TEST_CASE("/fdbserver/ConfigDB/LocalConfiguration/ConflictingOverrides") {
TEST_CASE("/fdbserver/ConfigDB/LocalConfiguration/Manual") {
state LocalConfigEnvironment env(params.getDataDir(), "class-A", { { "test_long", "1000" } });
wait(env.setup());
wait(env.setup(ConfigClassSet({ "class-A"_sr })));
wait(set(env, "class-A"_sr, int64_t{ 1 }));
env.check(1000);
return Void();
@ -649,6 +688,11 @@ TEST_CASE("/fdbserver/ConfigDB/BroadcasterToLocalConfig/Compact") {
return Void();
}
TEST_CASE("/fdbserver/ConfigDB/BroadcasterToLocalConfig/RestartLocalConfigurationAfterCompaction") {
wait(testNewLocalConfigAfterCompaction<BroadcasterToLocalConfigEnvironment>(params));
return Void();
}
TEST_CASE("/fdbserver/ConfigDB/TransactionToLocalConfig/Set") {
wait(testSet<TransactionToLocalConfigEnvironment>(params));
return Void();
@ -666,7 +710,7 @@ TEST_CASE("/fdbserver/ConfigDB/TransactionToLocalConfig/GlobalSet") {
TEST_CASE("/fdbserver/ConfigDB/TransactionToLocalConfig/RestartNode") {
state TransactionToLocalConfigEnvironment env(params.getDataDir(), "class-A");
wait(env.setup());
wait(env.setup(ConfigClassSet({ "class-A"_sr })));
wait(set(env, "class-A"_sr, int64_t{ 1 }));
env.restartNode();
wait(check(env, int64_t{ 1 }));
@ -688,6 +732,11 @@ TEST_CASE("/fdbserver/ConfigDB/TransactionToLocalConfig/CompactNode") {
return Void();
}
TEST_CASE("/fdbserver/ConfigDB/TransactionToLocalConfig/RestartLocalConfigurationAfterCompaction") {
wait(testNewLocalConfigAfterCompaction<TransactionToLocalConfigEnvironment>(params));
return Void();
}
TEST_CASE("/fdbserver/ConfigDB/Transaction/Set") {
state TransactionEnvironment env(params.getDataDir());
wait(env.setup());
@ -763,3 +812,5 @@ TEST_CASE("/fdbserver/ConfigDB/Transaction/BadRangeRead") {
}
return Void();
}
// TODO: Test worker failure detection on ConfigBroadcaster

View File

@ -111,7 +111,7 @@ class ConfigNodeImpl {
Counter setMutations;
Counter clearMutations;
Counter getValueRequests;
Counter newVersionRequests;
Counter getGenerationRequests;
Future<Void> logger;
ACTOR static Future<ConfigGeneration> getGeneration(ConfigNodeImpl* self) {
@ -188,7 +188,7 @@ class ConfigNodeImpl {
wait(getMutations(self, req.lastSeenVersion + 1, committedVersion));
state Standalone<VectorRef<VersionedConfigCommitAnnotationRef>> versionedAnnotations =
wait(getAnnotations(self, req.lastSeenVersion + 1, committedVersion));
TraceEvent(SevDebug, "ConfigDatabaseNodeSendingChanges", self->id)
TraceEvent(SevDebug, "ConfigNodeSendingChanges", self->id)
.detail("ReqLastSeenVersion", req.lastSeenVersion)
.detail("CommittedVersion", committedVersion)
.detail("NumMutations", versionedMutations.size())
@ -203,6 +203,10 @@ class ConfigNodeImpl {
ACTOR static Future<Void> getNewGeneration(ConfigNodeImpl* self, ConfigTransactionGetGenerationRequest req) {
state ConfigGeneration generation = wait(getGeneration(self));
++generation.liveVersion;
if (req.lastSeenLiveVersion.present()) {
TEST(req.lastSeenLiveVersion.get() >= generation.liveVersion); // Node is lagging behind some other node
generation.liveVersion = std::max(generation.liveVersion, req.lastSeenLiveVersion.get() + 1);
}
self->kvStore->set(KeyValueRef(currentGenerationKey, BinaryWriter::toValue(generation, IncludeVersion())));
wait(self->kvStore->commit());
req.reply.send(ConfigTransactionGetGenerationReply{ generation });
@ -347,7 +351,7 @@ class ConfigNodeImpl {
loop {
choose {
when(ConfigTransactionGetGenerationRequest req = waitNext(cti->getGeneration.getFuture())) {
++self->newVersionRequests;
++self->getGenerationRequests;
wait(getNewGeneration(self, req));
}
when(ConfigTransactionGetRequest req = waitNext(cti->get.getFuture())) {
@ -380,7 +384,7 @@ class ConfigNodeImpl {
wait(store(reply.snapshotVersion, getLastCompactedVersion(self)));
wait(store(reply.changes, getMutations(self, reply.snapshotVersion + 1, req.mostRecentVersion)));
wait(store(reply.annotations, getAnnotations(self, reply.snapshotVersion + 1, req.mostRecentVersion)));
TraceEvent(SevDebug, "ConfigDatabaseNodeGettingSnapshot", self->id)
TraceEvent(SevDebug, "ConfigNodeGettingSnapshot", self->id)
.detail("SnapshotVersion", reply.snapshotVersion)
.detail("SnapshotSize", reply.snapshot.size())
.detail("ChangesSize", reply.changes.size())
@ -394,7 +398,7 @@ class ConfigNodeImpl {
// However, commit annotations for compacted mutations are lost
ACTOR static Future<Void> compact(ConfigNodeImpl* self, ConfigFollowerCompactRequest req) {
state Version lastCompactedVersion = wait(getLastCompactedVersion(self));
TraceEvent(SevDebug, "ConfigDatabaseNodeCompacting", self->id)
TraceEvent(SevDebug, "ConfigNodeCompacting", self->id)
.detail("Version", req.version)
.detail("LastCompacted", lastCompactedVersion);
if (req.version <= lastCompactedVersion) {
@ -413,7 +417,7 @@ class ConfigNodeImpl {
if (version > req.version) {
break;
} else {
TraceEvent(SevDebug, "ConfigDatabaseNodeCompactionApplyingMutation", self->id)
TraceEvent(SevDebug, "ConfigNodeCompactionApplyingMutation", self->id)
.detail("IsSet", mutation.isSet())
.detail("MutationVersion", version)
.detail("LastCompactedVersion", lastCompactedVersion)
@ -467,14 +471,13 @@ class ConfigNodeImpl {
public:
ConfigNodeImpl(std::string const& folder)
: id(deterministicRandom()->randomUniqueID()), kvStore(folder, id, "globalconf-"), cc("ConfigDatabaseNode"),
: id(deterministicRandom()->randomUniqueID()), kvStore(folder, id, "globalconf-"), cc("ConfigNode"),
compactRequests("CompactRequests", cc), successfulChangeRequests("SuccessfulChangeRequests", cc),
failedChangeRequests("FailedChangeRequests", cc), snapshotRequests("SnapshotRequests", cc),
getCommittedVersionRequests("GetCommittedVersionRequests", cc), successfulCommits("SuccessfulCommits", cc),
failedCommits("FailedCommits", cc), setMutations("SetMutations", cc), clearMutations("ClearMutations", cc),
getValueRequests("GetValueRequests", cc), newVersionRequests("NewVersionRequests", cc) {
logger = traceCounters(
"ConfigDatabaseNodeMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigDatabaseNode");
getValueRequests("GetValueRequests", cc), getGenerationRequests("GetGenerationRequests", cc) {
logger = traceCounters("ConfigNodeMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigNode");
TraceEvent(SevDebug, "StartingConfigNode", id).detail("KVStoreAlreadyExists", kvStore.exists());
}
@ -483,14 +486,14 @@ public:
Future<Void> serve(ConfigFollowerInterface const& cfi) { return serve(this, &cfi); }
};
ConfigNode::ConfigNode(std::string const& folder) : _impl(std::make_unique<ConfigNodeImpl>(folder)) {}
ConfigNode::ConfigNode(std::string const& folder) : impl(PImpl<ConfigNodeImpl>::create(folder)) {}
ConfigNode::~ConfigNode() = default;
Future<Void> ConfigNode::serve(ConfigTransactionInterface const& cti) {
return impl().serve(cti);
return impl->serve(cti);
}
Future<Void> ConfigNode::serve(ConfigFollowerInterface const& cfi) {
return impl().serve(cfi);
return impl->serve(cfi);
}

View File

@ -23,12 +23,11 @@
#include <string>
#include "fdbclient/ConfigTransactionInterface.h"
#include "fdbclient/PImpl.h"
#include "fdbserver/ConfigFollowerInterface.h"
class ConfigNode : public ReferenceCounted<ConfigNode> {
std::unique_ptr<class ConfigNodeImpl> _impl;
ConfigNodeImpl const& impl() const { return *_impl; }
ConfigNodeImpl& impl() { return *_impl; }
PImpl<class ConfigNodeImpl> impl;
public:
ConfigNode(std::string const& folder);

View File

@ -720,7 +720,7 @@ ACTOR Future<Void> leaderServer(LeaderElectionRegInterface interf,
ACTOR Future<Void> coordinationServer(std::string dataFolder,
Reference<ClusterConnectionFile> ccf,
UseConfigDB useConfigDB) {
ConfigDBType configDBType) {
state UID myID = deterministicRandom()->randomUniqueID();
state LeaderElectionRegInterface myLeaderInterface(g_network);
state GenerationRegInterface myInterface(g_network);
@ -733,7 +733,7 @@ ACTOR Future<Void> coordinationServer(std::string dataFolder,
.detail("MyInterfaceAddr", myInterface.read.getEndpoint().getPrimaryAddress())
.detail("Folder", dataFolder);
if (useConfigDB != UseConfigDB::DISABLED) {
if (configDBType != ConfigDBType::DISABLED) {
configTransactionInterface.setupWellKnownEndpoints();
configFollowerInterface.setupWellKnownEndpoints();
configNode = makeReference<ConfigNode>(dataFolder);

View File

@ -234,6 +234,6 @@ public:
Future<Void> coordinationServer(std::string const& dataFolder,
Reference<ClusterConnectionFile> const& ccf,
UseConfigDB const& useConfigDB);
ConfigDBType const&);
#endif

View File

@ -2510,7 +2510,9 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
if (tss_info_by_pair.count(newServer.id())) {
tss_info_by_pair[newServer.id()]->onTSSPairRemoved = r->onRemoved;
if (tss_info_by_pair[newServer.id()]->wakeUpTracker.canBeSet()) {
tss_info_by_pair[newServer.id()]->wakeUpTracker.send(Void());
auto p = tss_info_by_pair[newServer.id()]->wakeUpTracker;
// This callback could delete tss_info_by_pair[newServer.id()], so use a copy
p.send(Void());
}
}
@ -5129,8 +5131,10 @@ ACTOR Future<Void> initializeStorage(DDTeamCollection* self,
if (doRecruit && newServer.isError()) {
TraceEvent(SevWarn, "DDRecruitmentError").error(newServer.getError());
if (!newServer.isError(error_code_recruitment_failed) &&
!newServer.isError(error_code_request_maybe_delivered))
!newServer.isError(error_code_request_maybe_delivered)) {
tssState->markComplete();
throw newServer.getError();
}
wait(delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskPriority::DataDistribution));
}
@ -5248,8 +5252,13 @@ ACTOR Future<Void> storageRecruiter(DDTeamCollection* self,
}
}
int newTssToRecruit = targetTSSInDC - self->tss_info_by_pair.size() - inProgressTSSCount;
// FIXME: Should log this if the recruit count stays the same but the other numbers update?
if (newTssToRecruit != tssToRecruit) {
TraceEvent("TSS_RecruitUpdated", self->distributorId).detail("Count", newTssToRecruit);
TraceEvent("TSS_RecruitUpdated", self->distributorId)
.detail("Desired", targetTSSInDC)
.detail("Existing", self->tss_info_by_pair.size())
.detail("InProgress", inProgressTSSCount)
.detail("NotStarted", newTssToRecruit);
tssToRecruit = newTssToRecruit;
// if we need to get rid of some TSS processes, signal to either cancel recruitment or kill existing TSS

View File

@ -19,6 +19,7 @@
*/
#include "fdbclient/Notified.h"
#include "fdbclient/TransactionLineage.h"
#include "fdbserver/LogSystem.h"
#include "fdbserver/LogSystemDiskQueueAdapter.h"
#include "fdbclient/CommitProxyInterface.h"
@ -397,8 +398,11 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
GrvProxyStats* stats,
GrvTransactionRateInfo* batchRateInfo,
TransactionTagMap<uint64_t>* transactionTagCounter) {
getCurrentLineage()->modify(&TransactionLineage::operation) = TransactionLineage::Operation::GetConsistentReadVersion;
loop choose {
when(GetReadVersionRequest req = waitNext(readVersionRequests)) {
// auto lineage = make_scoped_lineage(&TransactionLineage::txID, req.spanContext.first());
// getCurrentLineage()->modify(&TransactionLineage::txID) =
// WARNING: this code is run at a high priority, so it needs to do as little work as possible
bool canBeQueued = true;
if (stats->txnRequestIn.getValue() - stats->txnRequestOut.getValue() >
@ -720,6 +724,7 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
state Span span;
state int64_t midShardSize = SERVER_KNOBS->MIN_SHARD_BYTES;
getCurrentLineage()->modify(&TransactionLineage::operation) = TransactionLineage::Operation::GetConsistentReadVersion;
addActor.send(monitorDDMetricsChanges(&midShardSize, db));
addActor.send(getRate(proxy.id(),

View File

@ -88,6 +88,7 @@ rocksdb::Options getOptions() {
}
options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbOpts));
options.db_log_dir = SERVER_KNOBS->LOG_DIRECTORY;
return options;
}

View File

@ -20,7 +20,6 @@
#include "fdbclient/IKnobCollection.h"
#include "fdbrpc/Stats.h"
#include "fdbserver/ConfigBroadcastFollowerInterface.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/LocalConfiguration.h"
#include "fdbserver/OnDemandStore.h"
@ -166,7 +165,6 @@ class LocalConfigurationImpl {
}
CounterCollection cc;
Counter broadcasterChanges;
Counter snapshots;
Counter changeRequestsFetched;
Counter mutations;
@ -283,39 +281,27 @@ class LocalConfigurationImpl {
return Void();
}
ACTOR static Future<Void> consumeInternal(LocalConfigurationImpl* self,
ConfigBroadcastFollowerInterface broadcaster) {
ACTOR static Future<Void> consumeInternal(LocalConfigurationImpl* self, ConfigBroadcastInterface broadcaster) {
loop {
try {
state ConfigBroadcastFollowerGetChangesReply changesReply =
wait(broadcaster.getChanges.getReply(ConfigBroadcastFollowerGetChangesRequest{
self->lastSeenVersion, self->configKnobOverrides.getConfigClassSet() }));
TraceEvent(SevDebug, "LocalConfigGotChanges", self->id)
.detail("Size", changesReply.changes.size())
.detail("Version", changesReply.mostRecentVersion);
wait(self->addChanges(changesReply.changes, changesReply.mostRecentVersion));
} catch (Error& e) {
if (e.code() == error_code_version_already_compacted) {
state ConfigBroadcastFollowerGetSnapshotReply snapshotReply = wait(broadcaster.getSnapshot.getReply(
ConfigBroadcastFollowerGetSnapshotRequest{ self->configKnobOverrides.getConfigClassSet() }));
ASSERT_GT(snapshotReply.version, self->lastSeenVersion);
choose {
when(ConfigBroadcastSnapshotRequest snapshotReq = waitNext(broadcaster.snapshot.getFuture())) {
ASSERT_GT(snapshotReq.version, self->lastSeenVersion);
++self->snapshots;
wait(setSnapshot(self, std::move(snapshotReply.snapshot), snapshotReply.version));
} else {
throw e;
wait(setSnapshot(self, std::move(snapshotReq.snapshot), snapshotReq.version));
}
when(state ConfigBroadcastChangesRequest req = waitNext(broadcaster.changes.getFuture())) {
wait(self->addChanges(req.changes, req.mostRecentVersion));
req.reply.send(ConfigBroadcastChangesReply());
}
}
wait(yield()); // Necessary to not immediately trigger retry?
}
}
ACTOR static Future<Void> consume(LocalConfigurationImpl* self,
Reference<IAsyncListener<ConfigBroadcastFollowerInterface> const> broadcaster) {
ACTOR static Future<Void> consume(LocalConfigurationImpl* self, ConfigBroadcastInterface broadcaster) {
ASSERT(self->initFuture.isValid() && self->initFuture.isReady());
loop {
choose {
when(wait(brokenPromiseToNever(consumeInternal(self, broadcaster->get())))) { ASSERT(false); }
when(wait(broadcaster->onChange())) { ++self->broadcasterChanges; }
when(wait(consumeInternal(self, broadcaster))) { ASSERT(false); }
when(wait(self->kvStore->getError())) { ASSERT(false); }
}
}
@ -328,8 +314,7 @@ public:
IsTest isTest)
: id(deterministicRandom()->randomUniqueID()), kvStore(dataFolder, id, "localconf-"),
configKnobOverrides(configPath), manualKnobOverrides(manualKnobOverrides), cc("LocalConfiguration"),
broadcasterChanges("BroadcasterChanges", cc), snapshots("Snapshots", cc),
changeRequestsFetched("ChangeRequestsFetched", cc), mutations("Mutations", cc) {
snapshots("Snapshots", cc), changeRequestsFetched("ChangeRequestsFetched", cc), mutations("Mutations", cc) {
if (isTest) {
testKnobCollection =
IKnobCollection::create(IKnobCollection::Type::TEST,
@ -370,12 +355,16 @@ public:
return getKnobs().getTestKnobs();
}
Future<Void> consume(Reference<IAsyncListener<ConfigBroadcastFollowerInterface> const> const& broadcaster) {
return consume(this, broadcaster);
Future<Void> consume(ConfigBroadcastInterface const& broadcastInterface) {
return consume(this, broadcastInterface);
}
UID getID() const { return id; }
Version getLastSeenVersion() const { return lastSeenVersion; }
ConfigClassSet configClassSet() const { return configKnobOverrides.getConfigClassSet(); }
static void testManualKnobOverridesInvalidName() {
std::map<std::string, std::string> invalidOverrides;
invalidOverrides["knob_name_that_does_not_exist"] = "";
@ -423,46 +412,49 @@ LocalConfiguration::LocalConfiguration(std::string const& dataFolder,
std::string const& configPath,
std::map<std::string, std::string> const& manualKnobOverrides,
IsTest isTest)
: _impl(std::make_unique<LocalConfigurationImpl>(dataFolder, configPath, manualKnobOverrides, isTest)) {}
LocalConfiguration::LocalConfiguration(LocalConfiguration&&) = default;
LocalConfiguration& LocalConfiguration::operator=(LocalConfiguration&&) = default;
: impl(PImpl<LocalConfigurationImpl>::create(dataFolder, configPath, manualKnobOverrides, isTest)) {}
LocalConfiguration::~LocalConfiguration() = default;
Future<Void> LocalConfiguration::initialize() {
return impl().initialize();
return impl->initialize();
}
FlowKnobs const& LocalConfiguration::getFlowKnobs() const {
return impl().getFlowKnobs();
return impl->getFlowKnobs();
}
ClientKnobs const& LocalConfiguration::getClientKnobs() const {
return impl().getClientKnobs();
return impl->getClientKnobs();
}
ServerKnobs const& LocalConfiguration::getServerKnobs() const {
return impl().getServerKnobs();
return impl->getServerKnobs();
}
TestKnobs const& LocalConfiguration::getTestKnobs() const {
return impl().getTestKnobs();
return impl->getTestKnobs();
}
Future<Void> LocalConfiguration::consume(
Reference<IAsyncListener<ConfigBroadcastFollowerInterface> const> const& broadcaster) {
return impl().consume(broadcaster);
Future<Void> LocalConfiguration::consume(ConfigBroadcastInterface const& broadcaster) {
return impl->consume(broadcaster);
}
Future<Void> LocalConfiguration::addChanges(Standalone<VectorRef<VersionedConfigMutationRef>> changes,
Version mostRecentVersion) {
return impl().addChanges(changes, mostRecentVersion);
return impl->addChanges(changes, mostRecentVersion);
}
UID LocalConfiguration::getID() const {
return impl().getID();
return impl->getID();
}
Version LocalConfiguration::lastSeenVersion() const {
return impl->getLastSeenVersion();
}
ConfigClassSet LocalConfiguration::configClassSet() const {
return impl->configClassSet();
}
TEST_CASE("/fdbserver/ConfigDB/ManualKnobOverrides/InvalidName") {

View File

@ -24,7 +24,8 @@
#include "fdbclient/ConfigKnobs.h"
#include "fdbclient/IKnobCollection.h"
#include "fdbserver/ConfigBroadcastFollowerInterface.h"
#include "fdbclient/PImpl.h"
#include "fdbserver/ConfigBroadcastInterface.h"
#include "fdbserver/Knobs.h"
#include "flow/Arena.h"
#include "flow/Knobs.h"
@ -42,26 +43,24 @@ FDB_DECLARE_BOOLEAN_PARAM(IsTest);
* - Register with the broadcaster to receive new updates for the relevant configuration classes
* - Persist these updates when received, and restart if necessary
*/
class LocalConfiguration {
std::unique_ptr<class LocalConfigurationImpl> _impl;
LocalConfigurationImpl& impl() { return *_impl; }
LocalConfigurationImpl const& impl() const { return *_impl; }
class LocalConfiguration : public ReferenceCounted<LocalConfiguration> {
PImpl<class LocalConfigurationImpl> impl;
public:
LocalConfiguration(std::string const& dataFolder,
std::string const& configPath,
std::map<std::string, std::string> const& manualKnobOverrides,
IsTest = IsTest::False);
LocalConfiguration(LocalConfiguration&&);
LocalConfiguration& operator=(LocalConfiguration&&);
~LocalConfiguration();
Future<Void> initialize();
FlowKnobs const& getFlowKnobs() const;
ClientKnobs const& getClientKnobs() const;
ServerKnobs const& getServerKnobs() const;
TestKnobs const& getTestKnobs() const;
Future<Void> consume(Reference<IAsyncListener<ConfigBroadcastFollowerInterface> const> const& broadcaster);
Future<Void> consume(ConfigBroadcastInterface const& broadcastInterface);
UID getID() const;
Version lastSeenVersion() const;
ConfigClassSet configClassSet() const;
public: // Testing
Future<Void> addChanges(Standalone<VectorRef<VersionedConfigMutationRef>> versionedMutations,

View File

@ -117,6 +117,7 @@ struct LogRouterData {
getMoreBlockedCount; // Increase by 1 if data is not available when LR tries to pull data from satellite tLog.
Future<Void> logger;
Reference<EventCacheHolder> eventCacheHolder;
int activePeekStreams = 0;
std::vector<Reference<TagData>> tag_data; // we only store data for the remote tag locality
@ -193,6 +194,7 @@ struct LogRouterData {
return int64_t(1000 * val);
});
specialCounter(cc, "Generation", [this]() { return this->generation; });
specialCounter(cc, "ActivePeekStreams", [this]() { return this->activePeekStreams; });
logger = traceCounters("LogRouterMetrics",
dbgid,
SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
@ -404,18 +406,15 @@ std::deque<std::pair<Version, LengthPrefixedStringRef>>& get_version_messages(Lo
return tagData->version_messages;
};
void peekMessagesFromMemory(LogRouterData* self,
TLogPeekRequest const& req,
BinaryWriter& messages,
Version& endVersion) {
void peekMessagesFromMemory(LogRouterData* self, Tag tag, Version begin, BinaryWriter& messages, Version& endVersion) {
ASSERT(!messages.getLength());
auto& deque = get_version_messages(self, req.tag);
auto& deque = get_version_messages(self, tag);
//TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 0 : map1.begin()->value.size());
auto it = std::lower_bound(deque.begin(),
deque.end(),
std::make_pair(req.begin, LengthPrefixedStringRef()),
std::make_pair(begin, LengthPrefixedStringRef()),
CompareFirst<std::pair<Version, LengthPrefixedStringRef>>());
Version currentVersion = -1;
@ -442,22 +441,30 @@ Version poppedVersion(LogRouterData* self, Tag tag) {
return tagData->popped;
}
ACTOR Future<Void> logRouterPeekMessages(LogRouterData* self, TLogPeekRequest req) {
// Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request
ACTOR template <typename PromiseType>
Future<Void> logRouterPeekMessages(PromiseType replyPromise,
LogRouterData* self,
Version reqBegin,
Tag reqTag,
bool reqReturnIfBlocked = false,
bool reqOnlySpilled = false,
Optional<std::pair<UID, int>> reqSequence = Optional<std::pair<UID, int>>()) {
state BinaryWriter messages(Unversioned());
state int sequence = -1;
state UID peekId;
if (req.sequence.present()) {
if (reqSequence.present()) {
try {
peekId = req.sequence.get().first;
sequence = req.sequence.get().second;
peekId = reqSequence.get().first;
sequence = reqSequence.get().second;
if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS &&
self->peekTracker.find(peekId) == self->peekTracker.end()) {
throw operation_obsolete();
}
auto& trackerData = self->peekTracker[peekId];
if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled));
trackerData.sequence_version[0].send(std::make_pair(reqBegin, reqOnlySpilled));
}
auto seqBegin = trackerData.sequence_version.begin();
// The peek cursor and this comparison need to agree about the maximum number of in-flight requests.
@ -476,12 +483,12 @@ ACTOR Future<Void> logRouterPeekMessages(LogRouterData* self, TLogPeekRequest re
trackerData.lastUpdate = now();
std::pair<Version, bool> prevPeekData = wait(trackerData.sequence_version[sequence].getFuture());
req.begin = prevPeekData.first;
req.onlySpilled = prevPeekData.second;
reqBegin = prevPeekData.first;
reqOnlySpilled = prevPeekData.second;
wait(yield());
} catch (Error& e) {
if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
replyPromise.sendError(e);
return Void();
} else {
throw;
@ -489,62 +496,62 @@ ACTOR Future<Void> logRouterPeekMessages(LogRouterData* self, TLogPeekRequest re
}
}
//TraceEvent("LogRouterPeek1", self->dbgid).detail("From", req.reply.getEndpoint().getPrimaryAddress()).detail("Ver", self->version.get()).detail("Begin", req.begin);
if (req.returnIfBlocked && self->version.get() < req.begin) {
//TraceEvent("LogRouterPeek1", self->dbgid).detail("From", replyPromise.getEndpoint().getPrimaryAddress()).detail("Ver", self->version.get()).detail("Begin", reqBegin);
if (reqReturnIfBlocked && self->version.get() < reqBegin) {
//TraceEvent("LogRouterPeek2", self->dbgid);
req.reply.sendError(end_of_stream());
if (req.sequence.present()) {
replyPromise.sendError(end_of_stream());
if (reqSequence.present()) {
auto& trackerData = self->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence + 1];
if (!sequenceData.isSet()) {
sequenceData.send(std::make_pair(req.begin, req.onlySpilled));
sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled));
}
}
return Void();
}
if (self->version.get() < req.begin) {
wait(self->version.whenAtLeast(req.begin));
if (self->version.get() < reqBegin) {
wait(self->version.whenAtLeast(reqBegin));
wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
}
Version poppedVer = poppedVersion(self, req.tag);
Version poppedVer = poppedVersion(self, reqTag);
if (poppedVer > req.begin || req.begin < self->startVersion) {
if (poppedVer > reqBegin || reqBegin < self->startVersion) {
// This should only happen if a packet is sent multiple times and the reply is not needed.
// Since we are using popped differently, do not send a reply.
TraceEvent(SevWarnAlways, "LogRouterPeekPopped", self->dbgid)
.detail("Begin", req.begin)
.detail("Begin", reqBegin)
.detail("Popped", poppedVer)
.detail("Start", self->startVersion);
req.reply.send(Never());
if (req.sequence.present()) {
replyPromise.send(Never());
if (reqSequence.present()) {
auto& trackerData = self->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence + 1];
if (!sequenceData.isSet()) {
sequenceData.send(std::make_pair(req.begin, req.onlySpilled));
sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled));
}
}
return Void();
}
Version endVersion = self->version.get() + 1;
peekMessagesFromMemory(self, req, messages, endVersion);
peekMessagesFromMemory(self, reqTag, reqBegin, messages, endVersion);
TLogPeekReply reply;
reply.maxKnownVersion = self->version.get();
reply.minKnownCommittedVersion = self->poppedVersion;
reply.messages = messages.toValue();
reply.messages = StringRef(reply.arena, messages.toValue());
reply.popped = self->minPopped.get() >= self->startVersion ? self->minPopped.get() : 0;
reply.end = endVersion;
reply.onlySpilled = false;
if (req.sequence.present()) {
if (reqSequence.present()) {
auto& trackerData = self->peekTracker[peekId];
trackerData.lastUpdate = now();
auto& sequenceData = trackerData.sequence_version[sequence + 1];
if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(operation_obsolete());
replyPromise.sendError(operation_obsolete());
if (!sequenceData.isSet())
sequenceData.sendError(operation_obsolete());
return Void();
@ -552,20 +559,60 @@ ACTOR Future<Void> logRouterPeekMessages(LogRouterData* self, TLogPeekRequest re
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get().first != reply.end) {
TEST(true); // tlog peek second attempt ended at a different version
req.reply.sendError(operation_obsolete());
replyPromise.sendError(operation_obsolete());
return Void();
}
} else {
sequenceData.send(std::make_pair(reply.end, reply.onlySpilled));
}
reply.begin = req.begin;
reply.begin = reqBegin;
}
req.reply.send(reply);
replyPromise.send(reply);
//TraceEvent("LogRouterPeek4", self->dbgid);
return Void();
}
// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover
ACTOR Future<Void> logRouterPeekStream(LogRouterData* self, TLogPeekStreamRequest req) {
self->activePeekStreams++;
state Version begin = req.begin;
state bool onlySpilled = false;
req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes));
loop {
state TLogPeekStreamReply reply;
state Promise<TLogPeekReply> promise;
state Future<TLogPeekReply> future(promise.getFuture());
try {
wait(req.reply.onReady() && store(reply.rep, future) &&
logRouterPeekMessages(promise, self, begin, req.tag, req.returnIfBlocked, onlySpilled));
reply.rep.begin = begin;
req.reply.send(reply);
begin = reply.rep.end;
onlySpilled = reply.rep.onlySpilled;
if (reply.rep.end > self->version.get()) {
wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
} else {
wait(delay(0, g_network->getCurrentTask()));
}
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", self->dbgid)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
return Void();
} else {
throw;
}
}
}
}
ACTOR Future<Void> cleanupPeekTrackers(LogRouterData* self) {
loop {
double minTimeUntilExpiration = SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME;
@ -643,7 +690,13 @@ ACTOR Future<Void> logRouterCore(TLogInterface interf,
logRouterData.logSystem->set(ILogSystem::fromServerDBInfo(logRouterData.dbgid, db->get(), true));
}
when(TLogPeekRequest req = waitNext(interf.peekMessages.getFuture())) {
addActor.send(logRouterPeekMessages(&logRouterData, req));
addActor.send(logRouterPeekMessages(
req.reply, &logRouterData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence));
}
when(TLogPeekStreamRequest req = waitNext(interf.peekStreamMessages.getFuture())) {
TraceEvent(SevDebug, "LogRouterPeekStream", logRouterData.dbgid)
.detail("Token", interf.peekStreamMessages.getEndpoint().token);
addActor.send(logRouterPeekStream(&logRouterData, req));
}
when(TLogPopRequest req = waitNext(interf.popMessages.getFuture())) {
// Request from remote tLog to pop data from LR

View File

@ -427,7 +427,7 @@ struct ILogSystem {
TLogPeekReply results;
ArenaReader rd;
LogMessageVersion messageVersion, end;
LogMessageVersion messageVersion, end; // the version of current message; the intended end version of current cursor
Version poppedVersion;
TagsAndMessage messageAndTags;
bool hasMsg;
@ -437,9 +437,11 @@ struct ILogSystem {
bool onlySpilled;
bool parallelGetMore;
bool usePeekStream;
int sequence;
Deque<Future<TLogPeekReply>> futureResults;
Future<Void> interfaceChanged;
Optional<ReplyPromiseStream<TLogPeekStreamReply>> peekReplyStream;
double lastReset;
Future<Void> resetCheck;

View File

@ -25,6 +25,24 @@
#include "fdbrpc/ReplicationUtils.h"
#include "flow/actorcompiler.h" // has to be last include
// create a peek stream for cursor when it's possible
ACTOR Future<Void> tryEstablishPeekStream(ILogSystem::ServerPeekCursor* self) {
if (self->peekReplyStream.present())
return Void();
else if (!self->interf || !self->interf->get().present()) {
self->peekReplyStream.reset();
return Never();
}
wait(IFailureMonitor::failureMonitor().onStateEqual(self->interf->get().interf().peekStreamMessages.getEndpoint(),
FailureStatus(false)));
self->peekReplyStream = self->interf->get().interf().peekStreamMessages.getReplyStream(TLogPeekStreamRequest(
self->messageVersion.version, self->tag, self->returnIfBlocked, std::numeric_limits<int>::max()));
TraceEvent(SevDebug, "SPC_StreamCreated", self->randomID)
.detail("PeerAddr", self->interf->get().interf().peekStreamMessages.getEndpoint().getPrimaryAddress())
.detail("PeerToken", self->interf->get().interf().peekStreamMessages.getEndpoint().token);
return Void();
}
ILogSystem::ServerPeekCursor::ServerPeekCursor(Reference<AsyncVar<OptionalInterface<TLogInterface>>> const& interf,
Tag tag,
Version begin,
@ -33,11 +51,15 @@ ILogSystem::ServerPeekCursor::ServerPeekCursor(Reference<AsyncVar<OptionalInterf
bool parallelGetMore)
: interf(interf), tag(tag), rd(results.arena, results.messages, Unversioned()), messageVersion(begin), end(end),
poppedVersion(0), hasMsg(false), randomID(deterministicRandom()->randomUniqueID()),
returnIfBlocked(returnIfBlocked), onlySpilled(false), parallelGetMore(parallelGetMore), sequence(0), lastReset(0),
resetCheck(Void()), slowReplies(0), fastReplies(0), unknownReplies(0) {
returnIfBlocked(returnIfBlocked), onlySpilled(false), parallelGetMore(parallelGetMore),
usePeekStream(SERVER_KNOBS->PEEK_USING_STREAMING), sequence(0), lastReset(0), resetCheck(Void()), slowReplies(0),
fastReplies(0), unknownReplies(0) {
this->results.maxKnownVersion = 0;
this->results.minKnownCommittedVersion = 0;
//TraceEvent("SPC_Starting", randomID).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end).backtrace();
DisabledTraceEvent(SevDebug, "SPC_Starting", randomID)
.detail("Tag", tag.toString())
.detail("Begin", begin)
.detail("End", end);
}
ILogSystem::ServerPeekCursor::ServerPeekCursor(TLogPeekReply const& results,
@ -50,8 +72,8 @@ ILogSystem::ServerPeekCursor::ServerPeekCursor(TLogPeekReply const& results,
: tag(tag), results(results), rd(results.arena, results.messages, Unversioned()), messageVersion(messageVersion),
end(end), poppedVersion(poppedVersion), messageAndTags(message), hasMsg(hasMsg),
randomID(deterministicRandom()->randomUniqueID()), returnIfBlocked(false), onlySpilled(false),
parallelGetMore(false), sequence(0), lastReset(0), resetCheck(Void()), slowReplies(0), fastReplies(0),
unknownReplies(0) {
parallelGetMore(false), usePeekStream(false), sequence(0), lastReset(0), resetCheck(Void()), slowReplies(0),
fastReplies(0), unknownReplies(0) {
//TraceEvent("SPC_Clone", randomID);
this->results.maxKnownVersion = 0;
this->results.minKnownCommittedVersion = 0;
@ -111,8 +133,7 @@ void ILogSystem::ServerPeekCursor::nextMessage() {
}
messageAndTags.loadFromArena(&rd, &messageVersion.sub);
DEBUG_TAGS_AND_MESSAGE("ServerPeekCursor", messageVersion.version, messageAndTags.getRawMessage())
.detail("CursorID", this->randomID);
DEBUG_TAGS_AND_MESSAGE("ServerPeekCursor", messageVersion.version, messageAndTags.getRawMessage(), this->randomID);
// Rewind and consume the header so that reader() starts from the message.
rd.rewind();
rd.readBytes(messageAndTags.getHeaderSize());
@ -154,6 +175,20 @@ void ILogSystem::ServerPeekCursor::advanceTo(LogMessageVersion n) {
}
}
// This function is called after the cursor received one TLogPeekReply to update its members, which is the common logic
// in getMore helper functions.
void updateCursorWithReply(ILogSystem::ServerPeekCursor* self, const TLogPeekReply& res) {
self->results = res;
self->onlySpilled = res.onlySpilled;
if (res.popped.present())
self->poppedVersion = std::min(std::max(self->poppedVersion, res.popped.get()), self->end.version);
self->rd = ArenaReader(self->results.arena, self->results.messages, Unversioned());
LogMessageVersion skipSeq = self->messageVersion;
self->hasMsg = true;
self->nextMessage();
self->advanceTo(skipSeq);
}
ACTOR Future<Void> resetChecker(ILogSystem::ServerPeekCursor* self, NetworkAddress addr) {
self->slowReplies = 0;
self->unknownReplies = 0;
@ -209,11 +244,10 @@ ACTOR Future<TLogPeekReply> recordRequestMetrics(ILogSystem::ServerPeekCursor* s
}
ACTOR Future<Void> serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self, TaskPriority taskID) {
if (!self->interf || self->messageVersion >= self->end) {
if (!self->interf || self->isExhausted()) {
if (self->hasMessage())
return Void();
wait(Future<Void>(Never()));
throw internal_error();
return Never();
}
if (!self->interfaceChanged.isValid()) {
@ -254,16 +288,7 @@ ACTOR Future<Void> serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self,
}
expectedBegin = res.end;
self->futureResults.pop_front();
self->results = res;
self->onlySpilled = res.onlySpilled;
if (res.popped.present())
self->poppedVersion =
std::min(std::max(self->poppedVersion, res.popped.get()), self->end.version);
self->rd = ArenaReader(self->results.arena, self->results.messages, Unversioned());
LogMessageVersion skipSeq = self->messageVersion;
self->hasMsg = true;
self->nextMessage();
self->advanceTo(skipSeq);
updateCursorWithReply(self, res);
//TraceEvent("SPC_GetMoreB", self->randomID).detail("Has", self->hasMessage()).detail("End", res.end).detail("Popped", res.popped.present() ? res.popped.get() : 0);
return Void();
}
@ -297,10 +322,70 @@ ACTOR Future<Void> serverPeekParallelGetMore(ILogSystem::ServerPeekCursor* self,
}
}
ACTOR Future<Void> serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, TaskPriority taskID) {
if (!self->interf || self->isExhausted()) {
self->peekReplyStream.reset();
if (self->hasMessage())
return Void();
return Never();
}
loop {
try {
state Version expectedBegin = self->messageVersion.version;
state Future<TLogPeekReply> fPeekReply = self->peekReplyStream.present()
? map(waitAndForward(self->peekReplyStream.get().getFuture()),
[](const TLogPeekStreamReply& r) { return r.rep; })
: Never();
choose {
when(wait(self->peekReplyStream.present() ? Never() : tryEstablishPeekStream(self))) {}
when(wait(self->interf->onChange())) {
self->onlySpilled = false;
self->peekReplyStream.reset();
}
when(TLogPeekReply res = wait(
self->peekReplyStream.present()
? recordRequestMetrics(
self,
self->interf->get().interf().peekStreamMessages.getEndpoint().getPrimaryAddress(),
fPeekReply)
: Never())) {
if (res.begin.get() != expectedBegin) {
throw operation_obsolete();
}
updateCursorWithReply(self, res);
expectedBegin = res.end;
DisabledTraceEvent(SevDebug, "SPC_GetMoreB", self->randomID)
.detail("Has", self->hasMessage())
.detail("End", res.end)
.detail("Popped", res.popped.present() ? res.popped.get() : 0);
// NOTE: delay is necessary here since ReplyPromiseStream delivers reply on high priority. Here we
// change the priority to the intended one.
wait(delay(0, taskID));
return Void();
}
}
} catch (Error& e) {
DisabledTraceEvent(SevDebug, "SPC_GetMoreB_Error", self->randomID).error(e, true);
if (e.code() == error_code_connection_failed || e.code() == error_code_operation_obsolete) {
// NOTE: delay in order to avoid the endless retry loop block other tasks
self->peekReplyStream.reset();
wait(delay(0));
} else if (e.code() == error_code_end_of_stream) {
self->peekReplyStream.reset();
self->end.reset(self->messageVersion.version);
return Void();
} else {
throw;
}
}
}
}
ACTOR Future<Void> serverPeekGetMore(ILogSystem::ServerPeekCursor* self, TaskPriority taskID) {
if (!self->interf || self->messageVersion >= self->end) {
wait(Future<Void>(Never()));
throw internal_error();
if (!self->interf || self->isExhausted()) {
return Never();
}
try {
loop {
@ -314,16 +399,7 @@ ACTOR Future<Void> serverPeekGetMore(ILogSystem::ServerPeekCursor* self, TaskPri
self->onlySpilled),
taskID))
: Never())) {
self->results = res;
self->onlySpilled = res.onlySpilled;
if (res.popped.present())
self->poppedVersion =
std::min(std::max(self->poppedVersion, res.popped.get()), self->end.version);
self->rd = ArenaReader(self->results.arena, self->results.messages, Unversioned());
LogMessageVersion skipSeq = self->messageVersion;
self->hasMsg = true;
self->nextMessage();
self->advanceTo(skipSeq);
updateCursorWithReply(self, res);
//TraceEvent("SPC_GetMoreB", self->randomID).detail("Has", self->hasMessage()).detail("End", res.end).detail("Popped", res.popped.present() ? res.popped.get() : 0);
return Void();
}
@ -340,11 +416,17 @@ ACTOR Future<Void> serverPeekGetMore(ILogSystem::ServerPeekCursor* self, TaskPri
}
Future<Void> ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) {
//TraceEvent("SPC_GetMore", randomID).detail("HasMessage", hasMessage()).detail("More", !more.isValid() || more.isReady()).detail("MessageVersion", messageVersion.toString()).detail("End", end.toString());
// TraceEvent("SPC_GetMore", randomID)
// .detail("HasMessage", hasMessage())
// .detail("More", !more.isValid() || more.isReady())
// .detail("MessageVersion", messageVersion.toString())
// .detail("End", end.toString());
if (hasMessage() && !parallelGetMore)
return Void();
if (!more.isValid() || more.isReady()) {
if (parallelGetMore || onlySpilled || futureResults.size()) {
if (usePeekStream && (tag.locality >= 0 || tag.locality == tagLocalityLogRouter || tag.locality == tagLocalityRemoteLog)) {
more = serverPeekStreamGetMore(this, taskID);
} else if (parallelGetMore || onlySpilled || futureResults.size()) {
more = serverPeekParallelGetMore(this, taskID);
} else {
more = serverPeekGetMore(this, taskID);
@ -362,6 +444,12 @@ ACTOR Future<Void> serverPeekOnFailed(ILogSystem::ServerPeekCursor* self) {
: Never())) {
return Void();
}
when(wait(self->interf->get().present()
? IFailureMonitor::failureMonitor().onStateEqual(
self->interf->get().interf().peekStreamMessages.getEndpoint(), FailureStatus())
: Never())) {
return Void();
}
when(wait(self->interf->onChange())) {}
}
}
@ -374,9 +462,14 @@ Future<Void> ILogSystem::ServerPeekCursor::onFailed() {
bool ILogSystem::ServerPeekCursor::isActive() const {
if (!interf->get().present())
return false;
if (messageVersion >= end)
if (isExhausted())
return false;
return IFailureMonitor::failureMonitor().getState(interf->get().interf().peekMessages.getEndpoint()).isAvailable();
return IFailureMonitor::failureMonitor()
.getState(interf->get().interf().peekMessages.getEndpoint())
.isAvailable() &&
IFailureMonitor::failureMonitor()
.getState(interf->get().interf().peekStreamMessages.getEndpoint())
.isAvailable();
}
bool ILogSystem::ServerPeekCursor::isExhausted() const {

View File

@ -635,9 +635,8 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
state int retries = 0;
state FlowLock::Releaser releaser;
state std::vector<std::pair<UID, UID>> tssToKill;
state std::unordered_set<UID> tssToIgnore;
// try waiting for tss for a 2 loops, give up if they're stuck to not affect the rest of the cluster
// try waiting for tss for a 2 loops, give up if they're behind to not affect the rest of the cluster
state int waitForTSSCounter = 2;
ASSERT(!destinationTeam.empty());
@ -657,22 +656,6 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
// printf("finishMoveKeys( '%s'-'%s' )\n", begin.toString().c_str(), keys.end.toString().c_str());
loop {
try {
if (tssToKill.size()) {
TEST(true); // killing TSS because they were unavailable for movekeys
// Kill tss BEFORE committing main txn so that client requests don't make it to the tss when it
// has a different shard set than its pair use a different RYW transaction since i'm too lazy
// (and don't want to add bugs) by changing whole method to RYW. Also, using a different
// transaction makes it commit earlier which we may need to guarantee causality of tss getting
// removed before client sends a request to this key range on the new SS
wait(removeTSSPairsFromCluster(occ, tssToKill));
for (auto& tssPair : tssToKill) {
TraceEvent(SevWarnAlways, "TSS_KillMoveKeys").detail("TSSID", tssPair.second);
tssToIgnore.insert(tssPair.second);
}
tssToKill.clear();
}
tr.info.taskID = TaskPriority::MoveKeys;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
@ -815,8 +798,7 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
// Wait for a durable quorum of servers in destServers to have keys available (readWrite)
// They must also have at least the transaction read version so they can't "forget" the shard
// between
// now and when this transaction commits.
// between now and when this transaction commits.
state vector<Future<Void>> serverReady; // only for count below
state vector<Future<Void>> tssReady; // for waiting in parallel with tss
state vector<StorageServerInterface> tssReadyInterfs;
@ -877,8 +859,8 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
TaskPriority::MoveKeys));
// Check to see if we're waiting only on tss. If so, decrement the waiting counter.
// If the waiting counter is zero, kill the slow/non-responsive tss processes before finalizing the
// data move.
// If the waiting counter is zero, ignore the slow/non-responsive tss processes before finalizing
// the data move.
if (tssReady.size()) {
bool allSSDone = true;
for (auto& f : serverReady) {
@ -902,12 +884,9 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
if (anyTssNotDone && waitForTSSCounter == 0) {
for (int i = 0; i < tssReady.size(); i++) {
if (!tssReady[i].isReady() || tssReady[i].isError()) {
tssToKill.push_back(
std::pair(tssReadyInterfs[i].tssPairID.get(), tssReadyInterfs[i].id()));
tssToIgnore.insert(tssReadyInterfs[i].id());
}
}
// repeat loop and go back to start to kill tss' before continuing on
continue;
}
}
}
@ -920,22 +899,11 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
for (int s = 0; s < tssReady.size(); s++)
tssCount += tssReady[s].isReady() && !tssReady[s].isError();
/*if (tssReady.size()) {
printf(" fMK: [%s - %s) moved data to %d/%d servers and %d/%d tss\n",
begin.toString().c_str(),
keys.end.toString().c_str(),
count,
serverReady.size(),
tssCount,
tssReady.size());
} else {
printf(" fMK: [%s - %s) moved data to %d/%d servers\n",
begin.toString().c_str(),
keys.end.toString().c_str(),
count,
serverReady.size());
}*/
TraceEvent(SevDebug, waitInterval.end(), relocationIntervalId).detail("ReadyServers", count);
TraceEvent readyServersEv(SevDebug, waitInterval.end(), relocationIntervalId);
readyServersEv.detail("ReadyServers", count);
if (tssReady.size()) {
readyServersEv.detail("ReadyTSS", tssCount);
}
if (count == dest.size()) {
// update keyServers, serverKeys

View File

@ -18,7 +18,9 @@
* limitations under the License.
*/
#include <algorithm>
#include <vector>
#include "fdbclient/FDBTypes.h"
#include "fdbserver/MutationTracking.h"
#include "fdbserver/LogProtocolMessage.h"
#include "fdbserver/SpanContextMessage.h"
@ -27,43 +29,55 @@
#error "You cannot use mutation tracking in a clean/release build."
#endif
// Track up to 2 keys in simulation via enabling MUTATION_TRACKING_ENABLED and setting the keys here.
StringRef debugKey = LiteralStringRef("");
StringRef debugKey2 = LiteralStringRef("\xff\xff\xff\xff");
// If MUTATION_TRACKING_ENABLED is set, MutationTracking events will be logged for the
// keys in debugKeys and the ranges in debugRanges.
// Each entry is a pair of (label, keyOrRange) and the Label will be attached to the
// MutationTracking TraceEvent for easier searching/recognition.
std::vector<std::pair<const char *, KeyRef>> debugKeys = {
{"SomeKey", "foo"_sr}
};
std::vector<std::pair<const char *, KeyRangeRef>> debugRanges = {
{"Everything", {""_sr, "\xff\xff\xff\xff"_sr}}
};
TraceEvent debugMutationEnabled(const char* context, Version version, MutationRef const& mutation) {
if ((mutation.type == mutation.ClearRange || mutation.type == mutation.DebugKeyRange) &&
((mutation.param1 <= debugKey && mutation.param2 > debugKey) ||
(mutation.param1 <= debugKey2 && mutation.param2 > debugKey2))) {
TraceEvent event("MutationTracking");
event.detail("At", context)
.detail("Version", version)
.detail("MutationType", typeString[mutation.type])
.detail("KeyBegin", mutation.param1)
.detail("KeyEnd", mutation.param2);
return event;
} else if (mutation.param1 == debugKey || mutation.param1 == debugKey2) {
TraceEvent event("MutationTracking");
event.detail("At", context)
.detail("Version", version)
.detail("MutationType", typeString[mutation.type])
.detail("Key", mutation.param1)
.detail("Value", mutation.param2);
return event;
} else {
return TraceEvent();
TraceEvent debugMutationEnabled(const char* context, Version version, MutationRef const& mutation, UID id) {
const char *label = nullptr;
for(auto &labelKey : debugKeys) {
if(((mutation.type == mutation.ClearRange || mutation.type == mutation.DebugKeyRange) &&
KeyRangeRef(mutation.param1, mutation.param2).contains(labelKey.second)) ||
mutation.param1 == labelKey.second) {
label = labelKey.first;
break;
}
}
for(auto &labelRange : debugRanges) {
if(((mutation.type == mutation.ClearRange || mutation.type == mutation.DebugKeyRange) &&
KeyRangeRef(mutation.param1, mutation.param2).intersects(labelRange.second)) ||
labelRange.second.contains(mutation.param1)) {
label = labelRange.first;
break;
}
}
if(label != nullptr) {
TraceEvent event("MutationTracking", id);
event.detail("Label", label)
.detail("At", context)
.detail("Version", version)
.detail("Mutation", mutation);
return event;
}
return TraceEvent();
}
TraceEvent debugKeyRangeEnabled(const char* context, Version version, KeyRangeRef const& keys) {
if (keys.contains(debugKey) || keys.contains(debugKey2)) {
return debugMutation(context, version, MutationRef(MutationRef::DebugKeyRange, keys.begin, keys.end));
} else {
return TraceEvent();
}
TraceEvent debugKeyRangeEnabled(const char* context, Version version, KeyRangeRef const& keys, UID id) {
return debugMutation(context, version, MutationRef(MutationRef::DebugKeyRange, keys.begin, keys.end), id);
}
TraceEvent debugTagsAndMessageEnabled(const char* context, Version version, StringRef commitBlob) {
TraceEvent debugTagsAndMessageEnabled(const char* context, Version version, StringRef commitBlob, UID id) {
BinaryReader rdr(commitBlob, AssumeVersion(g_network->protocolVersion()));
while (!rdr.empty()) {
if (*(int32_t*)rdr.peekBytes(4) == VERSION_HEADER) {
@ -93,7 +107,7 @@ TraceEvent debugTagsAndMessageEnabled(const char* context, Version version, Stri
MutationRef m;
BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion()));
br >> m;
TraceEvent event = debugMutation(context, version, m);
TraceEvent event = debugMutation(context, version, m, id);
if (event.isEnabled()) {
event.detail("MessageTags", msg.tags);
return event;
@ -104,23 +118,23 @@ TraceEvent debugTagsAndMessageEnabled(const char* context, Version version, Stri
}
#if MUTATION_TRACKING_ENABLED
TraceEvent debugMutation(const char* context, Version version, MutationRef const& mutation) {
return debugMutationEnabled(context, version, mutation);
TraceEvent debugMutation(const char* context, Version version, MutationRef const& mutation, UID id) {
return debugMutationEnabled(context, version, mutation, id);
}
TraceEvent debugKeyRange(const char* context, Version version, KeyRangeRef const& keys) {
return debugKeyRangeEnabled(context, version, keys);
TraceEvent debugKeyRange(const char* context, Version version, KeyRangeRef const& keys, UID id) {
return debugKeyRangeEnabled(context, version, keys, id);
}
TraceEvent debugTagsAndMessage(const char* context, Version version, StringRef commitBlob) {
return debugTagsAndMessageEnabled(context, version, commitBlob);
TraceEvent debugTagsAndMessage(const char* context, Version version, StringRef commitBlob, UID id) {
return debugTagsAndMessageEnabled(context, version, commitBlob, id);
}
#else
TraceEvent debugMutation(const char* context, Version version, MutationRef const& mutation) {
TraceEvent debugMutation(const char* context, Version version, MutationRef const& mutation, UID id) {
return TraceEvent();
}
TraceEvent debugKeyRange(const char* context, Version version, KeyRangeRef const& keys) {
TraceEvent debugKeyRange(const char* context, Version version, KeyRangeRef const& keys, UID id) {
return TraceEvent();
}
TraceEvent debugTagsAndMessage(const char* context, Version version, StringRef commitBlob) {
TraceEvent debugTagsAndMessage(const char* context, Version version, StringRef commitBlob, UID id) {
return TraceEvent();
}
#endif

View File

@ -28,19 +28,18 @@
#define MUTATION_TRACKING_ENABLED 0
// The keys to track are defined in the .cpp file to limit recompilation.
#define DEBUG_MUTATION(context, version, mutation) MUTATION_TRACKING_ENABLED&& debugMutation(context, version, mutation)
TraceEvent debugMutation(const char* context, Version version, MutationRef const& mutation);
#define DEBUG_MUTATION(...) MUTATION_TRACKING_ENABLED && debugMutation(__VA_ARGS__)
TraceEvent debugMutation(const char* context, Version version, MutationRef const& mutation, UID id = UID());
// debugKeyRange and debugTagsAndMessage only log the *first* occurrence of a key in their range/commit.
// TODO: Create a TraceEventGroup that forwards all calls to each element of a vector<TraceEvent>,
// to allow "multiple" TraceEvents to be returned.
#define DEBUG_KEY_RANGE(context, version, keys) MUTATION_TRACKING_ENABLED&& debugKeyRange(context, version, keys)
TraceEvent debugKeyRange(const char* context, Version version, KeyRangeRef const& keys);
#define DEBUG_KEY_RANGE(...) MUTATION_TRACKING_ENABLED && debugKeyRange(__VA_ARGS__)
TraceEvent debugKeyRange(const char* context, Version version, KeyRangeRef const& keys, UID id = UID());
#define DEBUG_TAGS_AND_MESSAGE(context, version, commitBlob) \
MUTATION_TRACKING_ENABLED&& debugTagsAndMessage(context, version, commitBlob)
TraceEvent debugTagsAndMessage(const char* context, Version version, StringRef commitBlob);
#define DEBUG_TAGS_AND_MESSAGE(...) MUTATION_TRACKING_ENABLED && debugTagsAndMessage(__VA_ARGS__)
TraceEvent debugTagsAndMessage(const char* context, Version version, StringRef commitBlob, UID id = UID());
// TODO: Version Tracking. If the bug is in handling a version rather than a key, then it'd be good to be able to log
// each time that version is handled within simulation. A similar set of functions should be implemented.

View File

@ -300,6 +300,7 @@ struct TLogData : NonCopyable {
int64_t instanceID;
int64_t bytesInput;
int64_t bytesDurable;
int activePeekStreams = 0;
Version prevVersion;
@ -477,6 +478,7 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
});
specialCounter(
cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; });
specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; });
}
~LogData() {
@ -931,14 +933,15 @@ ACTOR Future<Void> tLogPop(TLogData* self, TLogPopRequest req, Reference<LogData
}
void peekMessagesFromMemory(Reference<LogData> self,
TLogPeekRequest const& req,
Tag tag,
Version reqBegin,
BinaryWriter& messages,
Version& endVersion) {
OldTag oldTag = convertTag(req.tag);
OldTag oldTag = convertTag(tag);
ASSERT(!messages.getLength());
auto& deque = get_version_messages(self, oldTag);
Version begin = std::max(req.begin, self->persistentDataDurableVersion + 1);
Version begin = std::max(reqBegin, self->persistentDataDurableVersion + 1);
auto it = std::lower_bound(deque.begin(),
deque.end(),
std::make_pair(begin, LengthPrefixedStringRef()),
@ -963,24 +966,33 @@ void peekMessagesFromMemory(Reference<LogData> self,
uint32_t subVersion;
rd >> messageLength >> subVersion;
messageLength += sizeof(uint16_t) + sizeof(Tag);
messages << messageLength << subVersion << uint16_t(1) << req.tag;
messages << messageLength << subVersion << uint16_t(1) << tag;
messageLength -= (sizeof(subVersion) + sizeof(uint16_t) + sizeof(Tag));
messages.serializeBytes(rd.readBytes(messageLength), messageLength);
}
}
}
ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference<LogData> logData) {
// Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request
ACTOR template <typename PromiseType>
Future<Void> tLogPeekMessages(PromiseType replyPromise,
TLogData* self,
Reference<LogData> logData,
Version reqBegin,
Tag reqTag,
bool reqReturnIfBlocked = false,
bool reqOnlySpilled = false,
Optional<std::pair<UID, int>> reqSequence = Optional<std::pair<UID, int>>()) {
state BinaryWriter messages(Unversioned());
state BinaryWriter messages2(Unversioned());
state int sequence = -1;
state UID peekId;
state OldTag oldTag = convertTag(req.tag);
state OldTag oldTag = convertTag(reqTag);
if (req.sequence.present()) {
if (reqSequence.present()) {
try {
peekId = req.sequence.get().first;
sequence = req.sequence.get().second;
peekId = reqSequence.get().first;
sequence = reqSequence.get().second;
if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS &&
self->peekTracker.find(peekId) == self->peekTracker.end()) {
throw operation_obsolete();
@ -989,12 +1001,12 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
auto& trackerData = self->peekTracker[peekId];
trackerData.lastUpdate = now();
Version ver = wait(trackerData.sequence_version[sequence].getFuture());
req.begin = std::max(ver, req.begin);
reqBegin = std::max(ver, reqBegin);
wait(yield());
}
} catch (Error& e) {
if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
replyPromise.sendError(e);
return Void();
} else {
throw;
@ -1002,22 +1014,22 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
}
}
if (req.returnIfBlocked && logData->version.get() < req.begin) {
req.reply.sendError(end_of_stream());
if (reqReturnIfBlocked && logData->version.get() < reqBegin) {
replyPromise.sendError(end_of_stream());
return Void();
}
//TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2);
//TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
// Wait until we have something to return that the caller doesn't already have
if (logData->version.get() < req.begin) {
wait(logData->version.whenAtLeast(req.begin));
if (logData->version.get() < reqBegin) {
wait(logData->version.whenAtLeast(reqBegin));
wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
}
state Version endVersion = logData->version.get() + 1;
Version poppedVer = poppedVersion(logData, oldTag);
if (poppedVer > req.begin) {
if (poppedVer > reqBegin) {
TLogPeekReply rep;
rep.maxKnownVersion = logData->version.get();
rep.minKnownCommittedVersion = 0;
@ -1025,12 +1037,12 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
rep.end = poppedVer;
rep.onlySpilled = false;
if (req.sequence.present()) {
if (reqSequence.present()) {
auto& trackerData = self->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence + 1];
trackerData.lastUpdate = now();
if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(operation_obsolete());
replyPromise.sendError(operation_obsolete());
if (!sequenceData.isSet())
sequenceData.sendError(operation_obsolete());
return Void();
@ -1038,37 +1050,37 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get() != rep.end) {
TEST(true); // tlog peek second attempt ended at a different version
req.reply.sendError(operation_obsolete());
replyPromise.sendError(operation_obsolete());
return Void();
}
} else {
sequenceData.send(rep.end);
}
rep.begin = req.begin;
rep.begin = reqBegin;
}
req.reply.send(rep);
replyPromise.send(rep);
return Void();
}
// grab messages from disk
//TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2);
if (req.begin <= logData->persistentDataDurableVersion) {
//TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
if (reqBegin <= logData->persistentDataDurableVersion) {
// Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We
// may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if
// an initial attempt to read from disk results in insufficient data and the required data is no longer in
// memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the
// result?
peekMessagesFromMemory(logData, req, messages2, endVersion);
peekMessagesFromMemory(logData, reqTag, reqBegin, messages2, endVersion);
RangeResult kvs = wait(self->persistentData->readRange(
KeyRangeRef(persistTagMessagesKey(logData->logId, oldTag, req.begin),
KeyRangeRef(persistTagMessagesKey(logData->logId, oldTag, reqBegin),
persistTagMessagesKey(logData->logId, oldTag, logData->persistentDataDurableVersion + 1)),
SERVER_KNOBS->DESIRED_TOTAL_BYTES,
SERVER_KNOBS->DESIRED_TOTAL_BYTES));
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence());
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence());
for (auto& kv : kvs) {
auto ver = decodeTagMessagesKey(kv.key);
@ -1080,7 +1092,7 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
uint32_t subVersion;
rd >> messageLength >> subVersion;
messageLength += sizeof(uint16_t) + sizeof(Tag);
messages << messageLength << subVersion << uint16_t(1) << req.tag;
messages << messageLength << subVersion << uint16_t(1) << reqTag;
messageLength -= (sizeof(subVersion) + sizeof(uint16_t) + sizeof(Tag));
messages.serializeBytes(rd.readBytes(messageLength), messageLength);
}
@ -1091,39 +1103,79 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
else
messages.serializeBytes(messages2.toValue());
} else {
peekMessagesFromMemory(logData, req, messages, endVersion);
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence());
peekMessagesFromMemory(logData, reqTag, reqBegin, messages, endVersion);
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence());
}
TLogPeekReply reply;
reply.maxKnownVersion = logData->version.get();
reply.minKnownCommittedVersion = 0;
reply.onlySpilled = false;
reply.messages = messages.toValue();
reply.messages = StringRef(reply.arena, messages.toValue());
reply.end = endVersion;
//TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress());
//TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress());
if (req.sequence.present()) {
if (reqSequence.present()) {
auto& trackerData = self->peekTracker[peekId];
trackerData.lastUpdate = now();
auto& sequenceData = trackerData.sequence_version[sequence + 1];
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get() != reply.end) {
TEST(true); // tlog peek second attempt ended at a different version (2)
req.reply.sendError(operation_obsolete());
replyPromise.sendError(operation_obsolete());
return Void();
}
} else {
sequenceData.send(reply.end);
}
reply.begin = req.begin;
reply.begin = reqBegin;
}
req.reply.send(reply);
replyPromise.send(reply);
return Void();
}
// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover
ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference<LogData> logData) {
self->activePeekStreams++;
state Version begin = req.begin;
state bool onlySpilled = false;
req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes));
loop {
state TLogPeekStreamReply reply;
state Promise<TLogPeekReply> promise;
state Future<TLogPeekReply> future(promise.getFuture());
try {
wait(req.reply.onReady() && store(reply.rep, future) &&
tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled));
reply.rep.begin = begin;
req.reply.send(reply);
begin = reply.rep.end;
onlySpilled = reply.rep.onlySpilled;
if (reply.rep.end > logData->version.get()) {
wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
} else {
wait(delay(0, g_network->getCurrentTask()));
}
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
return Void();
} else {
throw;
}
}
}
}
ACTOR Future<Void> doQueueCommit(TLogData* self, Reference<LogData> logData) {
state Version ver = logData->version.get();
state Version commitNumber = self->queueCommitBegin + 1;
@ -1288,7 +1340,13 @@ ACTOR Future<Void> serveTLogInterface(TLogData* self,
PromiseStream<Void> warningCollectorInput) {
loop choose {
when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) {
logData->addActor.send(tLogPeekMessages(self, req, logData));
logData->addActor.send(tLogPeekMessages(
req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence));
}
when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) {
TraceEvent(SevDebug, "TLogPeekStream", logData->logId)
.detail("Token", tli.peekStreamMessages.getEndpoint().token);
logData->addActor.send(tLogPeekStream(self, req, logData));
}
when(TLogPopRequest req = waitNext(tli.popMessages.getFuture())) {
logData->addActor.send(tLogPop(self, req, logData));
@ -1435,6 +1493,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self, LocalityData locality)
recruited.initEndpoints();
DUMPTOKEN(recruited.peekMessages);
DUMPTOKEN(recruited.peekStreamMessages);
DUMPTOKEN(recruited.popMessages);
DUMPTOKEN(recruited.commit);
DUMPTOKEN(recruited.lock);
@ -1574,7 +1633,7 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
state TLogData self(tlogId, workerID, persistentData, persistentQueue, db);
state Future<Void> error = actorCollection(self.sharedActors.getFuture());
TraceEvent("SharedTlog", tlogId).log();
TraceEvent("SharedTlog", tlogId).detail("Version", "4.6");
try {
wait(restorePersistentState(&self, locality));

View File

@ -276,6 +276,7 @@ struct TLogData : NonCopyable {
int64_t targetVolatileBytes; // The number of bytes of mutations this TLog should hold in memory before spilling.
int64_t overheadBytesInput;
int64_t overheadBytesDurable;
int activePeekStreams = 0;
WorkerCache<TLogInterface> tlogCache;
@ -572,6 +573,7 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
});
specialCounter(
cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; });
specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; });
}
~LogData() {
@ -1172,15 +1174,16 @@ std::deque<std::pair<Version, LengthPrefixedStringRef>>& getVersionMessages(Refe
};
void peekMessagesFromMemory(Reference<LogData> self,
TLogPeekRequest const& req,
Tag tag,
Version begin,
BinaryWriter& messages,
Version& endVersion) {
ASSERT(!messages.getLength());
auto& deque = getVersionMessages(self, req.tag);
auto& deque = getVersionMessages(self, tag);
//TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 0 : map1.begin()->value.size());
Version begin = std::max(req.begin, self->persistentDataDurableVersion + 1);
begin = std::max(begin, self->persistentDataDurableVersion + 1);
auto it = std::lower_bound(deque.begin(),
deque.end(),
std::make_pair(begin, LengthPrefixedStringRef()),
@ -1203,29 +1206,38 @@ void peekMessagesFromMemory(Reference<LogData> self,
}
}
ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference<LogData> logData) {
// Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request
ACTOR template <typename PromiseType>
Future<Void> tLogPeekMessages(PromiseType replyPromise,
TLogData* self,
Reference<LogData> logData,
Version reqBegin,
Tag reqTag,
bool reqReturnIfBlocked = false,
bool reqOnlySpilled = false,
Optional<std::pair<UID, int>> reqSequence = Optional<std::pair<UID, int>>()) {
state BinaryWriter messages(Unversioned());
state BinaryWriter messages2(Unversioned());
state int sequence = -1;
state UID peekId;
state double queueStart = now();
if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) {
req.tag.id = req.tag.id % logData->txsTags;
if (reqTag.locality == tagLocalityTxs && reqTag.id >= logData->txsTags && logData->txsTags > 0) {
reqTag.id = reqTag.id % logData->txsTags;
}
if (req.sequence.present()) {
if (reqSequence.present()) {
try {
peekId = req.sequence.get().first;
sequence = req.sequence.get().second;
peekId = reqSequence.get().first;
sequence = reqSequence.get().second;
if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS &&
logData->peekTracker.find(peekId) == logData->peekTracker.end()) {
throw operation_obsolete();
}
auto& trackerData = logData->peekTracker[peekId];
if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
trackerData.tag = req.tag;
trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled));
trackerData.tag = reqTag;
trackerData.sequence_version[0].send(std::make_pair(reqBegin, reqOnlySpilled));
}
auto seqBegin = trackerData.sequence_version.begin();
while (trackerData.sequence_version.size() &&
@ -1252,12 +1264,12 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
trackerData.lastUpdate = now();
std::pair<Version, bool> prevPeekData = wait(fPrevPeekData);
req.begin = std::max(prevPeekData.first, req.begin);
req.onlySpilled = prevPeekData.second;
reqBegin = std::max(prevPeekData.first, reqBegin);
reqOnlySpilled = prevPeekData.second;
wait(yield());
} catch (Error& e) {
if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
replyPromise.sendError(e);
return Void();
} else {
throw;
@ -1267,32 +1279,32 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
state double blockStart = now();
if (req.returnIfBlocked && logData->version.get() < req.begin) {
req.reply.sendError(end_of_stream());
if (req.sequence.present()) {
if (reqReturnIfBlocked && logData->version.get() < reqBegin) {
replyPromise.sendError(end_of_stream());
if (reqSequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence + 1];
if (!sequenceData.isSet()) {
sequenceData.send(std::make_pair(req.begin, req.onlySpilled));
sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled));
}
}
return Void();
}
//TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2);
//TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
// Wait until we have something to return that the caller doesn't already have
if (logData->version.get() < req.begin) {
wait(logData->version.whenAtLeast(req.begin));
if (logData->version.get() < reqBegin) {
wait(logData->version.whenAtLeast(reqBegin));
wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
}
if (logData->locality != tagLocalitySatellite && req.tag.locality == tagLocalityLogRouter) {
if (logData->locality != tagLocalitySatellite && reqTag.locality == tagLocalityLogRouter) {
wait(self->concurrentLogRouterReads.take());
state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads);
wait(delay(0.0, TaskPriority::Low));
}
if (req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) {
if (reqBegin <= logData->persistentDataDurableVersion && reqTag.locality != tagLocalityTxs && reqTag != txsTag) {
// Reading spilled data will almost always imply that the storage server is >5s behind the rest
// of the cluster. We shouldn't prioritize spending CPU on helping this server catch up
// slightly faster over keeping the rest of the cluster operating normally.
@ -1303,8 +1315,8 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
state double workStart = now();
Version poppedVer = poppedVersion(logData, req.tag);
if (poppedVer > req.begin) {
Version poppedVer = poppedVersion(logData, reqTag);
if (poppedVer > reqBegin) {
TLogPeekReply rep;
rep.maxKnownVersion = logData->version.get();
rep.minKnownCommittedVersion = logData->minKnownCommittedVersion;
@ -1312,12 +1324,12 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
rep.end = poppedVer;
rep.onlySpilled = false;
if (req.sequence.present()) {
if (reqSequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence + 1];
trackerData.lastUpdate = now();
if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(operation_obsolete());
replyPromise.sendError(operation_obsolete());
if (!sequenceData.isSet())
sequenceData.sendError(operation_obsolete());
return Void();
@ -1325,16 +1337,16 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get().first != rep.end) {
TEST(true); // tlog peek second attempt ended at a different version
req.reply.sendError(operation_obsolete());
replyPromise.sendError(operation_obsolete());
return Void();
}
} else {
sequenceData.send(std::make_pair(rep.end, rep.onlySpilled));
}
rep.begin = req.begin;
rep.begin = reqBegin;
}
req.reply.send(rep);
replyPromise.send(rep);
return Void();
}
@ -1342,27 +1354,27 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
state bool onlySpilled = false;
// grab messages from disk
//TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2);
if (req.begin <= logData->persistentDataDurableVersion) {
//TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
if (reqBegin <= logData->persistentDataDurableVersion) {
// Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We
// may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if
// an initial attempt to read from disk results in insufficient data and the required data is no longer in
// memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the
// result?
if (req.onlySpilled) {
if (reqOnlySpilled) {
endVersion = logData->persistentDataDurableVersion + 1;
} else {
peekMessagesFromMemory(logData, req, messages2, endVersion);
peekMessagesFromMemory(logData, reqTag, reqBegin, messages2, endVersion);
}
RangeResult kvs = wait(self->persistentData->readRange(
KeyRangeRef(persistTagMessagesKey(logData->logId, req.tag, req.begin),
persistTagMessagesKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)),
KeyRangeRef(persistTagMessagesKey(logData->logId, reqTag, reqBegin),
persistTagMessagesKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)),
SERVER_KNOBS->DESIRED_TOTAL_BYTES,
SERVER_KNOBS->DESIRED_TOTAL_BYTES));
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().address).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence());
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().address).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence());
for (auto& kv : kvs) {
auto ver = decodeTagMessagesKey(kv.key);
@ -1377,20 +1389,20 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
messages.serializeBytes(messages2.toValue());
}
} else {
peekMessagesFromMemory(logData, req, messages, endVersion);
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().address).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence());
peekMessagesFromMemory(logData, reqTag, reqBegin, messages, endVersion);
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().address).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence());
}
TLogPeekReply reply;
reply.maxKnownVersion = logData->version.get();
reply.minKnownCommittedVersion = logData->minKnownCommittedVersion;
reply.messages = messages.toValue();
reply.messages = StringRef(reply.arena, messages.toValue());
reply.end = endVersion;
reply.onlySpilled = onlySpilled;
//TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().address);
//TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", replyPromise.getEndpoint().address);
if (req.sequence.present()) {
if (reqSequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
trackerData.lastUpdate = now();
@ -1414,7 +1426,7 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
auto& sequenceData = trackerData.sequence_version[sequence + 1];
if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(operation_obsolete());
replyPromise.sendError(operation_obsolete());
if (!sequenceData.isSet())
sequenceData.sendError(operation_obsolete());
return Void();
@ -1423,19 +1435,59 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
trackerData.duplicatePeeks++;
if (sequenceData.getFuture().get().first != reply.end) {
TEST(true); // tlog peek second attempt ended at a different version (2)
req.reply.sendError(operation_obsolete());
replyPromise.sendError(operation_obsolete());
return Void();
}
} else {
sequenceData.send(std::make_pair(reply.end, reply.onlySpilled));
}
reply.begin = req.begin;
reply.begin = reqBegin;
}
req.reply.send(reply);
replyPromise.send(reply);
return Void();
}
// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover
ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference<LogData> logData) {
self->activePeekStreams++;
state Version begin = req.begin;
state bool onlySpilled = false;
req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes));
loop {
state TLogPeekStreamReply reply;
state Promise<TLogPeekReply> promise;
state Future<TLogPeekReply> future(promise.getFuture());
try {
wait(req.reply.onReady() && store(reply.rep, future) &&
tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled));
reply.rep.begin = begin;
req.reply.send(reply);
begin = reply.rep.end;
onlySpilled = reply.rep.onlySpilled;
if (reply.rep.end > logData->version.get()) {
wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
} else {
wait(delay(0, g_network->getCurrentTask()));
}
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
return Void();
} else {
throw;
}
}
}
}
ACTOR Future<Void> doQueueCommit(TLogData* self,
Reference<LogData> logData,
std::vector<Reference<LogData>> missingFinalCommit) {
@ -1930,7 +1982,13 @@ ACTOR Future<Void> serveTLogInterface(TLogData* self,
}
}
when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) {
logData->addActor.send(tLogPeekMessages(self, req, logData));
logData->addActor.send(tLogPeekMessages(
req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence));
}
when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) {
TraceEvent(SevDebug, "TLogPeekStream", logData->logId)
.detail("Token", tli.peekStreamMessages.getEndpoint().token);
logData->addActor.send(tLogPeekStream(self, req, logData));
}
when(TLogPopRequest req = waitNext(tli.popMessages.getFuture())) {
logData->addActor.send(tLogPop(self, req, logData));
@ -2327,6 +2385,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
recruited.initEndpoints();
DUMPTOKEN(recruited.peekMessages);
DUMPTOKEN(recruited.peekStreamMessages);
DUMPTOKEN(recruited.popMessages);
DUMPTOKEN(recruited.commit);
DUMPTOKEN(recruited.lock);
@ -2537,6 +2596,7 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
recruited.initEndpoints();
DUMPTOKEN(recruited.peekMessages);
DUMPTOKEN(recruited.peekStreamMessages);
DUMPTOKEN(recruited.popMessages);
DUMPTOKEN(recruited.commit);
DUMPTOKEN(recruited.lock);
@ -2729,7 +2789,8 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
state TLogData self(tlogId, workerID, persistentData, persistentQueue, db, degraded, folder);
state Future<Void> error = actorCollection(self.sharedActors.getFuture());
TraceEvent("SharedTlog", tlogId).log();
TraceEvent("SharedTlog", tlogId).detail("Version", "6.0");
try {
if (restoreFromDisk) {
wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests));

View File

@ -339,6 +339,7 @@ struct TLogData : NonCopyable {
int64_t targetVolatileBytes; // The number of bytes of mutations this TLog should hold in memory before spilling.
int64_t overheadBytesInput;
int64_t overheadBytesDurable;
int activePeekStreams = 0;
WorkerCache<TLogInterface> tlogCache;
FlowLock peekMemoryLimiter;
@ -661,6 +662,7 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; });
specialCounter(cc, "PeekMemoryReserved", [tLogData]() { return tLogData->peekMemoryLimiter.activePermits(); });
specialCounter(cc, "PeekMemoryRequestsStalled", [tLogData]() { return tLogData->peekMemoryLimiter.waiters(); });
specialCounter(cc, "ActivePeekStreams", [tLogData]() { return tLogData->activePeekStreams; });
}
~LogData() {
@ -1440,17 +1442,19 @@ ACTOR Future<Void> tLogPopCore(TLogData* self, Tag inputTag, Version to, Referen
}
uint64_t PoppedVersionLag = logData->persistentDataDurableVersion - logData->queuePoppedVersion;
if ( SERVER_KNOBS->ENABLE_DETAILED_TLOG_POP_TRACE &&
(logData->queuePoppedVersion > 0) && //avoid generating massive events at beginning
(tagData->unpoppedRecovered || PoppedVersionLag >= SERVER_KNOBS->TLOG_POPPED_VER_LAG_THRESHOLD_FOR_TLOGPOP_TRACE)) { //when recovery or long lag
if (SERVER_KNOBS->ENABLE_DETAILED_TLOG_POP_TRACE &&
(logData->queuePoppedVersion > 0) && // avoid generating massive events at beginning
(tagData->unpoppedRecovered ||
PoppedVersionLag >=
SERVER_KNOBS->TLOG_POPPED_VER_LAG_THRESHOLD_FOR_TLOGPOP_TRACE)) { // when recovery or long lag
TraceEvent("TLogPopDetails", logData->logId)
.detail("Tag", tagData->tag.toString())
.detail("UpTo", upTo)
.detail("PoppedVersionLag", PoppedVersionLag)
.detail("MinPoppedTag", logData->minPoppedTag.toString())
.detail("QueuePoppedVersion", logData->queuePoppedVersion)
.detail("UnpoppedRecovered", tagData->unpoppedRecovered ? "True" : "False")
.detail("NothingPersistent", tagData->nothingPersistent ? "True" : "False");
.detail("Tag", tagData->tag.toString())
.detail("UpTo", upTo)
.detail("PoppedVersionLag", PoppedVersionLag)
.detail("MinPoppedTag", logData->minPoppedTag.toString())
.detail("QueuePoppedVersion", logData->queuePoppedVersion)
.detail("UnpoppedRecovered", tagData->unpoppedRecovered ? "True" : "False")
.detail("NothingPersistent", tagData->nothingPersistent ? "True" : "False");
}
if (upTo > logData->persistentDataDurableVersion)
wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop));
@ -1487,15 +1491,16 @@ ACTOR Future<Void> tLogPop(TLogData* self, TLogPopRequest req, Reference<LogData
}
void peekMessagesFromMemory(Reference<LogData> self,
TLogPeekRequest const& req,
Tag tag,
Version begin,
BinaryWriter& messages,
Version& endVersion) {
ASSERT(!messages.getLength());
auto& deque = getVersionMessages(self, req.tag);
auto& deque = getVersionMessages(self, tag);
//TraceEvent("TLogPeekMem", self->dbgid).detail("Tag", req.tag1).detail("PDS", self->persistentDataSequence).detail("PDDS", self->persistentDataDurableSequence).detail("Oldest", map1.empty() ? 0 : map1.begin()->key ).detail("OldestMsgCount", map1.empty() ? 0 : map1.begin()->value.size());
Version begin = std::max(req.begin, self->persistentDataDurableVersion + 1);
begin = std::max(begin, self->persistentDataDurableVersion + 1);
auto it = std::lower_bound(deque.begin(),
deque.end(),
std::make_pair(begin, LengthPrefixedStringRef()),
@ -1540,29 +1545,38 @@ ACTOR Future<std::vector<StringRef>> parseMessagesForTag(StringRef commitBlob, T
return relevantMessages;
}
ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Reference<LogData> logData) {
// Common logics to peek TLog and create TLogPeekReply that serves both streaming peek or normal peek request
ACTOR template <typename PromiseType>
Future<Void> tLogPeekMessages(PromiseType replyPromise,
TLogData* self,
Reference<LogData> logData,
Version reqBegin,
Tag reqTag,
bool reqReturnIfBlocked = false,
bool reqOnlySpilled = false,
Optional<std::pair<UID, int>> reqSequence = Optional<std::pair<UID, int>>()) {
state BinaryWriter messages(Unversioned());
state BinaryWriter messages2(Unversioned());
state int sequence = -1;
state UID peekId;
state double queueStart = now();
if (req.tag.locality == tagLocalityTxs && req.tag.id >= logData->txsTags && logData->txsTags > 0) {
req.tag.id = req.tag.id % logData->txsTags;
if (reqTag.locality == tagLocalityTxs && reqTag.id >= logData->txsTags && logData->txsTags > 0) {
reqTag.id = reqTag.id % logData->txsTags;
}
if (req.sequence.present()) {
if (reqSequence.present()) {
try {
peekId = req.sequence.get().first;
sequence = req.sequence.get().second;
peekId = reqSequence.get().first;
sequence = reqSequence.get().second;
if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS &&
logData->peekTracker.find(peekId) == logData->peekTracker.end()) {
throw operation_obsolete();
}
auto& trackerData = logData->peekTracker[peekId];
if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) {
trackerData.tag = req.tag;
trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled));
trackerData.tag = reqTag;
trackerData.sequence_version[0].send(std::make_pair(reqBegin, reqOnlySpilled));
}
auto seqBegin = trackerData.sequence_version.begin();
// The peek cursor and this comparison need to agree about the maximum number of in-flight requests.
@ -1589,12 +1603,12 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
}
trackerData.lastUpdate = now();
std::pair<Version, bool> prevPeekData = wait(fPrevPeekData);
req.begin = std::max(prevPeekData.first, req.begin);
req.onlySpilled = prevPeekData.second;
reqBegin = std::max(prevPeekData.first, reqBegin);
reqOnlySpilled = prevPeekData.second;
wait(yield());
} catch (Error& e) {
if (e.code() == error_code_timed_out || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
replyPromise.sendError(e);
return Void();
} else {
throw;
@ -1604,32 +1618,32 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
state double blockStart = now();
if (req.returnIfBlocked && logData->version.get() < req.begin) {
req.reply.sendError(end_of_stream());
if (req.sequence.present()) {
if (reqReturnIfBlocked && logData->version.get() < reqBegin) {
replyPromise.sendError(end_of_stream());
if (reqSequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence + 1];
if (!sequenceData.isSet()) {
sequenceData.send(std::make_pair(req.begin, req.onlySpilled));
sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled));
}
}
return Void();
}
//TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2);
//TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
// Wait until we have something to return that the caller doesn't already have
if (logData->version.get() < req.begin) {
wait(logData->version.whenAtLeast(req.begin));
if (logData->version.get() < reqBegin) {
wait(logData->version.whenAtLeast(reqBegin));
wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
}
if (req.tag.locality == tagLocalityLogRouter) {
if (reqTag.locality == tagLocalityLogRouter) {
wait(self->concurrentLogRouterReads.take());
state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads);
wait(delay(0.0, TaskPriority::Low));
}
if (req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) {
if (reqBegin <= logData->persistentDataDurableVersion && reqTag.locality != tagLocalityTxs && reqTag != txsTag) {
// Reading spilled data will almost always imply that the storage server is >5s behind the rest
// of the cluster. We shouldn't prioritize spending CPU on helping this server catch up
// slightly faster over keeping the rest of the cluster operating normally.
@ -1640,8 +1654,8 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
state double workStart = now();
Version poppedVer = poppedVersion(logData, req.tag);
if (poppedVer > req.begin) {
Version poppedVer = poppedVersion(logData, reqTag);
if (poppedVer > reqBegin) {
TLogPeekReply rep;
rep.maxKnownVersion = logData->version.get();
rep.minKnownCommittedVersion = logData->minKnownCommittedVersion;
@ -1649,12 +1663,12 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
rep.end = poppedVer;
rep.onlySpilled = false;
if (req.sequence.present()) {
if (reqSequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence + 1];
trackerData.lastUpdate = now();
if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(operation_obsolete());
replyPromise.sendError(operation_obsolete());
if (!sequenceData.isSet())
sequenceData.sendError(operation_obsolete());
return Void();
@ -1662,16 +1676,16 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get().first != rep.end) {
TEST(true); // tlog peek second attempt ended at a different version
req.reply.sendError(operation_obsolete());
replyPromise.sendError(operation_obsolete());
return Void();
}
} else {
sequenceData.send(std::make_pair(rep.end, rep.onlySpilled));
}
rep.begin = req.begin;
rep.begin = reqBegin;
}
req.reply.send(rep);
replyPromise.send(rep);
return Void();
}
@ -1679,24 +1693,24 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
state bool onlySpilled = false;
// grab messages from disk
//TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", req.begin.epoch).detail("ReqBeginSeq", req.begin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", req.tag1).detail("Tag2", req.tag2);
if (req.begin <= logData->persistentDataDurableVersion) {
//TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
if (reqBegin <= logData->persistentDataDurableVersion) {
// Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We
// may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if
// an initial attempt to read from disk results in insufficient data and the required data is no longer in
// memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the
// result?
if (req.onlySpilled) {
if (reqOnlySpilled) {
endVersion = logData->persistentDataDurableVersion + 1;
} else {
peekMessagesFromMemory(logData, req, messages2, endVersion);
peekMessagesFromMemory(logData, reqTag, reqBegin, messages2, endVersion);
}
if (req.tag.locality == tagLocalityTxs || req.tag == txsTag) {
if (reqTag.locality == tagLocalityTxs || reqTag == txsTag) {
RangeResult kvs = wait(self->persistentData->readRange(
KeyRangeRef(persistTagMessagesKey(logData->logId, req.tag, req.begin),
persistTagMessagesKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)),
KeyRangeRef(persistTagMessagesKey(logData->logId, reqTag, reqBegin),
persistTagMessagesKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)),
SERVER_KNOBS->DESIRED_TOTAL_BYTES,
SERVER_KNOBS->DESIRED_TOTAL_BYTES));
@ -1716,11 +1730,11 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
// FIXME: Limit to approximately DESIRED_TOTATL_BYTES somehow.
RangeResult kvrefs = wait(self->persistentData->readRange(
KeyRangeRef(
persistTagMessageRefsKey(logData->logId, req.tag, req.begin),
persistTagMessageRefsKey(logData->logId, req.tag, logData->persistentDataDurableVersion + 1)),
persistTagMessageRefsKey(logData->logId, reqTag, reqBegin),
persistTagMessageRefsKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)),
SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1));
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence());
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence());
state std::vector<std::pair<IDiskQueue::location, IDiskQueue::location>> commitLocations;
state bool earlyEnd = false;
@ -1737,7 +1751,7 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
earlyEnd = true;
break;
}
if (sd.version >= req.begin) {
if (sd.version >= reqBegin) {
firstVersion = std::min(firstVersion, sd.version);
const IDiskQueue::location end = sd.start.lo + sd.length;
commitLocations.emplace_back(sd.start, end);
@ -1779,7 +1793,7 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
messages << VERSION_HEADER << entry.version;
std::vector<StringRef> rawMessages =
wait(parseMessagesForTag(entry.messages, req.tag, logData->logRouterTags));
wait(parseMessagesForTag(entry.messages, reqTag, logData->logRouterTags));
for (const StringRef& msg : rawMessages) {
messages.serializeBytes(msg);
}
@ -1799,25 +1813,25 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
}
}
} else {
if (req.onlySpilled) {
if (reqOnlySpilled) {
endVersion = logData->persistentDataDurableVersion + 1;
} else {
peekMessagesFromMemory(logData, req, messages, endVersion);
peekMessagesFromMemory(logData, reqTag, reqBegin, messages, endVersion);
}
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence());
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence());
}
TLogPeekReply reply;
reply.maxKnownVersion = logData->version.get();
reply.minKnownCommittedVersion = logData->minKnownCommittedVersion;
reply.messages = messages.toValue();
reply.messages = StringRef(reply.arena, messages.toValue());
reply.end = endVersion;
reply.onlySpilled = onlySpilled;
//TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress());
//TraceEvent("TlogPeek", self->dbgid).detail("LogId", logData->logId).detail("EndVer", reply.end).detail("MsgBytes", reply.messages.expectedSize()).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress());
if (req.sequence.present()) {
if (reqSequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
trackerData.lastUpdate = now();
@ -1841,7 +1855,7 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
auto& sequenceData = trackerData.sequence_version[sequence + 1];
if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) {
req.reply.sendError(operation_obsolete());
replyPromise.sendError(operation_obsolete());
if (!sequenceData.isSet())
sequenceData.sendError(operation_obsolete());
return Void();
@ -1850,19 +1864,59 @@ ACTOR Future<Void> tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen
trackerData.duplicatePeeks++;
if (sequenceData.getFuture().get().first != reply.end) {
TEST(true); // tlog peek second attempt ended at a different version (2)
req.reply.sendError(operation_obsolete());
replyPromise.sendError(operation_obsolete());
return Void();
}
} else {
sequenceData.send(std::make_pair(reply.end, reply.onlySpilled));
}
reply.begin = req.begin;
reply.begin = reqBegin;
}
req.reply.send(reply);
replyPromise.send(reply);
return Void();
}
// This actor keep pushing TLogPeekStreamReply until it's removed from the cluster or should recover
ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Reference<LogData> logData) {
self->activePeekStreams++;
state Version begin = req.begin;
state bool onlySpilled = false;
req.reply.setByteLimit(std::min(SERVER_KNOBS->MAXIMUM_PEEK_BYTES, req.limitBytes));
loop {
state TLogPeekStreamReply reply;
state Promise<TLogPeekReply> promise;
state Future<TLogPeekReply> future(promise.getFuture());
try {
wait(req.reply.onReady() && store(reply.rep, future) &&
tLogPeekMessages(promise, self, logData, begin, req.tag, req.returnIfBlocked, onlySpilled));
reply.rep.begin = begin;
req.reply.send(reply);
begin = reply.rep.end;
onlySpilled = reply.rep.onlySpilled;
if (reply.rep.end > logData->version.get()) {
wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
} else {
wait(delay(0, g_network->getCurrentTask()));
}
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
return Void();
} else {
throw;
}
}
}
}
ACTOR Future<Void> watchDegraded(TLogData* self) {
if (g_network->isSimulated() && g_simulator.speedUpSimulation) {
return Void();
@ -2373,7 +2427,13 @@ ACTOR Future<Void> serveTLogInterface(TLogData* self,
}
}
when(TLogPeekRequest req = waitNext(tli.peekMessages.getFuture())) {
logData->addActor.send(tLogPeekMessages(self, req, logData));
logData->addActor.send(tLogPeekMessages(
req.reply, self, logData, req.begin, req.tag, req.returnIfBlocked, req.onlySpilled, req.sequence));
}
when(TLogPeekStreamRequest req = waitNext(tli.peekStreamMessages.getFuture())) {
TraceEvent(SevDebug, "TLogPeekStream", logData->logId)
.detail("Token", tli.peekStreamMessages.getEndpoint().token);
logData->addActor.send(tLogPeekStream(self, req, logData));
}
when(TLogPopRequest req = waitNext(tli.popMessages.getFuture())) {
logData->addActor.send(tLogPop(self, req, logData));
@ -2788,6 +2848,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
recruited.initEndpoints();
DUMPTOKEN(recruited.peekMessages);
DUMPTOKEN(recruited.peekStreamMessages);
DUMPTOKEN(recruited.popMessages);
DUMPTOKEN(recruited.commit);
DUMPTOKEN(recruited.lock);
@ -2826,9 +2887,9 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
logsByVersion.emplace_back(ver, id1);
TraceEvent("TLogPersistentStateRestore", self->dbgid)
.detail("LogId", logData->logId)
.detail("Ver", ver)
.detail("RecoveryCount", logData->recoveryCount);
.detail("LogId", logData->logId)
.detail("Ver", ver)
.detail("RecoveryCount", logData->recoveryCount);
// Restore popped keys. Pop operations that took place after the last (committed) updatePersistentDataVersion
// might be lost, but that is fine because we will get the corresponding data back, too.
tagKeys = prefixRange(rawId.withPrefix(persistTagPoppedKeys.begin));
@ -3019,6 +3080,7 @@ ACTOR Future<Void> tLogStart(TLogData* self, InitializeTLogRequest req, Locality
recruited.initEndpoints();
DUMPTOKEN(recruited.peekMessages);
DUMPTOKEN(recruited.peekStreamMessages);
DUMPTOKEN(recruited.popMessages);
DUMPTOKEN(recruited.commit);
DUMPTOKEN(recruited.lock);
@ -3218,7 +3280,8 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
state TLogData self(tlogId, workerID, persistentData, persistentQueue, db, degraded, folder);
state Future<Void> error = actorCollection(self.sharedActors.getFuture());
TraceEvent("SharedTlog", tlogId).log();
TraceEvent("SharedTlog", tlogId).detail("Version", "6.2");
try {
if (restoreFromDisk) {
wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests));

Some files were not shown because too many files have changed in this diff Show More