Merge remote-tracking branch 'origin/main' into bugfixes/machines-attrition-debugging

This commit is contained in:
Markus Pilman 2022-10-24 15:24:36 -06:00
commit e7b5b870a3
161 changed files with 3561 additions and 1376 deletions

View File

@ -274,85 +274,21 @@ if(NOT WIN32)
@CLUSTER_FILE@
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
)
add_fdbclient_test(
NAME fdb_c_api_tests
DISABLE_LOG_DUMP
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
)
add_fdbclient_test(
NAME fdb_c_api_tests_local_only
DISABLE_LOG_DUMP
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/local_tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
)
add_fdbclient_test(
NAME fdb_c_api_tests_blob_granule
DISABLE_LOG_DUMP
API_TEST_BLOB_GRANULES_ENABLED
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/blobgranuletests
--blob-granule-local-file-path
@DATA_DIR@/fdbblob/
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
)
add_fdbclient_test(
NAME fdb_c_api_tests_with_tls
DISABLE_LOG_DUMP
TLS_ENABLED
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
--tmp-dir
@TMP_DIR@
--log-dir
@LOG_DIR@
--tls-cert-file
@CLIENT_CERT_FILE@
--tls-key-file
@CLIENT_KEY_FILE@
--tls-ca-file
@SERVER_CA_FILE@
)
file(GLOB API_TEST_FILES "${CMAKE_CURRENT_SOURCE_DIR}/test/apitester/tests/*.toml")
foreach(test_file ${API_TEST_FILES})
get_filename_component(file_name "${test_file}" NAME_WE)
set(test_name "fdb_c_api_test_${file_name}")
add_test(NAME "${test_name}"
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--build-dir ${CMAKE_BINARY_DIR}
--api-tester-bin $<TARGET_FILE:fdb_c_api_tester>
--external-client-library ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-file ${test_file}
--knob delete-native-lib-after-loading=false
)
set_tests_properties("${test_name}" PROPERTIES TIMEOUT 300)
endforeach()
add_test(NAME fdb_c_upgrade_to_future_version
COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
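Note: the new loop above registers one ctest target per TOML file, named fdb_c_api_test_{file_name}. For instance (the file name here is hypothetical), a spec tests/ApiCorrectness.toml becomes runnable as:

    ctest -R fdb_c_api_test_ApiCorrectness -VV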

View File

@ -1,15 +0,0 @@
[[test]]
title = 'Blob Granule API Correctness Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -1,15 +0,0 @@
[[test]]
title = 'Blob Granule Errors Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -279,9 +279,9 @@ bool parseArgs(TesterOptions& options, int argc, char** argv) {
return true;
}
void fdb_check(fdb::Error e) {
if (e) {
fmt::print(stderr, "Unexpected FDB error: {}({})\n", e.code(), e.what());
void fdb_check(fdb::Error e, std::string_view msg, fdb::Error::CodeType expectedError = error_code_success) {
if (e.code()) {
fmt::print(stderr, "{}, Error: {}({})\n", msg, e.code(), e.what());
std::abort();
}
}
@ -453,13 +453,13 @@ int main(int argc, char** argv) {
applyNetworkOptions(options);
fdb::network::setup();
std::thread network_thread{ &fdb::network::run };
std::thread network_thread{ [] { fdb_check(fdb::network::run(), "FDB network thread failed"); } };
if (!runWorkloads(options)) {
retCode = 1;
}
fdb_check(fdb::network::stop());
fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
network_thread.join();
} catch (const std::exception& err) {
fmt::print(stderr, "ERROR: {}\n", err.what());

View File

@ -1,29 +0,0 @@
[[test]]
title = 'API Correctness Single Threaded'
minClients = 1
maxClients = 3
minDatabases = 1
maxDatabases = 3
multiThreaded = false
disableClientBypass = true
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -29,31 +29,39 @@ from pathlib import Path
import glob
import random
import string
import toml
sys.path[:0] = [os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "tests", "TestRunner")]
# fmt: off
from tmp_cluster import TempCluster
from local_cluster import TLSConfig
# fmt: on
TESTER_STATS_INTERVAL_SEC = 5
def random_string(len):
return ''.join(random.choice(string.ascii_letters + string.digits) for i in range(len))
return "".join(random.choice(string.ascii_letters + string.digits) for i in range(len))
def get_logger():
return logging.getLogger('foundationdb.run_c_api_tests')
return logging.getLogger("foundationdb.run_c_api_tests")
def initialize_logger_level(logging_level):
logger = get_logger()
assert logging_level in ['DEBUG', 'INFO', 'WARNING', 'ERROR']
assert logging_level in ["DEBUG", "INFO", "WARNING", "ERROR"]
logging.basicConfig(format='%(message)s')
if logging_level == 'DEBUG':
logging.basicConfig(format="%(message)s")
if logging_level == "DEBUG":
logger.setLevel(logging.DEBUG)
elif logging_level == 'INFO':
elif logging_level == "INFO":
logger.setLevel(logging.INFO)
elif logging_level == 'WARNING':
elif logging_level == "WARNING":
logger.setLevel(logging.WARNING)
elif logging_level == 'ERROR':
elif logging_level == "ERROR":
logger.setLevel(logging.ERROR)
@ -65,35 +73,52 @@ def dump_client_logs(log_dir):
print(">>>>>>>>>>>>>>>>>>>> End of {}:".format(log_file))
def run_tester(args, test_file):
cmd = [args.tester_binary,
"--cluster-file", args.cluster_file,
"--test-file", test_file,
"--stats-interval", str(TESTER_STATS_INTERVAL_SEC*1000)]
def run_tester(args, cluster, test_file):
build_dir = Path(args.build_dir).resolve()
tester_binary = Path(args.api_tester_bin).resolve()
external_client_library = build_dir.joinpath("bindings", "c", "libfdb_c_external.so")
log_dir = Path(cluster.log).joinpath("client")
log_dir.mkdir(exist_ok=True)
cmd = [
tester_binary,
"--cluster-file",
cluster.cluster_file,
"--test-file",
test_file,
"--stats-interval",
str(TESTER_STATS_INTERVAL_SEC * 1000),
"--tmp-dir",
cluster.tmp_dir,
"--log",
"--log-dir",
str(log_dir),
]
if args.external_client_library is not None:
cmd += ["--external-client-library", args.external_client_library]
if args.tmp_dir is not None:
cmd += ["--tmp-dir", args.tmp_dir]
log_dir = None
if args.log_dir is not None:
log_dir = Path(args.log_dir).joinpath(random_string(8))
log_dir.mkdir(exist_ok=True)
cmd += ['--log', "--log-dir", str(log_dir)]
external_client_library = Path(args.external_client_library).resolve()
cmd += ["--external-client-library", external_client_library]
if args.blob_granule_local_file_path is not None:
cmd += ["--blob-granule-local-file-path",
args.blob_granule_local_file_path]
if cluster.blob_granules_enabled:
cmd += [
"--blob-granule-local-file-path",
str(cluster.data.joinpath("fdbblob")) + os.sep,
]
if args.tls_ca_file is not None:
cmd += ["--tls-ca-file", args.tls_ca_file]
if cluster.tls_config is not None:
cmd += [
"--tls-ca-file",
cluster.server_ca_file,
"--tls-key-file",
cluster.client_key_file,
"--tls-cert-file",
cluster.client_cert_file,
]
if args.tls_key_file is not None:
cmd += ["--tls-key-file", args.tls_key_file]
for knob in args.knobs:
knob_name, knob_value = knob.split("=")
cmd += ["--knob-" + knob_name, knob_value]
if args.tls_cert_file is not None:
cmd += ["--tls-cert-file", args.tls_cert_file]
get_logger().info('\nRunning tester \'%s\'...' % ' '.join(cmd))
get_logger().info("\nRunning tester '%s'..." % " ".join(map(str, cmd)))
proc = Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
timed_out = False
ret_code = 1
@ -103,34 +128,76 @@ def run_tester(args, test_file):
proc.kill()
timed_out = True
except Exception as e:
raise Exception('Unable to run tester (%s)' % e)
raise Exception("Unable to run tester (%s)" % e)
if ret_code != 0:
if timed_out:
reason = 'timed out after %d seconds' % args.timeout
reason = "timed out after %d seconds" % args.timeout
elif ret_code < 0:
reason = signal.Signals(-ret_code).name
else:
reason = 'exit code: %d' % ret_code
get_logger().error('\n\'%s\' did not complete successfully (%s)' %
(cmd[0], reason))
if (log_dir is not None):
reason = "exit code: %d" % ret_code
get_logger().error("\n'%s' did not complete succesfully (%s)" % (cmd[0], reason))
if log_dir is not None:
dump_client_logs(log_dir)
get_logger().info('')
get_logger().info("")
return ret_code
class TestConfig:
def __init__(self, test_file):
config = toml.load(test_file)
server_config = config.get("server", [{}])[0]
self.tenants_enabled = server_config.get("tenants_enabled", True)
self.blob_granules_enabled = server_config.get("blob_granules_enabled", False)
self.tls_enabled = server_config.get("tls_enabled", False)
self.client_chain_len = server_config.get("tls_client_chain_len", 2)
self.server_chain_len = server_config.get("tls_server_chain_len", 3)
self.min_num_processes = server_config.get("min_num_processes", 1)
self.max_num_processes = server_config.get("max_num_processes", 3)
self.num_processes = random.randint(self.min_num_processes, self.max_num_processes)
def run_test(args, test_file):
config = TestConfig(test_file)
tls_config = None
if config.tls_enabled:
tls_config = TLSConfig(
server_chain_len=config.server_chain_len,
client_chain_len=config.client_chain_len,
)
with TempCluster(
args.build_dir,
config.num_processes,
enable_tenants=config.tenants_enabled,
blob_granules_enabled=config.blob_granules_enabled,
tls_config=tls_config,
) as cluster:
ret_code = run_tester(args, cluster, test_file)
if not cluster.check_cluster_logs():
ret_code = 1 if ret_code == 0 else ret_code
return ret_code
def run_tests(args):
num_failed = 0
test_files = [f for f in os.listdir(args.test_dir) if os.path.isfile(
os.path.join(args.test_dir, f)) and f.endswith(".toml")]
if args.test_file is not None:
test_files = [Path(args.test_file).resolve()]
else:
test_files = [
f
for f in os.listdir(args.test_dir)
if os.path.isfile(os.path.join(args.test_dir, f)) and f.endswith(".toml")
]
for test_file in test_files:
get_logger().info('=========================================================')
get_logger().info('Running test %s' % test_file)
get_logger().info('=========================================================')
ret_code = run_tester(args, os.path.join(args.test_dir, test_file))
get_logger().info("=========================================================")
get_logger().info("Running test %s" % test_file)
get_logger().info("=========================================================")
ret_code = run_test(args, os.path.join(args.test_dir, test_file))
if ret_code != 0:
num_failed += 1
@ -138,32 +205,49 @@ def run_tests(args):
def parse_args(argv):
parser = argparse.ArgumentParser(description='FoundationDB C API Tester')
parser.add_argument('--cluster-file', type=str, default="fdb.cluster",
help='The cluster file for the cluster being connected to. (default: fdb.cluster)')
parser.add_argument('--tester-binary', type=str, default="fdb_c_api_tester",
help='Path to the fdb_c_api_tester executable. (default: fdb_c_api_tester)')
parser.add_argument('--external-client-library', type=str, default=None,
help='Path to the external client library. (default: None)')
parser.add_argument('--test-dir', type=str, default="./",
help='Path to a directory with test definitions. (default: ./)')
parser.add_argument('--timeout', type=int, default=300,
help='The timeout in seconds for running each individual test. (default 300)')
parser.add_argument('--log-dir', type=str, default=None,
help='The directory for storing logs (default: None)')
parser.add_argument('--logging-level', type=str, default='INFO',
choices=['ERROR', 'WARNING', 'INFO', 'DEBUG'], help='Specifies the level of detail in the tester output (default=\'INFO\').')
parser.add_argument('--tmp-dir', type=str, default=None,
help='The directory for storing temporary files (default: None)')
parser.add_argument('--blob-granule-local-file-path', type=str, default=None,
help='Enable blob granule tests if set, value is path to local blob granule files')
parser.add_argument('--tls-ca-file', type=str, default=None,
help='Path to client\'s TLS CA file: i.e. certificate of CA that signed the server certificate')
parser.add_argument('--tls-cert-file', type=str, default=None,
help='Path to client\'s TLS certificate file')
parser.add_argument('--tls-key-file', type=str, default=None,
help='Path to client\'s TLS private key file')
parser = argparse.ArgumentParser(description="FoundationDB C API Tester")
parser.add_argument("--build-dir", "-b", type=str, required=True, help="FDB build directory")
parser.add_argument("--api-tester-bin", type=str, help="Path to the fdb_c_api_tester executable.", required=True)
parser.add_argument("--external-client-library", type=str, help="Path to the external client library.")
parser.add_argument(
"--cluster-file",
type=str,
default="fdb.cluster",
help="The cluster file for the cluster being connected to. (default: fdb.cluster)",
)
parser.add_argument(
"--test-dir",
type=str,
default="./",
help="Path to a directory with test definitions. (default: ./)",
)
parser.add_argument(
"--test-file",
type=str,
default=None,
help="Path to a single test definition to be executed, overrides --test-dir if set.",
)
parser.add_argument(
"--timeout",
type=int,
default=300,
help="The timeout in seconds for running each individual test. (default 300)",
)
parser.add_argument(
"--logging-level",
type=str,
default="INFO",
choices=["ERROR", "WARNING", "INFO", "DEBUG"],
help="Specifies the level of detail in the tester output (default='INFO').",
)
parser.add_argument(
"--knob",
type=str,
default=[],
action="append",
dest="knobs",
help="[lowercase-knob-name]=[knob-value] (there may be multiple --knob options)",
)
return parser.parse_args(argv)
@ -174,5 +258,5 @@ def main(argv):
return run_tests(args)
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))

View File

@ -12,13 +12,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -11,13 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -0,0 +1,18 @@
[[test]]
title = 'Blob Granule API Correctness Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -11,12 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -11,12 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -0,0 +1,18 @@
[[test]]
title = 'Blob Granule Errors Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'BlobGranuleErrors'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100

View File

@ -12,13 +12,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -11,13 +11,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -12,13 +12,13 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,28 @@
[[test]]
title = 'Cancel Transaction with Database per Transaction with TLS'
multiThreaded = true
buggify = true
databasePerTransaction = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -11,15 +11,15 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 10
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 10
maxTxTimeoutMs = 10000

View File

@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -12,23 +12,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -0,0 +1,29 @@
[[test]]
title = 'API Correctness Single Threaded'
minClients = 1
maxClients = 3
minDatabases = 1
maxDatabases = 3
multiThreaded = false
disableClientBypass = true
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -11,23 +11,23 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -4,23 +4,23 @@ minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -0,0 +1,37 @@
[[test]]
title = 'API Correctness with TLS'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
[[test.workload]]
name = 'WatchAndWait'
initialSize = 0
numRandomOperations = 10

View File

@ -11,23 +11,22 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000

View File

@ -9,13 +9,13 @@ maxClients = 8
minTenants = 2
maxTenants = 5
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,25 @@
[[test]]
title = 'Multi-tenant API Correctness Multi Threaded'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minClients = 2
maxClients = 8
minTenants = 2
maxTenants = 5
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 5
initialSize = 100
numRandomOperations = 200
readExistingKeysRatio = 0.9

View File

@ -12,13 +12,13 @@ maxClientThreads = 4
minClients = 2
maxClients = 4
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,28 @@
[[test]]
title = 'Test tampering the cluster file with TLS'
multiThreaded = true
buggify = true
tamperClusterFile = true
minFdbThreads = 2
maxFdbThreads = 4
minDatabases = 2
maxDatabases = 4
minClientThreads = 2
maxClientThreads = 4
minClients = 2
maxClients = 4
[[server]]
tls_enabled = true
max_num_processes = 1
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -46,7 +46,7 @@ int main(int argc, char** argv) {
}
fdb_check(fdb_select_api_version(FDB_API_VERSION));
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
fdb_check(
fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, reinterpret_cast<const uint8_t*>(""), 0));

View File

@ -321,7 +321,16 @@ int populate(Database db,
const auto key_begin = insertBegin(args.rows, worker_id, thread_id, args.num_processes, args.num_threads);
const auto key_end = insertEnd(args.rows, worker_id, thread_id, args.num_processes, args.num_threads);
auto key_checkpoint = key_begin; // in case of commit failure, restart from this key
double required_keys = (key_end - key_begin + 1) * args.load_factor;
for (auto i = key_begin; i <= key_end; i++) {
// Choose required_keys out of the remaining (key_end - i + 1) keys at random, so each key is
// selected with probability required_keys / (key_end - i + 1): generate a random number in
// [0, 1) and keep the key if the number is at most that probability.
double r = rand() / (1.0 + RAND_MAX);
if (r > required_keys / (key_end - i + 1)) {
continue;
}
--required_keys;
/* sequential keys */
genKey(keystr.data(), KEY_PREFIX, args, i);
/* random values */
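The loop above is selection sampling (Knuth's Algorithm S): visiting candidate keys in order and accepting each with probability required/remaining draws a uniform sample that preserves key order; with an integer requirement the count is exact, and with a fractional load_factor it is approximate. A self-contained sketch of the technique, with hypothetical names rather than mako's actual helpers:

    #include <cstdlib>

    // Visit keys [begin, end] in order, selecting `required` of them uniformly at random.
    void select_keys(int begin, int end, double required) {
        for (int i = begin; i <= end; i++) {
            double r = rand() / (1.0 + RAND_MAX); // uniform in [0, 1)
            if (r > required / (end - i + 1))
                continue; // key i not selected
            --required;
            // ... process selected key i ...
        }
    }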
@ -984,6 +993,7 @@ int initArguments(Arguments& args) {
args.async_xacts = 0;
args.mode = MODE_INVALID;
args.rows = 100000;
args.load_factor = 1.0;
args.row_digits = digits(args.rows);
args.seconds = 30;
args.iteration = 0;
@ -1166,6 +1176,7 @@ void usage() {
printf("%-24s %s\n", "-t, --threads=THREADS", "Specify number of worker threads");
printf("%-24s %s\n", " --async_xacts", "Specify number of concurrent transactions to be run in async mode");
printf("%-24s %s\n", "-r, --rows=ROWS", "Specify number of records");
printf("%-24s %s\n", "-l, --load_factor=LOAD_FACTOR", "Specify load factor");
printf("%-24s %s\n", "-s, --seconds=SECONDS", "Specify the test duration in seconds\n");
printf("%-24s %s\n", "", "This option cannot be specified with --iteration.");
printf("%-24s %s\n", "-i, --iteration=ITERS", "Specify the number of iterations.\n");
@ -1228,6 +1239,7 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
{ "threads", required_argument, NULL, 't' },
{ "async_xacts", required_argument, NULL, ARG_ASYNC },
{ "rows", required_argument, NULL, 'r' },
{ "load_factor", required_argument, NULL, 'l' },
{ "seconds", required_argument, NULL, 's' },
{ "iteration", required_argument, NULL, 'i' },
{ "keylen", required_argument, NULL, ARG_KEYLEN },
@ -1304,6 +1316,9 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
args.rows = atoi(optarg);
args.row_digits = digits(args.rows);
break;
case 'l':
args.load_factor = atof(optarg);
break;
case 's':
args.seconds = atoi(optarg);
break;
@ -1523,6 +1538,10 @@ int validateArguments(Arguments const& args) {
logr.error("--rows must be a positive integer");
return -1;
}
if (args.load_factor <= 0 || args.load_factor > 1) {
logr.error("--load_factor must be in range (0, 1]");
return -1;
}
if (args.key_length < 0) {
logr.error("--keylen must be a positive integer");
return -1;
@ -2118,6 +2137,7 @@ int statsProcessMain(Arguments const& args,
fmt::fprintf(fp, "\"async_xacts\": %d,", args.async_xacts);
fmt::fprintf(fp, "\"mode\": %d,", args.mode);
fmt::fprintf(fp, "\"rows\": %d,", args.rows);
fmt::fprintf(fp, "\"load_factor\": %lf,", args.load_factor);
fmt::fprintf(fp, "\"seconds\": %d,", args.seconds);
fmt::fprintf(fp, "\"iteration\": %d,", args.iteration);
fmt::fprintf(fp, "\"tpsmax\": %d,", args.tpsmax);

View File

@ -138,6 +138,7 @@ struct Arguments {
int async_xacts;
int mode;
int rows; /* is 2 billion enough? */
double load_factor;
int row_digits;
int seconds;
int iteration;

View File

@ -233,7 +233,7 @@ int main(int argc, char** argv) {
applyNetworkOptions(options);
fdb::network::setup();
std::thread network_thread{ &fdb::network::run };
std::thread network_thread{ [] { fdb_check(fdb::network::run(), "FDB network thread failed"); } };
// Try calling some basic functionality that is available
// in all recent API versions

View File

@ -271,7 +271,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
db = fdb_open_database(argv[1]);
timeoutDb = fdb_open_database(argv[1]);

View File

@ -66,7 +66,7 @@ TEST_CASE("setup") {
},
&context));
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
CHECK(!context.called);
fdb_check(fdb_stop_network());

View File

@ -68,7 +68,7 @@ int main(int argc, char** argv) {
set_net_opt(FDBNetworkOption::FDB_NET_OPTION_TRACE_PARTIAL_FILE_SUFFIX, trace_partial_file_suffix);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
// Apparently you need to open a database to initialize logging
FDBDatabase* out;

View File

@ -2998,7 +2998,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
db = fdb_open_database(argv[1]);
clusterFilePath = std::string(argv[1]);

View File

@ -88,7 +88,7 @@ int main(int argc, char** argv) {
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
{
FDBCluster* cluster;

View File

@ -392,11 +392,6 @@ func (o DatabaseOptions) SetTransactionIncludePortInAddress() error {
return o.setOpt(505, nil)
}
// Set a random idempotency id for all transactions. See the transaction option description for more information.
func (o DatabaseOptions) SetTransactionAutomaticIdempotency() error {
return o.setOpt(506, nil)
}
// Allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. This sets the ``bypass_unreadable`` option of each transaction created by this database. See the transaction option description for more information.
func (o DatabaseOptions) SetTransactionBypassUnreadable() error {
return o.setOpt(700, nil)
@ -556,18 +551,6 @@ func (o TransactionOptions) SetSizeLimit(param int64) error {
return o.setOpt(503, int64ToBytes(param))
}
// Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes.
//
// Parameter: Unique ID
func (o TransactionOptions) SetIdempotencyId(param string) error {
return o.setOpt(504, []byte(param))
}
// Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future.
func (o TransactionOptions) SetAutomaticIdempotency() error {
return o.setOpt(505, nil)
}
// Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior.
func (o TransactionOptions) SetSnapshotRywEnable() error {
return o.setOpt(600, nil)

View File

@ -320,11 +320,11 @@ function(create_long_running_correctness_package)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${package_files}
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTest.sh
${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTest.sh
${out_dir}/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh
${out_dir}/joshua_timeout
COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${package_files}
${out_dir}/joshua_test

View File

@ -0,0 +1,9 @@
#!/bin/sh
# Simulation currently has memory leaks. We need to investigate before we can enable leak detection in joshua.
export ASAN_OPTIONS="detect_leaks=0"
OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}"
#mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false
python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} --long-running

View File

@ -0,0 +1,3 @@
#!/bin/bash -u
python3 -m test_harness.timeout --long-running

View File

@ -184,6 +184,8 @@ class Config:
self.reproduce_prefix: str | None = None
self.reproduce_prefix_args = {'type': str, 'required': False,
'help': 'When printing the results, prepend this string to the command'}
self.long_running: bool = False
self.long_running_args = {'action': 'store_true'}
self._env_names: Dict[str, str] = {}
self._config_map = self._build_map()
self._read_env()

View File

@ -303,6 +303,7 @@ class TestRun:
self.stats: str | None = stats
self.expected_unseed: int | None = expected_unseed
self.use_valgrind: bool = config.use_valgrind
self.long_running: bool = config.long_running
self.old_binary_path: Path = config.old_binaries_path
self.buggify_enabled: bool = buggify_enabled
self.fault_injection_enabled: bool = True
@ -375,7 +376,7 @@ class TestRun:
process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path,
text=True, env=env)
did_kill = False
timeout = 20 * config.kill_seconds if self.use_valgrind else config.kill_seconds
timeout = 20 * config.kill_seconds if self.use_valgrind or self.long_running else config.kill_seconds
err_out: str
try:
_, err_out = process.communicate(timeout=timeout)

View File

@ -384,6 +384,7 @@ class Summary:
child.attributes['Severity'] = '40'
child.attributes['ErrorCount'] = str(self.errors)
self.out.append(child)
self.error = True
if self.was_killed:
child = SummaryTree('ExternalTimeout')
child.attributes['Severity'] = '40'
@ -420,6 +421,7 @@ class Summary:
child = SummaryTree('TestUnexpectedlyNotFinished')
child.attributes['Severity'] = '40'
self.out.append(child)
self.error = True
if self.error_out is not None and len(self.error_out) > 0:
lines = self.error_out.splitlines()
stderr_bytes = 0

View File

@ -47,6 +47,12 @@ Note that the quotas are specified in terms of bytes/second, and internally conv
page_cost_quota = ceiling(byte_quota / CLIENT_KNOBS->READ_COST_BYTE_FACTOR)
```
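For example, assuming `READ_COST_BYTE_FACTOR` is at its default of 16384 (an assumption; this knob is configurable), a total quota of 32768 bytes/second is stored internally as:

```
page_cost_quota = ceiling(32768 / 16384) = 2
```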
To clear both the reserved and total throughput quotas for a tag, run:
```
fdbcli> quota clear <tag>
```
### Limit Calculation
The transaction budget that ratekeeper calculates and distributes to clients (via GRV proxies) for each tag is calculated based on several intermediate rate calculations, outlined in this section.

View File

@ -524,6 +524,12 @@ The ``start`` command will start a new restore on the specified (or default) tag
``--inconsistent-snapshot-only``
Ignore mutation log files during the restore to speedup the process. Because only range files are restored, this option gives an inconsistent snapshot in most cases and is not recommended to use.
``--user-data``
Restore only the user keyspace. This option should NOT be used alongside --system-metadata (below) and CANNOT be used alongside other specified key ranges.
``--system-metadata``
Restore only the relevant system keyspace. This option should NOT be used alongside --user-data (above) and CANNOT be used alongside other specified key ranges.
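For illustration, a hypothetical invocation that restores only the user keyspace (the backup URL and cluster file are placeholders):

.. code-block:: bash

   fdbrestore start -r <BACKUP_URL> --dest-cluster-file <CLUSTER_FILE> --user-data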
.. program:: fdbrestore abort
``abort``

View File

@ -648,6 +648,16 @@ The subclasses of the ``ApiWorkload`` inherit the following configuration option
initiated by a test script to check if the client workload is successfully progressing after a
cluster change.
The FDB server configuration can be specialized in the section ``[[server]]``:
- ``tenants_enabled``: enable multitenancy (default: true)
- ``blob_granules_enabled``: enable support for blob granules (default: false)
- ``tls_enabled``: enable TLS (default: false)
- ``tls_client_chain_len``: the length of the client-side TLS chain (default: 2)
- ``tls_server_chain_len``: the length of the server-side TLS chain (default: 3)
- ``min_num_processes`` and ``max_num_processes``: the number of FDB server processes,
chosen at random from the given range (default: 1-3)
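For example, a test that needs a single-process cluster with TLS enabled could combine these options as follows (the values shown are illustrative):

.. code-block:: toml

   [[server]]
   tls_enabled = true
   tls_client_chain_len = 2
   tls_server_chain_len = 3
   max_num_processes = 1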
Executing the Tests
===================
@ -656,19 +666,35 @@ according to its specification. Before that we must create a FDB cluster and pas
a parameter to ``fdb_c_api_tester``. Note that multithreaded tests also need to be provided with an
external client library.
For example, we can create a temporary cluster and use it for execution of one of the existing API tests:
The ``run_c_api_tests.py`` script automates execution of the API tests on a local cluster. The cluster
is created according to the options specified in the ``[[server]]`` section of the given test file.
.. code-block:: bash
${srcDir}/tests/TestRunner/tmp_cluster.py --build-dir ${buildDir} -- \
${buildDir}/bin/fdb_c_api_tester \
--cluster-file @CLUSTER_FILE@ \
--external-client-library=${buildDir}/bindings/c/libfdb_c_external.so \
${srcDir}/bindings/c/test/apitester/run_c_api_tests.py
--build-dir ${buildDir}
--api-tester-bin ${buildDir}/bin/fdb_c_api_tester
--external-client-library ${buildDir}/bindings/c/libfdb_c_external.so
--test-file ${srcDir}/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml
The test specifications added to the ``bindings/c/test/apitester/tests/`` directory are executed as a part
of the regression test suite. They can be executed using the ``ctest`` target ``fdb_c_api_tests``:
of the regression test suite as ``ctest`` targets with names ``fdb_c_api_test_{file_name}``.
The ``ctest`` targets provide a more convenient way to execute the API tests. We can execute
a single test:
.. code-block:: bash
ctest -R fdb_c_api_tests -VV
ctest -R fdb_c_api_test_CApiCorrectnessMultiThr -VV
or execute all of them in parallel (here ``-j20`` specifies the parallelization level):
.. code-block:: bash
ctest -R fdb_c_api_test_ -j20 --output-on-failure
More sophisticated filters can be applied to execute a selected set of tests, e.g. the tests using TLS:
.. code-block:: bash
ctest -R 'fdb_c_api_test_.*TLS' -j20 --output-on-failure

View File

@ -528,7 +528,8 @@
"duplicate_mutation_fetch_timeout",
"primary_dc_missing",
"fetch_primary_dc_timeout",
"fetch_storage_wiggler_stats_timeout"
"fetch_storage_wiggler_stats_timeout",
"fetch_consistency_scan_info_timeout"
]
},
"issues":[

View File

@ -47,6 +47,7 @@
#include "fdbclient/IKnobCollection.h"
#include "fdbclient/RunTransaction.actor.h"
#include "fdbclient/S3BlobStore.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/json_spirit/json_spirit_writer_template.h"
#include "flow/Platform.h"
@ -155,6 +156,11 @@ enum {
OPT_RESTORE_CLUSTERFILE_ORIG,
OPT_RESTORE_BEGIN_VERSION,
OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY,
// The two restore options below allow callers of fdbrestore to split a normal restore into one
// that restores just the system keyspace and another that restores just the user keyspace. This
// is unlike the backup command, where all keys (both system and user) are backed up together.
OPT_RESTORE_USER_DATA,
OPT_RESTORE_SYSTEM_DATA,
// Shared constants
OPT_CLUSTERFILE,
@ -696,6 +702,8 @@ CSimpleOpt::SOption g_rgRestoreOptions[] = {
{ OPT_BACKUPKEYS, "--keys", SO_REQ_SEP },
{ OPT_WAITFORDONE, "-w", SO_NONE },
{ OPT_WAITFORDONE, "--waitfordone", SO_NONE },
{ OPT_RESTORE_USER_DATA, "--user-data", SO_NONE },
{ OPT_RESTORE_SYSTEM_DATA, "--system-metadata", SO_NONE },
{ OPT_RESTORE_VERSION, "--version", SO_REQ_SEP },
{ OPT_RESTORE_VERSION, "-v", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
@ -1187,6 +1195,13 @@ static void printRestoreUsage(bool devhelp) {
printf(" The cluster file for the original database from which the backup was created. The "
"original database\n");
printf(" is only needed to convert a --timestamp argument to a database version.\n");
printf(" --user-data\n"
" Restore only the user keyspace. This option should NOT be used alongside "
"--system-metadata (below) and CANNOT be used alongside other specified key ranges.\n");
printf(
" --system-metadata\n"
" Restore only the relevant system keyspace. This option "
"should NOT be used alongside --user-data (above) and CANNOT be used alongside other specified key ranges.\n");
if (devhelp) {
#ifdef _WIN32
@ -3367,6 +3382,8 @@ int main(int argc, char* argv[]) {
bool trace = false;
bool quietDisplay = false;
bool dryRun = false;
bool restoreSystemKeys = false;
bool restoreUserKeys = false;
// TODO (Nim): Set this value when we add optional encrypt_files CLI argument to backup agent start
bool encryptionEnabled = true;
std::string traceDir = "";
@ -3691,6 +3708,14 @@ int main(int argc, char* argv[]) {
restoreVersion = ver;
break;
}
case OPT_RESTORE_USER_DATA: {
restoreUserKeys = true;
break;
}
case OPT_RESTORE_SYSTEM_DATA: {
restoreSystemKeys = true;
break;
}
case OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY: {
inconsistentSnapshotOnly.set(true);
break;
@ -3838,6 +3863,11 @@ int main(int argc, char* argv[]) {
}
}
if (restoreSystemKeys && restoreUserKeys) {
fprintf(stderr, "ERROR: Please only specify one of --user-data or --system-metadata, not both\n");
return FDB_EXIT_ERROR;
}
if (trace) {
if (!traceLogGroup.empty())
setNetworkOption(FDBNetworkOptions::TRACE_LOG_GROUP, StringRef(traceLogGroup));
@ -3938,10 +3968,30 @@ int main(int argc, char* argv[]) {
// The fastrestore tool does not yet support multiple ranges and is incompatible with tenants
// or other features that back up data in the system keys
if (backupKeys.empty() && programExe != ProgramExe::FASTRESTORE_TOOL) {
if (!restoreSystemKeys && !restoreUserKeys && backupKeys.empty() &&
programExe != ProgramExe::FASTRESTORE_TOOL) {
addDefaultBackupRanges(backupKeys);
}
if ((restoreSystemKeys || restoreUserKeys) && programExe == ProgramExe::FASTRESTORE_TOOL) {
fprintf(stderr, "ERROR: Options: --user-data and --system-metadata are not supported with fastrestore\n");
return FDB_EXIT_ERROR;
}
if ((restoreUserKeys || restoreSystemKeys) && !backupKeys.empty()) {
fprintf(stderr,
"ERROR: Cannot specify additional ranges when using --user-data or --system-metadata "
"options\n");
return FDB_EXIT_ERROR;
}
if (restoreUserKeys) {
backupKeys.push_back_deep(backupKeys.arena(), normalKeys);
} else if (restoreSystemKeys) {
for (const auto& r : getSystemBackupRanges()) {
backupKeys.push_back_deep(backupKeys.arena(), r);
}
}
switch (programExe) {
case ProgramExe::AGENT:
if (!initCluster())

View File

@ -56,7 +56,7 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
loop {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
try {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(tag.withPrefix(tagQuotaPrefix));
state ThreadFuture<Optional<Value>> resultFuture = tr->get(ThrottleApi::getTagQuotaKey(tag));
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
if (!v.present()) {
fmt::print("<empty>\n");
@ -77,11 +77,10 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType, double value) {
state Reference<ITransaction> tr = db->createTransaction();
state Key key = tag.withPrefix(tagQuotaPrefix);
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(key);
state ThreadFuture<Optional<Value>> resultFuture = tr->get(ThrottleApi::getTagQuotaKey(tag));
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
ThrottleApi::TagQuotaValue quota;
if (v.present()) {
@ -94,8 +93,27 @@ ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
} else if (limitType == LimitType::RESERVED) {
quota.reservedQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1;
}
if (!quota.isValid()) {
throw invalid_throttle_quota_value();
}
ThrottleApi::setTagQuota(tr, tag, quota.reservedQuota, quota.totalQuota);
wait(safeThreadFutureToFuture(tr->commit()));
fmt::print("Successfully updated quota.\n");
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR Future<Void> clearQuota(Reference<IDatabase> db, TransactionTag tag) {
state Reference<ITransaction> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
tr->clear(ThrottleApi::getTagQuotaKey(tag));
wait(safeThreadFutureToFuture(tr->commit()));
fmt::print("Successfully cleared quota.\n");
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
@ -104,7 +122,7 @@ ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
}
constexpr auto usage = "quota [get <tag> [reserved_throughput|total_throughput] | set <tag> "
"[reserved_throughput|total_throughput] <value>]";
"[reserved_throughput|total_throughput] <value> | clear <tag>]";
bool exitFailure() {
fmt::print(usage);
@ -117,30 +135,40 @@ namespace fdb_cli {
ACTOR Future<bool> quotaCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
state bool result = true;
if (tokens.size() != 5 && tokens.size() != 6) {
if (tokens.size() < 3 || tokens.size() > 5) {
return exitFailure();
} else {
auto tag = parseTag(tokens[2]);
auto limitType = parseLimitType(tokens[3]);
if (!tag.present() || !limitType.present()) {
auto const tag = parseTag(tokens[2]);
if (!tag.present()) {
return exitFailure();
}
if (tokens[1] == "get"_sr) {
if (tokens.size() != 4) {
return exitFailure();
}
auto const limitType = parseLimitType(tokens[3]);
if (!limitType.present()) {
return exitFailure();
}
wait(getQuota(db, tag.get(), limitType.get()));
return true;
} else if (tokens[1] == "set"_sr) {
if (tokens.size() != 5) {
return exitFailure();
}
auto const limitType = parseLimitType(tokens[3]);
auto const limitValue = parseLimitValue(tokens[4]);
if (!limitValue.present()) {
if (!limitType.present() || !limitValue.present()) {
return exitFailure();
}
wait(setQuota(db, tag.get(), limitType.get(), limitValue.get()));
return true;
} else if (tokens[1] == "clear"_sr) {
if (tokens.size() != 3) {
return exitFailure();
}
wait(clearQuota(db, tag.get()));
return true;
} else {
return exitFailure();
}

View File

@ -542,8 +542,8 @@ void initHelp() {
"Displays the current read version of the database or currently running transaction.");
helpMap["quota"] = CommandHelp("quota",
"quota [get <tag> [reserved_throughput|total_throughput] | set <tag> "
"[reserved_throughput|total_throughput] <value>]",
"Get or modify the throughput quota for the specified tag.");
"[reserved_throughput|total_throughput] <value> | clear <tag>]",
"Get, modify, or clear the throughput quota for the specified tag.");
helpMap["reset"] =
CommandHelp("reset",
"reset the current transaction",
@ -1480,6 +1480,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
if (isCommitDesc && tokens.size() == 1) {
// prompt for description and add to txn
state Optional<std::string> raw;
warn.cancel();
while (!raw.present() || raw.get().empty()) {
fprintf(stdout,
"Please set a description for the change. Description must be non-empty.\n");
@ -1490,6 +1491,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
std::string line = raw.get();
config_tr->set("\xff\xff/description"_sr, line);
}
warn =
checkStatus(timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb);
if (transtype == TransType::Db) {
wait(commitTransaction(tr));
} else {
@ -1821,6 +1824,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
if (!intrans) {
// prompt for description and add to txn
state Optional<std::string> raw_desc;
warn.cancel();
while (!raw_desc.present() || raw_desc.get().empty()) {
fprintf(stdout,
"Please set a description for the change. Description must be non-empty\n");
@ -1830,6 +1834,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
}
std::string line = raw_desc.get();
config_tr->set("\xff\xff/description"_sr, line);
warn = checkStatus(
timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb);
wait(commitTransaction(config_tr));
} else {
isCommitDesc = true;

View File

@ -103,6 +103,59 @@ def maintenance(logger):
output3 = run_fdbcli_command('maintenance')
assert output3 == no_maintenance_output
@enable_logging()
def quota(logger):
# Should be a noop
command = 'quota clear green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == 'Successfully cleared quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '<empty>'
# Ignored update
command = 'quota set red total_throughput 49152'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == 'Successfully updated quota.'
command = 'quota set green total_throughput 32768'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == 'Successfully updated quota.'
command = 'quota set green reserved_throughput 16384'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == 'Successfully updated quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '32768'
command = 'quota get green reserved_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '16384'
command = 'quota clear green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == 'Successfully cleared quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '<empty>'
# Too few arguments, should log help message
command = 'quota get green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
@enable_logging()
def setclass(logger):
@ -1035,6 +1088,7 @@ if __name__ == '__main__':
integer_options()
tls_address_suffix()
knobmanagement()
quota()
else:
assert args.process_number > 1, "Process number should be positive"
coordinators()

View File

@ -63,7 +63,7 @@ public:
m_buffer = Standalone<VectorRef<uint8_t>>(old.slice(size, old.size()));
// Write the old buffer to the underlying file and update the write offset
Future<Void> r = holdWhile(old, m_file->write(old.begin(), size, m_writeOffset));
Future<Void> r = uncancellable(holdWhile(old, m_file->write(old.begin(), size, m_writeOffset)));
m_writeOffset += size;
return r;

View File

@ -1057,6 +1057,9 @@ ParsedDeltaBoundaryRef deltaAtVersion(const DeltaBoundaryRef& delta, Version beg
beginVersion <= delta.clearVersion.get();
if (delta.values.empty()) {
return ParsedDeltaBoundaryRef(delta.key, clearAfter);
} else if (readVersion >= delta.values.back().version && beginVersion <= delta.values.back().version) {
// For all but at most one delta file in a read, readVersion covers every version in the file; optimize this common case.
return ParsedDeltaBoundaryRef(delta.key, clearAfter, delta.values.back());
}
auto valueAtVersion = std::lower_bound(delta.values.begin(),
delta.values.end(),
@ -1324,7 +1327,8 @@ typedef std::priority_queue<MergeStreamNext, std::vector<MergeStreamNext>, Order
static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
const std::vector<Standalone<VectorRef<ParsedDeltaBoundaryRef>>>& streams,
const std::vector<bool> startClears) {
const std::vector<bool> startClears,
GranuleMaterializeStats& stats) {
ASSERT(streams.size() < std::numeric_limits<int16_t>::max());
ASSERT(startClears.size() == streams.size());
@ -1337,6 +1341,10 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
std::set<int16_t, std::greater<int16_t>> activeClears;
int16_t maxActiveClear = -1;
// Trade memory for CPU: reserve capacity as if every merged row were an insert.
RangeResult result;
int maxExpectedSize = 0;
// check if a given stream is actively clearing
bool clearActive[streams.size()];
for (int16_t i = 0; i < streams.size(); i++) {
@ -1354,10 +1362,12 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
item.streamIdx = i;
item.dataIdx = 0;
next.push(item);
maxExpectedSize += streams[i].size();
result.arena().dependsOn(streams[i].arena());
}
}
result.reserve(result.arena(), maxExpectedSize);
RangeResult result;
std::vector<MergeStreamNext> cur;
cur.reserve(streams.size());
while (!next.empty()) {
@ -1373,6 +1383,7 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
// un-set clears and find latest value for key (if present)
bool foundValue = false;
bool includesSnapshot = cur.back().streamIdx == 0 && chunk.snapshotFile.present();
for (auto& it : cur) {
auto& v = streams[it.streamIdx][it.dataIdx];
if (clearActive[it.streamIdx]) {
@ -1391,7 +1402,14 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
if (v.isSet() && maxActiveClear < it.streamIdx) {
KeyRef finalKey =
chunk.tenantPrefix.present() ? v.key.removePrefix(chunk.tenantPrefix.get()) : v.key;
result.push_back_deep(result.arena(), KeyValueRef(finalKey, v.value));
result.push_back(result.arena(), KeyValueRef(finalKey, v.value));
if (!includesSnapshot) {
stats.rowsInserted++;
} else if (it.streamIdx > 0) {
stats.rowsUpdated++;
}
} else if (includesSnapshot) {
stats.rowsCleared++;
}
}
}
@ -1413,6 +1431,36 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk,
}
}
// FIXME: if memory assumption was wrong and result is significantly smaller than total input size, could copy it
// with push_back_deep to a new result. This is rare though
stats.outputBytes += result.expectedSize();
return result;
}
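mergeDeltaStreams pairs two ideas visible in the hunks above: reserve the worst-case output size once up front, and k-way merge version-sorted streams through a heap in which the highest stream index wins key ties, so newer deltas shadow older ones. A reduced standalone sketch of that merge shape (std::string keys instead of KeyRef, no arenas or clear tracking; all names illustrative):

#include <cstdint>
#include <queue>
#include <string>
#include <vector>

struct Item {
    std::string key;
    int16_t streamIdx;
    size_t dataIdx;
};
struct Order {
    bool operator()(const Item& a, const Item& b) const {
        // priority_queue pops the "largest" element, so invert the key order
        // to pop the smallest key first; on ties, pop the newest stream first.
        if (a.key != b.key)
            return a.key > b.key;
        return a.streamIdx < b.streamIdx;
    }
};

std::vector<std::string> mergeStreams(const std::vector<std::vector<std::string>>& streams) {
    std::vector<std::string> result;
    size_t maxExpected = 0;
    std::priority_queue<Item, std::vector<Item>, Order> next;
    for (int16_t i = 0; i < (int16_t)streams.size(); i++) {
        if (!streams[i].empty()) {
            next.push({ streams[i][0], i, 0 });
            maxExpected += streams[i].size();
        }
    }
    result.reserve(maxExpected); // trade memory for CPU: no regrowth mid-merge
    while (!next.empty()) {
        Item cur = next.top();
        next.pop();
        // Drop the same key from older streams, advancing each of them.
        while (!next.empty() && next.top().key == cur.key) {
            Item stale = next.top();
            next.pop();
            if (stale.dataIdx + 1 < streams[stale.streamIdx].size())
                next.push({ streams[stale.streamIdx][stale.dataIdx + 1], stale.streamIdx, stale.dataIdx + 1 });
        }
        result.push_back(cur.key);
        if (cur.dataIdx + 1 < streams[cur.streamIdx].size())
            next.push({ streams[cur.streamIdx][cur.dataIdx + 1], cur.streamIdx, cur.dataIdx + 1 });
    }
    return result;
}

int main() {
    auto merged = mergeStreams({ { "a", "c" }, { "b", "c" } });
    (void)merged; // { "a", "b", "c" }: the newer stream's "c" wins the tie
}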
RangeResult materializeJustSnapshot(const BlobGranuleChunkRef& chunk,
Optional<StringRef> snapshotData,
const KeyRange& requestRange,
GranuleMaterializeStats& stats) {
stats.inputBytes += snapshotData.get().size();
Standalone<VectorRef<ParsedDeltaBoundaryRef>> snapshotRows = loadSnapshotFile(
chunk.snapshotFile.get().filename, snapshotData.get(), requestRange, chunk.snapshotFile.get().cipherKeysCtx);
RangeResult result;
if (!snapshotRows.empty()) {
result.arena().dependsOn(snapshotRows.arena());
result.reserve(result.arena(), snapshotRows.size());
for (auto& it : snapshotRows) {
// TODO REMOVE validation
ASSERT(it.op == MutationRef::Type::SetValue);
KeyRef finalKey = chunk.tenantPrefix.present() ? it.key.removePrefix(chunk.tenantPrefix.get()) : it.key;
result.push_back(result.arena(), KeyValueRef(finalKey, it.value));
}
stats.outputBytes += result.expectedSize();
stats.snapshotRows += result.size();
}
return result;
}
@ -1421,7 +1469,8 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[]) {
StringRef deltaFileData[],
GranuleMaterializeStats& stats) {
// TODO REMOVE with early replying
ASSERT(readVersion == chunk.includedVersion);
@ -1438,12 +1487,18 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
requestRange = keyRange;
}
// fast case for only-snapshot read
if (chunk.snapshotFile.present() && chunk.deltaFiles.empty() && chunk.newDeltas.empty()) {
return materializeJustSnapshot(chunk, snapshotData, requestRange, stats);
}
std::vector<Standalone<VectorRef<ParsedDeltaBoundaryRef>>> streams;
std::vector<bool> startClears;
// +1 for possible snapshot, +1 for possible memory deltas
streams.reserve(chunk.deltaFiles.size() + 2);
if (snapshotData.present()) {
stats.inputBytes += snapshotData.get().size();
ASSERT(chunk.snapshotFile.present());
Standalone<VectorRef<ParsedDeltaBoundaryRef>> snapshotRows =
loadSnapshotFile(chunk.snapshotFile.get().filename,
@ -1454,13 +1509,17 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
streams.push_back(snapshotRows);
startClears.push_back(false);
arena.dependsOn(streams.back().arena());
stats.snapshotRows += snapshotRows.size();
}
} else {
ASSERT(!chunk.snapshotFile.present());
}
if (BG_READ_DEBUG) {
fmt::print("Applying {} delta files\n", chunk.deltaFiles.size());
}
for (int deltaIdx = 0; deltaIdx < chunk.deltaFiles.size(); deltaIdx++) {
stats.inputBytes += deltaFileData[deltaIdx].size();
bool startClear = false;
auto deltaRows = loadChunkedDeltaFile(chunk.deltaFiles[deltaIdx].filename,
deltaFileData[deltaIdx],
@ -1480,6 +1539,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
fmt::print("Applying {} memory deltas\n", chunk.newDeltas.size());
}
if (!chunk.newDeltas.empty()) {
stats.inputBytes += chunk.newDeltas.expectedSize();
// TODO REMOVE validation
ASSERT(beginVersion <= chunk.newDeltas.front().version);
ASSERT(readVersion >= chunk.newDeltas.back().version);
@ -1491,7 +1551,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
}
}
return mergeDeltaStreams(chunk, streams, startClears);
return mergeDeltaStreams(chunk, streams, startClears, stats);
}
struct GranuleLoadFreeHandle : NonCopyable, ReferenceCounted<GranuleLoadFreeHandle> {
@ -1560,8 +1620,6 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
}
GranuleLoadIds loadIds[files.size()];
int64_t inputBytes = 0;
int64_t outputBytes = 0;
try {
// Kick off first file reads if parallelism > 1
@ -1586,7 +1644,6 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
if (!snapshotData.get().begin()) {
return ErrorOr<RangeResult>(blob_granule_file_load_error());
}
inputBytes += snapshotData.get().size();
}
// +1 to avoid UBSAN variable length array of size zero
@ -1599,16 +1656,11 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
if (!deltaData[i].begin()) {
return ErrorOr<RangeResult>(blob_granule_file_load_error());
}
inputBytes += deltaData[i].size();
}
inputBytes += files[chunkIdx].newDeltas.expectedSize();
// materialize rows from chunk
chunkRows =
materializeBlobGranule(files[chunkIdx], keyRange, beginVersion, readVersion, snapshotData, deltaData);
outputBytes += chunkRows.expectedSize();
chunkRows = materializeBlobGranule(
files[chunkIdx], keyRange, beginVersion, readVersion, snapshotData, deltaData, stats);
results.arena().dependsOn(chunkRows.arena());
results.append(results.arena(), chunkRows.begin(), chunkRows.size());
@ -1616,8 +1668,6 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
// free once done by forcing FreeHandles to trigger
loadIds[chunkIdx].freeHandles.clear();
}
stats.inputBytes = inputBytes;
stats.outputBytes = outputBytes;
return ErrorOr<RangeResult>(results);
} catch (Error& e) {
return ErrorOr<RangeResult>(e);
@ -2303,6 +2353,7 @@ void checkDeltaRead(const KeyValueGen& kvGen,
// expected answer
std::map<KeyRef, ValueRef> expectedData;
Version lastFileEndVersion = 0;
GranuleMaterializeStats stats;
fmt::print("Delta Read [{0} - {1}) @ {2} - {3}\n",
range.begin.printable(),
@ -2322,7 +2373,7 @@ void checkDeltaRead(const KeyValueGen& kvGen,
chunk.includedVersion = readVersion;
chunk.snapshotVersion = invalidVersion;
RangeResult actualData = materializeBlobGranule(chunk, range, beginVersion, readVersion, {}, serialized);
RangeResult actualData = materializeBlobGranule(chunk, range, beginVersion, readVersion, {}, serialized, stats);
if (expectedData.size() != actualData.size()) {
fmt::print("Expected Data {0}:\n", expectedData.size());
@ -2430,6 +2481,7 @@ void checkGranuleRead(const KeyValueGen& kvGen,
}
Version lastFileEndVersion = 0;
applyDeltasByVersion(deltaData, range, beginVersion, readVersion, lastFileEndVersion, expectedData);
GranuleMaterializeStats stats;
// actual answer
Standalone<BlobGranuleChunkRef> chunk;
@ -2477,7 +2529,8 @@ void checkGranuleRead(const KeyValueGen& kvGen,
if (beginVersion == 0) {
snapshotPtr = serializedSnapshot;
}
RangeResult actualData = materializeBlobGranule(chunk, range, beginVersion, readVersion, snapshotPtr, deltaPtrs);
RangeResult actualData =
materializeBlobGranule(chunk, range, beginVersion, readVersion, snapshotPtr, deltaPtrs, stats);
if (expectedData.size() != actualData.size()) {
fmt::print("Expected Size {0} != Actual Size {1}\n", expectedData.size(), actualData.size());
@ -2663,6 +2716,14 @@ struct CommonPrefixStats {
int totalKeys = 0;
int minKeySize = 1000000000;
int maxKeySize = 0;
int64_t logicalBytes = 0;
int64_t totalLogicalBytes = 0;
int deltas = 0;
int deltasSet = 0;
int deltasClear = 0;
int deltasNoOp = 0;
int deltasClearAfter = 0;
void addKey(const KeyRef& k) {
if (len == -1) {
@ -2677,7 +2738,38 @@ struct CommonPrefixStats {
maxKeySize = std::max(maxKeySize, k.size());
}
void addKeyValue(const KeyRef& k, const ValueRef& v) {
addKey(k);
logicalBytes += k.size();
logicalBytes += v.size();
}
void addBoundary(const ParsedDeltaBoundaryRef& d) {
addKey(d.key);
deltas++;
if (d.isSet()) {
deltasSet++;
logicalBytes += d.value.size();
} else if (d.isClear()) {
deltasClear++;
} else {
ASSERT(d.isNoOp());
deltasNoOp++;
}
if (d.clearAfter) {
deltasClearAfter++;
}
}
void doneFile() {
totalLogicalBytes += logicalBytes;
fmt::print("Logical Size: {0}\n", logicalBytes);
logicalBytes = 0;
}
Key done() {
doneFile();
ASSERT(len >= 0);
fmt::print("Common prefix: {0}\nCommon Prefix Length: {1}\nAverage Key Size: {2}\nMin Key Size: {3}, Max Key "
"Size: {4}\n",
@ -2686,11 +2778,21 @@ struct CommonPrefixStats {
totalKeySize / totalKeys,
minKeySize,
maxKeySize);
if (deltas > 0) {
fmt::print("Delta stats: {0} deltas, {1} sets, {2} clears, {3} noops, {4} clearAfters\n",
deltas,
deltasSet,
deltasClear,
deltasNoOp,
deltasClearAfter);
}
fmt::print("Logical Size: {0}\n", totalLogicalBytes);
return key.substr(0, len);
}
};
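CommonPrefixStats tracks the longest prefix shared by every key it has seen. A compact standalone sketch of that incremental computation (PrefixTracker is an illustrative stand-in):

#include <algorithm>
#include <cassert>
#include <string>

struct PrefixTracker {
    std::string first;
    int len = -1; // -1 until the first key is seen
    void addKey(const std::string& k) {
        if (len == -1) {
            first = k;
            len = (int)k.size();
        } else {
            // Shrink the shared length to however far this key still agrees.
            int common = 0;
            int limit = std::min<int>(len, (int)k.size());
            while (common < limit && first[common] == k[common])
                common++;
            len = common;
        }
    }
    std::string done() const { return first.substr(0, std::max(len, 0)); }
};

int main() {
    PrefixTracker t;
    t.addKey("user/123");
    t.addKey("user/124");
    t.addKey("user/2");
    assert(t.done() == "user/");
}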
FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filenames) {
FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filenames, bool newFormat) {
FileSet files;
CommonPrefixStats stats;
for (int i = 0; i < filenames.size(); i++) {
@ -2701,40 +2803,66 @@ FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filena
std::string fpath = basePath + filenames[i];
Value data = loadFileData(fpath);
Standalone<GranuleSnapshot> parsed;
if (!newFormat) {
Arena arena;
GranuleSnapshot file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleSnapshot>::value, file, arena);
parsed = Standalone<GranuleSnapshot>(file, arena);
fmt::print("Loaded {0} rows from snapshot file\n", parsed.size());
for (auto& it : parsed) {
stats.addKeyValue(it.key, it.value);
}
} else {
Standalone<VectorRef<ParsedDeltaBoundaryRef>> res = loadSnapshotFile(""_sr, data, normalKeys, {});
fmt::print("Loaded {0} rows from snapshot file\n", res.size());
for (auto& it : res) {
stats.addKeyValue(it.key, it.value);
}
}
fmt::print("Loaded {0} rows from snapshot file\n", parsed.size());
files.snapshotFile = { filenames[i], version, data, parsed };
for (auto& it : parsed) {
stats.addKey(it.key);
}
} else {
std::string fpath = basePath + filenames[i];
Value data = loadFileData(fpath);
if (!newFormat) {
Arena arena;
GranuleDeltas file;
ObjectReader dataReader(data.begin(), Unversioned());
dataReader.deserialize(FileIdentifierFor<GranuleDeltas>::value, file, arena);
Standalone<GranuleDeltas> parsed(file, arena);
fmt::print("Loaded {0} deltas from delta file\n", parsed.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
fmt::print("Loaded {0} deltas from delta file\n", parsed.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
for (auto& it : parsed) {
for (auto& it2 : it.mutations) {
stats.addKey(it2.param1);
if (it2.type == MutationRef::Type::ClearRange) {
stats.addKey(it2.param2);
for (auto& it : parsed) {
for (auto& it2 : it.mutations) {
stats.addKey(it2.param1);
if (it2.type == MutationRef::Type::ClearRange) {
stats.addKey(it2.param2);
}
}
}
} else {
bool startClear = false;
Standalone<VectorRef<ParsedDeltaBoundaryRef>> res =
loadChunkedDeltaFile(""_sr, data, normalKeys, 0, version, {}, startClear);
ASSERT(!startClear);
Standalone<GranuleDeltas> parsed;
fmt::print("Loaded {0} boundaries from delta file\n", res.size());
files.deltaFiles.push_back({ filenames[i], version, data, parsed });
for (auto& it : res) {
stats.addBoundary(it);
}
}
}
stats.doneFile();
}
files.commonPrefix = stats.done();
@ -2792,6 +2920,28 @@ std::pair<int64_t, double> doDeltaWriteBench(const Standalone<GranuleDeltas>& da
return { serializedBytes, elapsed };
}
void chunkFromFileSet(const FileSet& fileSet,
Standalone<BlobGranuleChunkRef>& chunk,
StringRef* deltaPtrs,
Version readVersion,
Optional<BlobGranuleCipherKeysCtx> keys,
int numDeltaFiles) {
size_t snapshotSize = std::get<3>(fileSet.snapshotFile).size();
chunk.snapshotFile =
BlobFilePointerRef(chunk.arena(), std::get<0>(fileSet.snapshotFile), 0, snapshotSize, snapshotSize, keys);
for (int i = 0; i < numDeltaFiles; i++) {
size_t deltaSize = std::get<3>(fileSet.deltaFiles[i]).size();
chunk.deltaFiles.emplace_back_deep(
chunk.arena(), std::get<0>(fileSet.deltaFiles[i]), 0, deltaSize, deltaSize, keys);
deltaPtrs[i] = std::get<2>(fileSet.deltaFiles[i]);
}
chunk.keyRange = fileSet.range;
chunk.includedVersion = readVersion;
chunk.snapshotVersion = std::get<1>(fileSet.snapshotFile);
}
FileSet rewriteChunkedFileSet(const FileSet& fileSet,
Optional<BlobGranuleCipherKeysCtx> keys,
Optional<CompressionFilter> compressionFilter) {
@ -2818,40 +2968,30 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet,
KeyRange readRange,
bool clearAllAtEnd,
Optional<BlobGranuleCipherKeysCtx> keys,
Optional<CompressionFilter> compressionFilter) {
int numDeltaFiles,
bool printStats = false) {
Version readVersion = std::get<1>(fileSet.deltaFiles.back());
Standalone<BlobGranuleChunkRef> chunk;
StringRef deltaPtrs[fileSet.deltaFiles.size()];
GranuleMaterializeStats stats;
ASSERT(numDeltaFiles >= 0 && numDeltaFiles <= fileSet.deltaFiles.size());
StringRef deltaPtrs[numDeltaFiles];
MutationRef clearAllAtEndMutation;
if (clearAllAtEnd) {
clearAllAtEndMutation = MutationRef(MutationRef::Type::ClearRange, readRange.begin, readRange.end);
}
if (chunked) {
size_t snapshotSize = std::get<3>(fileSet.snapshotFile).size();
chunk.snapshotFile =
BlobFilePointerRef(chunk.arena(), std::get<0>(fileSet.snapshotFile), 0, snapshotSize, snapshotSize, keys);
for (int i = 0; i < fileSet.deltaFiles.size(); i++) {
size_t deltaSize = std::get<3>(fileSet.deltaFiles[i]).size();
chunk.deltaFiles.emplace_back_deep(
chunk.arena(), std::get<0>(fileSet.deltaFiles[i]), 0, deltaSize, deltaSize, keys);
deltaPtrs[i] = std::get<2>(fileSet.deltaFiles[i]);
}
chunkFromFileSet(fileSet, chunk, deltaPtrs, readVersion, keys, numDeltaFiles);
if (clearAllAtEnd) {
readVersion++;
MutationsAndVersionRef lastDelta;
lastDelta.version = readVersion;
lastDelta.mutations.push_back(chunk.arena(), clearAllAtEndMutation);
chunk.includedVersion = readVersion;
chunk.newDeltas.push_back_deep(chunk.arena(), lastDelta);
}
chunk.keyRange = fileSet.range;
chunk.includedVersion = readVersion;
chunk.snapshotVersion = std::get<1>(fileSet.snapshotFile);
}
int64_t serializedBytes = 0;
@ -2875,14 +3015,26 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet,
}
serializedBytes += actualData.expectedSize();
} else {
RangeResult actualData =
materializeBlobGranule(chunk, readRange, 0, readVersion, std::get<2>(fileSet.snapshotFile), deltaPtrs);
RangeResult actualData = materializeBlobGranule(
chunk, readRange, 0, readVersion, std::get<2>(fileSet.snapshotFile), deltaPtrs, stats);
serializedBytes += actualData.expectedSize();
}
}
elapsed += timer_monotonic();
elapsed /= READ_RUNS;
serializedBytes /= READ_RUNS;
if (printStats) {
fmt::print("Materialize stats:\n");
fmt::print(" Input bytes: {0}\n", stats.inputBytes / READ_RUNS);
fmt::print(" Output bytes: {0}\n", stats.outputBytes / READ_RUNS);
fmt::print(" Write Amp: {0}\n", (1.0 * stats.inputBytes) / stats.outputBytes);
fmt::print(" Snapshot Rows: {0}\n", stats.snapshotRows / READ_RUNS);
fmt::print(" Rows Cleared: {0}\n", stats.rowsCleared / READ_RUNS);
fmt::print(" Rows Inserted: {0}\n", stats.rowsInserted / READ_RUNS);
fmt::print(" Rows Updated: {0}\n", stats.rowsUpdated / READ_RUNS);
}
return { serializedBytes, elapsed };
}
@ -2913,7 +3065,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
int64_t logicalSnapshotSize = 0;
int64_t logicalDeltaSize = 0;
for (auto& it : fileSetNames) {
FileSet fileSet = loadFileSet(basePath, it);
FileSet fileSet = loadFileSet(basePath, it, false);
fileSets.push_back(fileSet);
logicalSnapshotSize += std::get<3>(fileSet.snapshotFile).expectedSize();
for (auto& deltaFile : fileSet.deltaFiles) {
@ -2944,7 +3096,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
if (encrypt) {
name += "ENC";
}
if (compressionFilter.present()) {
if (compressionFilter.present() && compressionFilter.get() != CompressionFilter::NONE) {
name += "CMP";
}
if (name.empty()) {
@ -3000,9 +3152,16 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
std::vector<std::string> readRunNames = {};
std::vector<std::pair<int64_t, double>> readMetrics;
bool doEdgeCaseReadTests = true;
bool doEdgeCaseReadTests = false;
bool doVaryingDeltaTests = false;
std::vector<double> clearAllReadMetrics;
std::vector<double> readSingleKeyMetrics;
std::vector<std::vector<std::pair<int64_t, double>>> varyingDeltaMetrics;
size_t maxDeltaFiles = 100000;
for (auto& f : fileSets) {
maxDeltaFiles = std::min(maxDeltaFiles, f.deltaFiles.size());
}
for (bool chunk : chunkModes) {
for (bool encrypt : encryptionModes) {
@ -3025,7 +3184,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
if (encrypt) {
name += "ENC";
}
if (compressionFilter.present()) {
if (compressionFilter.present() && compressionFilter.get() != CompressionFilter::NONE) {
name += "CMP";
}
if (name.empty()) {
@ -3038,6 +3197,10 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
double totalElapsed = 0.0;
double totalElapsedClearAll = 0.0;
double totalElapsedSingleKey = 0.0;
std::vector<std::pair<int64_t, double>> varyingDeltas;
for (int i = 0; i <= maxDeltaFiles; i++) {
varyingDeltas.push_back({ 0, 0.0 });
}
for (auto& fileSet : fileSets) {
FileSet newFileSet;
if (!chunk) {
@ -3046,24 +3209,38 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
newFileSet = rewriteChunkedFileSet(fileSet, keys, compressionFilter);
}
auto res = doReadBench(newFileSet, chunk, fileSet.range, false, keys, compressionFilter);
auto res = doReadBench(newFileSet, chunk, fileSet.range, false, keys, newFileSet.deltaFiles.size());
totalBytesRead += res.first;
totalElapsed += res.second;
if (doEdgeCaseReadTests) {
totalElapsedClearAll +=
doReadBench(newFileSet, chunk, fileSet.range, true, keys, compressionFilter).second;
doReadBench(newFileSet, chunk, fileSet.range, true, keys, newFileSet.deltaFiles.size())
.second;
Key k = std::get<3>(fileSet.snapshotFile).front().key;
KeyRange singleKeyRange(KeyRangeRef(k, keyAfter(k)));
totalElapsedSingleKey +=
doReadBench(newFileSet, chunk, singleKeyRange, false, keys, compressionFilter).second;
doReadBench(newFileSet, chunk, singleKeyRange, false, keys, newFileSet.deltaFiles.size())
.second;
}
if (doVaryingDeltaTests && chunk) {
for (int i = 0; i <= maxDeltaFiles; i++) {
auto r = doReadBench(newFileSet, chunk, fileSet.range, false, keys, i);
varyingDeltas[i].first += r.first;
varyingDeltas[i].second += r.second;
}
}
}
readMetrics.push_back({ totalBytesRead, totalElapsed });
if (doEdgeCaseReadTests) {
clearAllReadMetrics.push_back(totalElapsedClearAll);
readSingleKeyMetrics.push_back(totalElapsedSingleKey);
}
if (doVaryingDeltaTests) {
varyingDeltaMetrics.push_back(varyingDeltas);
}
}
}
}
@ -3097,6 +3274,25 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
}
}
if (doVaryingDeltaTests) {
ASSERT(readRunNames.size() == varyingDeltaMetrics.size());
fmt::print("\n\nVarying Deltas Read Results:\nDF#\t");
for (int i = 0; i <= maxDeltaFiles; i++) {
fmt::print("{0}\t", i);
}
fmt::print("\n");
for (int i = 0; i < readRunNames.size(); i++) {
fmt::print("{0}", readRunNames[i]);
for (auto& it : varyingDeltaMetrics[i]) {
double MBperCPUsec = (it.first / 1024.0 / 1024.0) / it.second;
fmt::print("\t{:.6}", MBperCPUsec);
}
fmt::print("\n");
}
}
fmt::print("\n\nCombined Results:\n");
ASSERT(readRunNames.size() == runNames.size() - 1);
for (int i = 0; i < readRunNames.size(); i++) {
@ -3113,3 +3309,22 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") {
return Void();
}
TEST_CASE("!/blobgranule/files/repeatFromFiles") {
std::string basePath = "SET_ME";
std::vector<std::vector<std::string>> fileSetNames = { { "SET_ME" } };
int64_t totalBytesRead = 0;
double totalElapsed = 0.0;
for (auto& it : fileSetNames) {
FileSet fileSet = loadFileSet(basePath, it, true);
auto res = doReadBench(fileSet, true, fileSet.range, false, {}, fileSet.deltaFiles.size(), true);
totalBytesRead += res.first;
totalElapsed += res.second;
}
double MBperCPUsec = (totalBytesRead / 1024.0 / 1024.0) / totalElapsed;
fmt::print("Read Results: {:.6} MB/cpusec\n", MBperCPUsec);
return Void();
}

View File

@ -105,7 +105,9 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
arena.dependsOn(data.arena());
}
return materializeBlobGranule(chunk, keyRange, beginVersion, readVersion, snapshotData, deltaData);
// TODO do something useful with stats?
GranuleMaterializeStats stats;
return materializeBlobGranule(chunk, keyRange, beginVersion, readVersion, snapshotData, deltaData, stats);
} catch (Error& e) {
throw e;

View File

@ -1040,13 +1040,10 @@ private:
Key lastValue;
};
ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
Standalone<VectorRef<KeyValueRef>>* results,
bool encryptedBlock,
Optional<Database> cx) {
void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>* results) {
// Read begin key, if this fails then block was invalid.
state uint32_t kLen = reader->consumeNetworkUInt32();
state const uint8_t* k = reader->consume(kLen);
uint32_t kLen = reader->consumeNetworkUInt32();
const uint8_t* k = reader->consume(kLen);
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
// Read kv pairs and end key
@ -1075,7 +1072,6 @@ ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
for (auto b : reader->remainder())
if (b != 0xFF)
throw restore_corrupted_data_padding();
return Void();
}
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
@ -1083,7 +1079,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
int len,
Optional<Database> cx) {
state Standalone<StringRef> buf = makeString(len);
int rLen = wait(file->read(mutateString(buf), len, offset));
int rLen = wait(uncancellable(holdWhile(buf, file->read(mutateString(buf), len, offset))));
if (rLen != len)
throw restore_bad_read();
@ -1098,7 +1094,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
int32_t file_version = reader.consume<int32_t>();
if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
wait(decodeKVPairs(&reader, &results, false, cx));
decodeKVPairs(&reader, &results);
} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
CODE_PROBE(true, "decoding encrypted block");
ASSERT(cx.present());
@ -1121,7 +1117,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
StringRef decryptedData =
wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena()));
reader = StringRefReader(decryptedData, restore_corrupted_data());
wait(decodeKVPairs(&reader, &results, true, cx));
decodeKVPairs(&reader, &results);
} else {
throw restore_unsupported_file_version();
}
@ -1704,7 +1700,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
state std::unique_ptr<IRangeFileWriter> rangeFile;
state BackupConfig backup(task);
state Arena arena;
state Reference<TenantEntryCache<Void>> tenantCache = makeReference<TenantEntryCache<Void>>(cx);
state Reference<TenantEntryCache<Void>> tenantCache;
// Don't need to check keepRunning(task) here because we will do that while finishing each output file, but
// if bc is false then clearly the backup is no longer in progress
@ -1798,6 +1794,10 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
// Initialize range file writer and write begin key
if (encryptionEnabled) {
CODE_PROBE(true, "using encrypted snapshot file writer");
if (!tenantCache.isValid()) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);
wait(tenantCache->init());
}
rangeFile = std::make_unique<EncryptedRangeFileWriter>(cx, &arena, tenantCache, outFile, blockSize);
} else {
rangeFile = std::make_unique<RangeFileWriter>(outFile, blockSize);
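The hunk above swaps eager construction of the tenant cache for lazy construction on the first encrypted block, so unencrypted backups never pay for it. A minimal sketch of that pattern in plain C++ (shared_ptr stands in for Reference, and the types are illustrative):

#include <memory>

struct TenantEntryCache {
    void init() {} // one-time, possibly expensive, setup
};

std::shared_ptr<TenantEntryCache> tenantCache; // empty until first needed

void writeEncryptedBlock() {
    if (!tenantCache) { // first encrypted block on this task
        tenantCache = std::make_shared<TenantEntryCache>();
        tenantCache->init();
    }
    // ... use tenantCache ...
}

int main() {
    writeEncryptedBlock(); // constructs and initializes the cache
    writeEncryptedBlock(); // reuses it
}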

View File

@ -122,6 +122,7 @@ IdempotencyIdRef generate(Arena& arena) {
TEST_CASE("/fdbclient/IdempotencyId/basic") {
Arena arena;
uint16_t firstBatchIndex = deterministicRandom()->randomUInt32();
firstBatchIndex &= 0xff7f; // ensure firstBatchIndex+5 won't change the higher order byte
uint16_t batchIndex = firstBatchIndex;
Version commitVersion = deterministicRandom()->randomInt64(0, std::numeric_limits<Version>::max());
std::vector<IdempotencyIdRef> idVector; // Reference
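The new mask guarantees that bumping the batch index by a small amount can never carry into its high-order byte. A short exhaustive check of that arithmetic (the offset of 5 comes from the test's comment; the rest is illustrative):

#include <cassert>
#include <cstdint>

int main() {
    for (uint32_t r = 0; r <= 0xffff; r++) {
        uint16_t idx = static_cast<uint16_t>(r) & 0xff7f; // clear bit 7 of the low byte
        // The low byte is now at most 0x7f, so adding 5 cannot carry:
        // the high-order byte of idx + 5 matches that of idx.
        assert(((idx + 5) & 0xff00) == (idx & 0xff00));
    }
    return 0;
}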

View File

@ -40,6 +40,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/MultiInterface.h"
#include "fdbrpc/TenantInfo.h"
#include "fdbclient/ActorLineageProfiler.h"
#include "fdbclient/AnnotateActor.h"
@ -66,6 +67,7 @@
#include "fdbclient/SpecialKeySpace.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/Tenant.h"
#include "fdbclient/TenantSpecialKeys.actor.h"
#include "fdbclient/TransactionLineage.h"
#include "fdbclient/versions.h"
@ -687,25 +689,8 @@ ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
.detail("MedianBytesPerCommit", cx->bytesPerCommit.median())
.detail("MaxBytesPerCommit", cx->bytesPerCommit.max())
.detail("NumLocalityCacheEntries", cx->locationCache.size());
if (cx->anyBlobGranuleRequests) {
ev.detail("MeanBGLatency", cx->bgLatencies.mean())
.detail("MedianBGLatency", cx->bgLatencies.median())
.detail("MaxBGLatency", cx->bgLatencies.max())
.detail("MeanBGGranulesPerRequest", cx->bgGranulesPerRequest.mean())
.detail("MedianBGGranulesPerRequest", cx->bgGranulesPerRequest.median())
.detail("MaxBGGranulesPerRequest", cx->bgGranulesPerRequest.max());
}
}
cx->latencies.clear();
cx->readLatencies.clear();
cx->GRVLatencies.clear();
cx->commitLatencies.clear();
cx->mutationsPerCommit.clear();
cx->bytesPerCommit.clear();
cx->bgLatencies.clear();
cx->bgGranulesPerRequest.clear();
if (cx->usedAnyChangeFeeds && logTraces) {
TraceEvent feedEv("ChangeFeedClientMetrics", cx->dbId);
@ -719,6 +704,37 @@ ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
cx->ccFeed.logToTraceEvent(feedEv);
}
if (cx->anyBGReads && logTraces) {
TraceEvent bgReadEv("BlobGranuleReadMetrics", cx->dbId);
bgReadEv.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged)
.detail("Cluster",
cx->getConnectionRecord()
? cx->getConnectionRecord()->getConnectionString().clusterKeyName().toString()
: "")
.detail("Internal", cx->internal);
// add counters
cx->ccBG.logToTraceEvent(bgReadEv);
// add latencies
bgReadEv.detail("MeanBGLatency", cx->bgLatencies.mean())
.detail("MedianBGLatency", cx->bgLatencies.median())
.detail("MaxBGLatency", cx->bgLatencies.max())
.detail("MeanBGGranulesPerRequest", cx->bgGranulesPerRequest.mean())
.detail("MedianBGGranulesPerRequest", cx->bgGranulesPerRequest.median())
.detail("MaxBGGranulesPerRequest", cx->bgGranulesPerRequest.max());
}
cx->latencies.clear();
cx->readLatencies.clear();
cx->GRVLatencies.clear();
cx->commitLatencies.clear();
cx->mutationsPerCommit.clear();
cx->bytesPerCommit.clear();
cx->bgLatencies.clear();
cx->bgGranulesPerRequest.clear();
lastLogged = now();
}
}
@ -1524,17 +1540,21 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<IClusterConnection
transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc),
transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), bgReadInputBytes("BGReadInputBytes", cc),
bgReadOutputBytes("BGReadOutputBytes", cc), usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"),
feedStreamStarts("FeedStreamStarts", ccFeed), feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed),
feedErrors("FeedErrors", ccFeed), feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed),
feedPops("FeedPops", ccFeed), feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000),
commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000),
bgGranulesPerRequest(1000), outstandingWatches(0), sharedStatePtr(nullptr), lastGrvTime(0.0), cachedReadVersion(0),
lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0),
transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor),
coordinator(coordinator), apiVersion(_apiVersion), mvCacheInsertLocation(0), healthMetricsLastUpdated(0),
detailedHealthMetricsLastUpdated(0), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), anyBGReads(false),
ccBG("BlobGranuleReadMetrics"), bgReadInputBytes("BGReadInputBytes", ccBG),
bgReadOutputBytes("BGReadOutputBytes", ccBG), bgReadSnapshotRows("BGReadSnapshotRows", ccBG),
bgReadRowsCleared("BGReadRowsCleared", ccBG), bgReadRowsInserted("BGReadRowsInserted", ccBG),
bgReadRowsUpdated("BGReadRowsUpdated", ccBG), bgLatencies(1000), bgGranulesPerRequest(1000),
usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"), feedStreamStarts("FeedStreamStarts", ccFeed),
feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed), feedErrors("FeedErrors", ccFeed),
feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed), feedPops("FeedPops", ccFeed),
feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000), commitLatencies(1000),
GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), outstandingWatches(0), sharedStatePtr(nullptr),
lastGrvTime(0.0), cachedReadVersion(0), lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0),
lastProxyRequestTime(0.0), transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo),
clientInfoMonitor(clientInfoMonitor), coordinator(coordinator), apiVersion(_apiVersion), mvCacheInsertLocation(0),
healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0),
smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
specialKeySpace(std::make_unique<SpecialKeySpace>(specialKeys.begin, specialKeys.end, /* test */ false)),
connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {
@ -1824,14 +1844,17 @@ DatabaseContext::DatabaseContext(const Error& err)
transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc),
transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), bgReadInputBytes("BGReadInputBytes", cc),
bgReadOutputBytes("BGReadOutputBytes", cc), usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"),
feedStreamStarts("FeedStreamStarts", ccFeed), feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed),
feedErrors("FeedErrors", ccFeed), feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed),
feedPops("FeedPops", ccFeed), feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000),
commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000),
bgGranulesPerRequest(1000), sharedStatePtr(nullptr), transactionTracingSample(false),
smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), anyBGReads(false),
ccBG("BlobGranuleReadMetrics"), bgReadInputBytes("BGReadInputBytes", ccBG),
bgReadOutputBytes("BGReadOutputBytes", ccBG), bgReadSnapshotRows("BGReadSnapshotRows", ccBG),
bgReadRowsCleared("BGReadRowsCleared", ccBG), bgReadRowsInserted("BGReadRowsInserted", ccBG),
bgReadRowsUpdated("BGReadRowsUpdated", ccBG), bgLatencies(1000), bgGranulesPerRequest(1000),
usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"), feedStreamStarts("FeedStreamStarts", ccFeed),
feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed), feedErrors("FeedErrors", ccFeed),
feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed), feedPops("FeedPops", ccFeed),
feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000), commitLatencies(1000),
GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), sharedStatePtr(nullptr),
transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {}
// Static constructor used by server processes to create a DatabaseContext
@ -6224,7 +6247,7 @@ ACTOR Future<Optional<ClientTrCommitCostEstimation>> estimateCommitCosts(Referen
trCommitCosts.opsCount++;
keyRange = KeyRangeRef(it->param1, it->param2);
if (trState->options.expensiveClearCostEstimation) {
StorageMetrics m = wait(trState->cx->getStorageMetrics(keyRange, CLIENT_KNOBS->TOO_MANY));
StorageMetrics m = wait(trState->cx->getStorageMetrics(keyRange, CLIENT_KNOBS->TOO_MANY, trState));
trCommitCosts.clearIdxCosts.emplace_back(i, getWriteOperationCost(m.bytes));
trCommitCosts.writeCosts += getWriteOperationCost(m.bytes);
++trCommitCosts.expensiveCostEstCount;
@ -7497,34 +7520,45 @@ Future<Void> Transaction::onError(Error const& e) {
return e;
}
ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRange keys);
ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx,
KeyRange keys,
Optional<Reference<TransactionState>> trState);
ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx, KeyRange keys, Reference<LocationInfo> locationInfo) {
loop {
try {
WaitMetricsRequest req(keys, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
req.max.bytes = -1;
StorageMetrics m = wait(loadBalance(
locationInfo->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution));
return m;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
cx->invalidateCache(Key(), keys);
StorageMetrics m = wait(getStorageMetricsLargeKeyRange(cx, keys));
return m;
ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx,
KeyRange keys,
Reference<LocationInfo> locationInfo,
TenantMapEntry tenantEntry,
Optional<Reference<TransactionState>> trState) {
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
try {
WaitMetricsRequest req(tenantInfo, keys, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
req.max.bytes = -1;
StorageMetrics m = wait(loadBalance(
locationInfo->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution));
return m;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
cx->invalidateCache(tenantEntry.prefix, keys);
StorageMetrics m = wait(getStorageMetricsLargeKeyRange(cx, keys, trState));
return m;
}
}
ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRange keys) {
ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx,
KeyRange keys,
Optional<Reference<TransactionState>> trState) {
state Span span("NAPI:GetStorageMetricsLargeKeyRange"_loc);
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
std::vector<KeyRangeLocationInfo> locations = wait(getKeyRangeLocations(cx,
TenantInfo(),
tenantInfo,
keys,
std::numeric_limits<int>::max(),
Reverse::False,
@ -7540,7 +7574,8 @@ ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRang
for (int i = 0; i < nLocs; i++) {
partBegin = (i == 0) ? keys.begin : locations[i].range.begin;
partEnd = (i == nLocs - 1) ? keys.end : locations[i].range.end;
fx[i] = doGetStorageMetrics(cx, KeyRangeRef(partBegin, partEnd), locations[i].locations);
fx[i] = doGetStorageMetrics(
cx, KeyRangeRef(partBegin, partEnd), locations[i].locations, locations[i].tenantEntry, trState);
}
wait(waitForAll(fx));
for (int i = 0; i < nLocs; i++) {
@ -7549,14 +7584,15 @@ ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx, KeyRang
return total;
}
ACTOR Future<Void> trackBoundedStorageMetrics(KeyRange keys,
ACTOR Future<Void> trackBoundedStorageMetrics(TenantInfo tenantInfo,
KeyRange keys,
Reference<LocationInfo> location,
StorageMetrics x,
StorageMetrics halfError,
PromiseStream<StorageMetrics> deltaStream) {
try {
loop {
WaitMetricsRequest req(keys, x - halfError, x + halfError);
WaitMetricsRequest req(tenantInfo, keys, x - halfError, x + halfError);
StorageMetrics nextX = wait(loadBalance(location->locations(), &StorageServerInterface::waitMetrics, req));
deltaStream.send(nextX - x);
x = nextX;
@ -7567,7 +7603,8 @@ ACTOR Future<Void> trackBoundedStorageMetrics(KeyRange keys,
}
}
ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(std::vector<KeyRangeLocationInfo> locations,
ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(TenantInfo tenantInfo,
std::vector<KeyRangeLocationInfo> locations,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError) {
@ -7581,7 +7618,7 @@ ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(std::vector<Key
state StorageMetrics minMinus = min - halfErrorPerMachine * (nLocs - 1);
for (int i = 0; i < nLocs; i++) {
WaitMetricsRequest req(locations[i].range, StorageMetrics(), StorageMetrics());
WaitMetricsRequest req(tenantInfo, locations[i].range, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
req.max.bytes = -1;
fx[i] = loadBalance(locations[i].locations->locations(),
@ -7602,7 +7639,7 @@ ACTOR Future<StorageMetrics> waitStorageMetricsMultipleLocations(std::vector<Key
for (int i = 0; i < nLocs; i++)
wx[i] = trackBoundedStorageMetrics(
locations[i].range, locations[i].locations, fx[i].get(), halfErrorPerMachine, deltas);
tenantInfo, locations[i].range, locations[i].locations, fx[i].get(), halfErrorPerMachine, deltas);
loop {
StorageMetrics delta = waitNext(deltas.getFuture());
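waitStorageMetricsMultipleLocations divides the permitted error across shards as halfErrorPerMachine so the summed per-shard drift stays inside the overall budget. A small numeric sanity check of that split (the concrete numbers are invented for illustration):

#include <cassert>

int main() {
    const double permittedError = 100.0; // overall allowed drift, e.g. bytes
    const int nLocs = 5; // shards the key range spans
    const double halfErrorPerMachine = 0.5 * permittedError / nLocs;
    // Even if every shard drifts by its full per-machine allowance, the
    // total stays within half the overall budget.
    assert(nLocs * halfErrorPerMachine <= 0.5 * permittedError + 1e-9);
    return 0;
}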
@ -7687,25 +7724,30 @@ ACTOR Future<Standalone<VectorRef<ReadHotRangeWithMetrics>>> getReadHotRanges(Da
}
}
ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(Database cx,
KeyRange keys,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError,
int shardLimit,
int expectedShardCount) {
ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
Database cx,
KeyRange keys,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError,
int shardLimit,
int expectedShardCount,
Optional<Reference<TransactionState>> trState) {
state Span span("NAPI:WaitStorageMetrics"_loc, generateSpanID(cx->transactionTracingSample));
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
loop {
std::vector<KeyRangeLocationInfo> locations = wait(getKeyRangeLocations(cx,
TenantInfo(),
keys,
shardLimit,
Reverse::False,
&StorageServerInterface::waitMetrics,
span.context,
Optional<UID>(),
UseProvisionalProxies::False,
latestVersion));
state std::vector<KeyRangeLocationInfo> locations =
wait(getKeyRangeLocations(cx,
tenantInfo,
keys,
shardLimit,
Reverse::False,
&StorageServerInterface::waitMetrics,
span.context,
Optional<UID>(),
UseProvisionalProxies::False,
latestVersion));
if (expectedShardCount >= 0 && locations.size() != expectedShardCount) {
return std::make_pair(Optional<StorageMetrics>(), locations.size());
}
@ -7716,9 +7758,9 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(Databa
try {
Future<StorageMetrics> fx;
if (locations.size() > 1) {
fx = waitStorageMetricsMultipleLocations(locations, min, max, permittedError);
fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
} else {
WaitMetricsRequest req(keys, min, max);
WaitMetricsRequest req(tenantInfo, keys, min, max);
fx = loadBalance(locations[0].locations->locations(),
&StorageServerInterface::waitMetrics,
req,
@ -7731,7 +7773,7 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(Databa
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
cx->invalidateCache(Key(), keys);
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
} else {
@ -7741,7 +7783,7 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(Databa
.detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY);
wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
// make sure that the next getKeyRangeLocations() call will actually re-fetch the range
cx->invalidateCache(Key(), keys);
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
}
}
}
@ -7752,17 +7794,21 @@ Future<std::pair<Optional<StorageMetrics>, int>> DatabaseContext::waitStorageMet
StorageMetrics const& max,
StorageMetrics const& permittedError,
int shardLimit,
int expectedShardCount) {
int expectedShardCount,
Optional<Reference<TransactionState>> trState) {
return ::waitStorageMetrics(Database(Reference<DatabaseContext>::addRef(this)),
keys,
min,
max,
permittedError,
shardLimit,
expectedShardCount);
expectedShardCount,
trState);
}
Future<StorageMetrics> DatabaseContext::getStorageMetrics(KeyRange const& keys, int shardLimit) {
Future<StorageMetrics> DatabaseContext::getStorageMetrics(KeyRange const& keys,
int shardLimit,
Optional<Reference<TransactionState>> trState) {
if (shardLimit > 0) {
StorageMetrics m;
m.bytes = -1;
@ -7772,9 +7818,10 @@ Future<StorageMetrics> DatabaseContext::getStorageMetrics(KeyRange const& keys,
m,
StorageMetrics(),
shardLimit,
-1));
-1,
trState));
} else {
return ::getStorageMetricsLargeKeyRange(Database(Reference<DatabaseContext>::addRef(this)), keys);
return ::getStorageMetricsLargeKeyRange(Database(Reference<DatabaseContext>::addRef(this)), keys, trState);
}
}
@ -8062,8 +8109,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
}
if (blobGranuleMapping.more) {
if (BG_REQUEST_DEBUG) {
fmt::print(
"BG Mapping for [{0} - %{1}) too large!\n", keyRange.begin.printable(), keyRange.end.printable());
fmt::print("BG Mapping for [{0} - {1}) too large!\n", keyRange.begin.printable(), keyRange.end.printable());
}
TraceEvent(SevWarn, "BGMappingTooLarge")
.detail("Range", range)
@ -8276,7 +8322,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
}
}
self->trState->cx->anyBlobGranuleRequests = true;
self->trState->cx->anyBGReads = true;
self->trState->cx->bgGranulesPerRequest.addSample(results.size());
self->trState->cx->bgLatencies.addSample(now() - startTime);
@ -8318,8 +8364,13 @@ Transaction::summarizeBlobGranules(const KeyRange& range, Optional<Version> summ
}
void Transaction::addGranuleMaterializeStats(const GranuleMaterializeStats& stats) {
trState->cx->anyBGReads = true;
trState->cx->bgReadInputBytes += stats.inputBytes;
trState->cx->bgReadOutputBytes += stats.outputBytes;
trState->cx->bgReadSnapshotRows += stats.snapshotRows;
trState->cx->bgReadRowsCleared += stats.rowsCleared;
trState->cx->bgReadRowsInserted += stats.rowsInserted;
trState->cx->bgReadRowsUpdated += stats.rowsUpdated;
}
ACTOR Future<Version> setPerpetualStorageWiggle(Database cx, bool enable, LockAware lockAware) {

View File

@ -1770,7 +1770,10 @@ Future<int64_t> ReadYourWritesTransaction::getEstimatedRangeSizeBytes(const KeyR
if (resetPromise.isSet())
return resetPromise.getFuture().getError();
return map(waitOrError(tr.getDatabase()->getStorageMetrics(keys, -1), resetPromise.getFuture()),
// Pass in the TransactionState only if tenant is present
Optional<Reference<TransactionState>> trState =
tr.trState->hasTenant() ? tr.trState : Optional<Reference<TransactionState>>();
return map(waitOrError(tr.getDatabase()->getStorageMetrics(keys, -1, trState), resetPromise.getFuture()),
[](const StorageMetrics& m) { return m.bytes; });
}

View File

@ -582,7 +582,8 @@ const KeyRef JSONSchemas::statusSchema = R"statusSchema(
"duplicate_mutation_fetch_timeout",
"primary_dc_missing",
"fetch_primary_dc_timeout",
"fetch_storage_wiggler_stats_timeout"
"fetch_storage_wiggler_stats_timeout",
"fetch_consistency_scan_info_timeout"
]
},
"issues":[

View File

@ -39,11 +39,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ENABLE_VERSION_VECTOR, false );
init( ENABLE_VERSION_VECTOR_TLOG_UNICAST, false );
bool buggifyShortReadWindow = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR;
init( MAX_READ_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_READ_TRANSACTION_LIFE_VERSIONS = VERSIONS_PER_SECOND; else if (buggifyShortReadWindow) MAX_READ_TRANSACTION_LIFE_VERSIONS = std::max<int>(1, 0.1 * VERSIONS_PER_SECOND); else if( randomize && BUGGIFY ) MAX_READ_TRANSACTION_LIFE_VERSIONS = 10 * VERSIONS_PER_SECOND;
init( MAX_WRITE_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_WRITE_TRANSACTION_LIFE_VERSIONS=std::max<int>(1, 1 * VERSIONS_PER_SECOND);
init( MAX_COMMIT_BATCH_INTERVAL, 2.0 ); if( randomize && BUGGIFY ) MAX_COMMIT_BATCH_INTERVAL = 0.5; // Each commit proxy generates a CommitTransactionBatchRequest at least this often, so that versions always advance smoothly
MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_READ_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_READ_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough
MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_WRITE_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_WRITE_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough
init( MAX_VERSION_RATE_MODIFIER, 0.1 );
init( MAX_VERSION_RATE_OFFSET, VERSIONS_PER_SECOND ); // If the calculated version is more than this amount away from the expected version, it will be clamped to this value. This prevents huge version jumps.
init( ENABLE_VERSION_VECTOR_HA_OPTIMIZATION, false );
@ -296,7 +297,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120;
init( DD_TENANT_AWARENESS_ENABLED, false );
init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
// TeamRemover
init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
@ -420,6 +421,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Enable this knob only for experimental purposes, never enable this in production.
// If enabled, all the committed in-memory memtable writes are lost on a crash.
init( ROCKSDB_DISABLE_WAL_EXPERIMENTAL, false );
// If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable ENABLE_CLEAR_RANGE_EAGER_READS knob.
// These knobs have contrary functionality.
init( ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE, false ); if( randomize && BUGGIFY ) ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE = deterministicRandom()->coinflip() ? false : true;
init( ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, 200000 ); // 200KB
// Can commit will delay ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD seconds for
// ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD times, if rocksdb overloaded.
// Set ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD to 0, to disable
@ -787,7 +792,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1;
init( CHANGEFEEDSTREAM_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) CHANGEFEEDSTREAM_LIMIT_BYTES = 1;
init( BLOBWORKERSTATUSSTREAM_LIMIT_BYTES, 1e4 ); if( randomize && BUGGIFY ) BLOBWORKERSTATUSSTREAM_LIMIT_BYTES = 1;
init( ENABLE_CLEAR_RANGE_EAGER_READS, true );
init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip() ? false : true;
init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 );
init( QUICK_GET_VALUE_FALLBACK, true );
init( QUICK_GET_KEY_VALUES_FALLBACK, true );

View File

@ -145,13 +145,13 @@ Value ThrottleApi::TagQuotaValue::toValue() const {
ThrottleApi::TagQuotaValue ThrottleApi::TagQuotaValue::fromValue(ValueRef value) {
auto tuple = Tuple::unpack(value);
if (tuple.size() != 4) {
if (tuple.size() != 2) {
throw invalid_throttle_quota_value();
}
TagQuotaValue result;
try {
result.reservedQuota = tuple.getDouble(0);
result.totalQuota = tuple.getDouble(1);
result.reservedQuota = tuple.getInt(0);
result.totalQuota = tuple.getInt(1);
} catch (Error& e) {
TraceEvent(SevWarnAlways, "TagQuotaValueFailedToDeserialize").error(e);
throw invalid_throttle_quota_value();
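The fix above tightens the decoder to the actual on-disk shape: exactly two integer fields, reserved quota then total quota. A standalone sketch of that validation (a plain vector stands in for the FDB Tuple codec; names are illustrative):

#include <cstdint>
#include <stdexcept>
#include <vector>

struct TagQuotaValue {
    int64_t reservedQuota = 0;
    int64_t totalQuota = 0;
};

TagQuotaValue fromFields(const std::vector<int64_t>& fields) {
    if (fields.size() != 2) // previously checked against 4, rejecting valid values
        throw std::runtime_error("invalid_throttle_quota_value");
    TagQuotaValue v;
    v.reservedQuota = fields[0];
    v.totalQuota = fields[1];
    return v;
}

int main() {
    TagQuotaValue q = fromFields({ 16384, 32768 }); // reserved, then total
    (void)q;
    return 0;
}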

View File

@ -56,10 +56,18 @@ struct GranuleDeltas : VectorRef<MutationsAndVersionRef> {
};
struct GranuleMaterializeStats {
// file-level stats
int64_t inputBytes;
int64_t outputBytes;
GranuleMaterializeStats() : inputBytes(0), outputBytes(0) {}
// merge stats
int32_t snapshotRows;
int32_t rowsCleared;
int32_t rowsInserted;
int32_t rowsUpdated;
GranuleMaterializeStats()
: inputBytes(0), outputBytes(0), snapshotRows(0), rowsCleared(0), rowsInserted(0), rowsUpdated(0) {}
};
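A minimal usage sketch of the widened struct: default-construct one per read, let materialization fill it, then fold it into client-wide counters the way Transaction::addGranuleMaterializeStats does (ClientCounters is an illustrative stand-in for the DatabaseContext counters):

#include <cstdint>

struct GranuleMaterializeStatsSketch {
    int64_t inputBytes = 0, outputBytes = 0;
    int32_t snapshotRows = 0, rowsCleared = 0, rowsInserted = 0, rowsUpdated = 0;
};

struct ClientCounters {
    int64_t bgReadInputBytes = 0, bgReadOutputBytes = 0;
    void add(const GranuleMaterializeStatsSketch& s) {
        bgReadInputBytes += s.inputBytes;
        bgReadOutputBytes += s.outputBytes;
    }
};

int main() {
    GranuleMaterializeStatsSketch s;
    s.inputBytes = 1 << 20; // filled in by materializeBlobGranule in the real code
    s.outputBytes = 256 << 10;
    ClientCounters cc;
    cc.add(s); // input/output here gives a 4x materialization amplification
    return 0;
}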
struct BlobGranuleCipherKeysMeta {

View File

@ -51,7 +51,8 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[]);
StringRef deltaFileData[],
GranuleMaterializeStats& stats);
std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, std::string suffix);

View File

@ -62,8 +62,8 @@ struct BlobMetadataDetailsRef {
BlobMetadataDomainNameRef domainName,
Optional<StringRef> base,
VectorRef<StringRef> partitions,
int64_t refreshAt,
int64_t expireAt)
double refreshAt,
double expireAt)
: domainId(domainId), domainName(ar, domainName), partitions(ar, partitions), refreshAt(refreshAt),
expireAt(expireAt) {
if (base.present()) {

View File

@ -298,13 +298,19 @@ public:
Future<Void> onProxiesChanged() const;
Future<HealthMetrics> getHealthMetrics(bool detailed);
// Pass a negative value for `shardLimit` to indicate no limit on the shard number.
Future<StorageMetrics> getStorageMetrics(KeyRange const& keys, int shardLimit);
Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(KeyRange const& keys,
StorageMetrics const& min,
StorageMetrics const& max,
StorageMetrics const& permittedError,
int shardLimit,
int expectedShardCount);
// Pass a valid `trState` with `hasTenant() == true` to make the function tenant-aware.
Future<StorageMetrics> getStorageMetrics(
KeyRange const& keys,
int shardLimit,
Optional<Reference<TransactionState>> trState = Optional<Reference<TransactionState>>());
Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
KeyRange const& keys,
StorageMetrics const& min,
StorageMetrics const& max,
StorageMetrics const& permittedError,
int shardLimit,
int expectedShardCount,
Optional<Reference<TransactionState>> trState = Optional<Reference<TransactionState>>());
Future<Void> splitStorageMetricsStream(PromiseStream<Key> const& resultsStream,
KeyRange const& keys,
StorageMetrics const& limit,
@ -548,8 +554,17 @@ public:
Counter transactionGrvFullBatches;
Counter transactionGrvTimedOutBatches;
Counter transactionCommitVersionNotFoundForSS;
// Blob Granule Read metrics. Omit from logging if not used.
bool anyBGReads;
CounterCollection ccBG;
Counter bgReadInputBytes;
Counter bgReadOutputBytes;
Counter bgReadSnapshotRows;
Counter bgReadRowsCleared;
Counter bgReadRowsInserted;
Counter bgReadRowsUpdated;
ContinuousSample<double> bgLatencies, bgGranulesPerRequest;
// Change Feed metrics. Omit change feed metrics from logging if not used
bool usedAnyChangeFeeds;
@ -562,7 +577,7 @@ public:
Counter feedPopsFallback;
ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit,
bytesPerCommit, bgLatencies, bgGranulesPerRequest;
bytesPerCommit;
int outstandingWatches;
int maxOutstandingWatches;
@ -591,7 +606,6 @@ public:
bool transactionTracingSample;
double verifyCausalReadsProp = 0.0;
bool blobGranuleNoMaterialize = false;
bool anyBlobGranuleRequests = false;
Future<Void> logger;
Future<Void> throttleExpirer;

View File

@ -1402,6 +1402,25 @@ struct TenantMode {
serializer(ar, mode);
}
// This does not round-trip cleanly with toString:
// the '_experimental' suffix, if present, must be removed before the mode can be parsed.
static TenantMode fromString(std::string mode) {
if (mode.find("_experimental") != std::string::npos) {
mode.replace(mode.find("_experimental"), std::string::npos, "");
}
if (mode == "disabled") {
return TenantMode::DISABLED;
} else if (mode == "optional") {
return TenantMode::OPTIONAL_TENANT;
} else if (mode == "required") {
return TenantMode::REQUIRED;
} else {
TraceEvent(SevError, "UnknownTenantMode").detail("TenantMode", mode);
ASSERT(false);
throw internal_error();
}
}
std::string toString() const {
switch (mode) {
case DISABLED:
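A self-contained sketch of the suffix handling fromString performs before matching the base mode name (stripExperimental is an illustrative helper, not FDB API):

#include <cassert>
#include <string>

std::string stripExperimental(std::string mode) {
    auto pos = mode.find("_experimental");
    if (pos != std::string::npos)
        mode.erase(pos); // "optional_experimental" -> "optional"
    return mode;
}

int main() {
    assert(stripExperimental("optional_experimental") == "optional");
    assert(stripExperimental("required") == "required");
    return 0;
}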
@ -1686,10 +1705,20 @@ struct Versionstamp {
serializer(ar, beVersion, beBatch);
if constexpr (Ar::isDeserializing) {
version = bigEndian64(version);
version = bigEndian64(beVersion);
batchNumber = bigEndian16(beBatch);
}
}
};
template <class Ar>
inline void save(Ar& ar, const Versionstamp& value) {
return const_cast<Versionstamp&>(value).serialize(ar);
}
template <class Ar>
inline void load(Ar& ar, Versionstamp& value) {
value.serialize(ar);
}
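The serializer fix converts from the freshly read wire field beVersion instead of the still-unset host-order member. A standalone round trip showing the intended byte-order handling (__builtin_bswap64 assumes GCC/Clang on a little-endian host; all names illustrative):

#include <cassert>
#include <cstdint>

uint64_t bigEndian64(uint64_t v) {
    return __builtin_bswap64(v); // host <-> big-endian on a little-endian machine
}

struct VersionstampSketch {
    uint64_t version = 0;
    uint16_t batchNumber = 0;
};

int main() {
    VersionstampSketch in{ 0x0123456789abcdefULL, 7 };
    // "Serialize": write the big-endian wire field.
    uint64_t beVersion = bigEndian64(in.version);
    // "Deserialize": convert from beVersion, not from the member itself.
    VersionstampSketch out;
    out.version = bigEndian64(beVersion);
    out.batchNumber = in.batchNumber;
    assert(out.version == in.version);
    return 0;
}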
#endif

View File

@ -104,6 +104,11 @@ Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>> getL
// Collect cached cipher keys.
for (auto& domain : domains) {
if (domain.first == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(domain.second == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (domain.first == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(domain.second == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
Reference<BlobCipherKey> cachedCipherKey = cipherKeyCache->getLatestCipherKey(domain.first /*domainId*/);
if (cachedCipherKey.isValid()) {
cipherKeys[domain.first] = cachedCipherKey;
@ -301,7 +306,7 @@ template <class T>
Future<TextAndHeaderCipherKeys> getLatestSystemEncryptCipherKeys(const Reference<AsyncVar<T> const>& db,
BlobCipherMetrics::UsageType usageType) {
return getLatestEncryptCipherKeysForDomain(
db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME, usageType);
db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME, usageType);
}
ACTOR template <class T>

View File

@ -319,6 +319,11 @@ public:
tr->clear(key);
}
template <class Transaction>
Future<Void> watch(Transaction tr) {
return tr->watch(key);
}
Key key;
};
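The watch() helper added above follows FDB's standard watch pattern: register the watch inside a transaction, commit, then wait on the returned future. A hedged sketch of a typical driver loop (the same shape appears in TenantEntryCache later in this diff):

ACTOR Future<Void> waitForTenantMetadataChange(Database db) {
	state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
	loop {
		try {
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			state Future<Void> change = TenantMetadata::lastTenantModification().watch(tr);
			wait(tr->commit()); // the watch is registered when the transaction commits
			wait(change); // fires once the watched key is modified
			return Void();
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
}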

View File

@ -502,6 +502,7 @@ Future<Void> decommissionMetacluster(Reference<DB> db) {
ManagementClusterMetadata::tenantMetadata().lastTenantId.clear(tr);
ManagementClusterMetadata::tenantMetadata().tenantTombstones.clear(tr);
ManagementClusterMetadata::tenantMetadata().tombstoneCleanupData.clear(tr);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.clear(tr);
wait(managementClusterCheckEmpty(tr));
MetaclusterMetadata::metaclusterRegistration().clear(tr);
@ -797,6 +798,7 @@ struct RemoveClusterImpl {
ASSERT(entry.getString(0) == self->ctx.clusterName.get());
ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, entry.getString(1));
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, entry.getInt(2));
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
}
// Erase all of the tenants processed in this transaction from the cluster tenant index
@ -1262,6 +1264,7 @@ struct CreateTenantImpl {
self->tenantEntry.tenantState = TenantState::REGISTERING;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->tenantEntry);
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantEntry.id, self->tenantName);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue);
ManagementClusterMetadata::clusterTenantCount.atomicOp(
@ -1317,6 +1320,7 @@ struct CreateTenantImpl {
TenantMapEntry updatedEntry = managementEntry.get();
updatedEntry.tenantState = TenantState::READY;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
}
return Void();
@ -1446,6 +1450,7 @@ struct DeleteTenantImpl {
}
updatedEntry.tenantState = TenantState::REMOVING;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
// If this has a rename pair, also mark the other entry for deletion
if (self->pairName.present()) {
state Optional<TenantMapEntry> pairEntry = wait(tryGetTenantTransaction(tr, self->pairName.get()));
@ -1457,6 +1462,8 @@ struct DeleteTenantImpl {
CODE_PROBE(true, "marking pair tenant in removing state");
updatedPairEntry.tenantState = TenantState::REMOVING;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->pairName.get(), updatedPairEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(
tr, Versionstamp(), 0);
}
}
@ -1485,6 +1492,7 @@ struct DeleteTenantImpl {
// Erase the tenant entry itself
ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, tenantName);
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, tenantEntry.get().id);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
// This is idempotent because this function is only called if the tenant is in the map
ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue);
@ -1689,6 +1697,7 @@ struct ConfigureTenantImpl {
++self->updatedEntry.configurationSequenceNum;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->updatedEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
return Void();
}
@ -1724,6 +1733,7 @@ struct ConfigureTenantImpl {
tenantEntry.get().tenantState = TenantState::READY;
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, tenantEntry.get());
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
return Void();
}
@ -1770,6 +1780,7 @@ struct RenameTenantImpl {
TenantMapEntry tenantEntry) {
// Erase the tenant entry itself
ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, self->oldName);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
// Remove old tenant from tenant count
ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue);
@ -1857,6 +1868,7 @@ struct RenameTenantImpl {
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->oldName, updatedOldEntry);
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
// Add temporary tenant to tenantCount to prevent exceeding capacity during a rename
ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue);
@ -1919,6 +1931,7 @@ struct RenameTenantImpl {
updatedNewEntry.renamePair.reset();
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry);
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantId, self->newName);
ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0);
}
// We will remove the old entry from the management cluster
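Each lastTenantModification update above is a versionstamped write: setVersionstamp(tr, Versionstamp(), 0) stores a placeholder that the commit process overwrites with the transaction's commit versionstamp, so a single watch on this key observes every tenant mutation. Conceptually it reduces to the following (hedged sketch; the exact encoding is internal to KeyBackedBinaryValue, and the key literal is a stand-in):

// value = 10 placeholder bytes (replaced at commit time with the commit versionstamp)
//         + a 4-byte little-endian offset of the placeholder within the value (0 here).
uint8_t value[14] = {}; // all zeros: zeroed placeholder, offset 0
tr->atomicOp("lastModificationKey"_sr, StringRef(value, sizeof(value)), MutationRef::SetVersionstampedValue);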

View File

@ -237,6 +237,8 @@ public:
DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled
bool DD_TENANT_AWARENESS_ENABLED;
int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed
int TENANT_CACHE_STORAGE_REFRESH_INTERVAL; // How often the storage usage (in bytes) of each tenant in the
// TenantCache is refreshed
// TeamRemover to remove redundant teams
bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor
@ -343,6 +345,8 @@ public:
int ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD;
int ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD;
bool ROCKSDB_DISABLE_WAL_EXPERIMENTAL;
bool ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE;
int64_t ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT;
int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE;
int64_t ROCKSDB_BLOCK_SIZE;
bool ENABLE_SHARDED_ROCKSDB;

View File

@ -103,7 +103,7 @@ struct StorageServerInterface {
PublicRequestStream<struct GetMappedKeyValuesRequest> getMappedKeyValues;
RequestStream<struct GetShardStateRequest> getShardState;
RequestStream<struct WaitMetricsRequest> waitMetrics;
PublicRequestStream<struct WaitMetricsRequest> waitMetrics;
RequestStream<struct SplitMetricsRequest> splitMetrics;
RequestStream<struct GetStorageMetricsRequest> getStorageMetrics;
RequestStream<ReplyPromise<Void>> waitFailure;
@ -161,7 +161,8 @@ public:
PublicRequestStream<struct GetKeyValuesRequest>(getValue.getEndpoint().getAdjustedEndpoint(2));
getShardState =
RequestStream<struct GetShardStateRequest>(getValue.getEndpoint().getAdjustedEndpoint(3));
waitMetrics = RequestStream<struct WaitMetricsRequest>(getValue.getEndpoint().getAdjustedEndpoint(4));
waitMetrics =
PublicRequestStream<struct WaitMetricsRequest>(getValue.getEndpoint().getAdjustedEndpoint(4));
splitMetrics = RequestStream<struct SplitMetricsRequest>(getValue.getEndpoint().getAdjustedEndpoint(5));
getStorageMetrics =
RequestStream<struct GetStorageMetricsRequest>(getValue.getEndpoint().getAdjustedEndpoint(6));
@ -713,18 +714,25 @@ struct WaitMetricsRequest {
// Waits for any of the given minimum or maximum metrics to be exceeded, and then returns the current values
// Send a reversed range for min, max to receive an immediate report
constexpr static FileIdentifier file_identifier = 1795961;
// Setting the tenantInfo makes the request tenant-aware.
Optional<TenantInfo> tenantInfo;
Arena arena;
KeyRangeRef keys;
StorageMetrics min, max;
ReplyPromise<StorageMetrics> reply;
bool verify() const { return tenantInfo.present() && tenantInfo.get().isAuthorized(); }
WaitMetricsRequest() {}
WaitMetricsRequest(KeyRangeRef const& keys, StorageMetrics const& min, StorageMetrics const& max)
: keys(arena, keys), min(min), max(max) {}
WaitMetricsRequest(TenantInfo tenantInfo,
KeyRangeRef const& keys,
StorageMetrics const& min,
StorageMetrics const& max)
: tenantInfo(tenantInfo), keys(arena, keys), min(min), max(max) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, keys, min, max, reply, arena);
serializer(ar, keys, min, max, reply, tenantInfo, arena);
}
};
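With waitMetrics promoted to a PublicRequestStream, the server rejects any request whose verify() fails, so tenant-aware callers are expected to populate tenantInfo via the new constructor. A hedged usage sketch from inside an actor (getTenantInfo() on the transaction state is an assumption here):

WaitMetricsRequest req(trState->getTenantInfo(), keys, minMetrics, maxMetrics);
ASSERT(req.verify()); // only authorized tenant requests pass the PublicRequestStream check
StorageMetrics metrics = wait(ssi.waitMetrics.getReply(req));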

View File

@ -597,8 +597,8 @@ Future<Void> enableAuto(Reference<DB> db, bool enabled) {
class TagQuotaValue {
public:
double reservedQuota{ 0.0 };
double totalQuota{ 0.0 };
int64_t reservedQuota{ 0 };
int64_t totalQuota{ 0 };
bool isValid() const;
Value toValue() const;
static TagQuotaValue fromValue(ValueRef);

View File

@ -181,6 +181,7 @@ struct TenantMetadataSpecification {
KeyBackedObjectProperty<TenantTombstoneCleanupData, decltype(IncludeVersion())> tombstoneCleanupData;
KeyBackedSet<Tuple> tenantGroupTenantIndex;
KeyBackedObjectMap<TenantGroupName, TenantGroupEntry, decltype(IncludeVersion()), NullCodec> tenantGroupMap;
KeyBackedBinaryValue<Versionstamp> lastTenantModification;
TenantMetadataSpecification(KeyRef prefix)
: subspace(prefix.withSuffix("tenant/"_sr)), tenantMap(subspace.withSuffix("map/"_sr), IncludeVersion()),
@ -188,7 +189,8 @@ struct TenantMetadataSpecification {
tenantCount(subspace.withSuffix("count"_sr)), tenantTombstones(subspace.withSuffix("tombstones/"_sr)),
tombstoneCleanupData(subspace.withSuffix("tombstoneCleanup"_sr), IncludeVersion()),
tenantGroupTenantIndex(subspace.withSuffix("tenantGroup/tenantIndex/"_sr)),
tenantGroupMap(subspace.withSuffix("tenantGroup/map/"_sr), IncludeVersion()) {}
tenantGroupMap(subspace.withSuffix("tenantGroup/map/"_sr), IncludeVersion()),
lastTenantModification(subspace.withSuffix("lastModification"_sr)) {}
};
struct TenantMetadata {
@ -203,6 +205,7 @@ struct TenantMetadata {
static inline auto& tombstoneCleanupData() { return instance().tombstoneCleanupData; }
static inline auto& tenantGroupTenantIndex() { return instance().tenantGroupTenantIndex; }
static inline auto& tenantGroupMap() { return instance().tenantGroupMap; }
static inline auto& lastTenantModification() { return instance().lastTenantModification; }
static Key tenantMapPrivatePrefix();
};

View File

@ -44,8 +44,14 @@
using TenantNameEntryPair = std::pair<TenantName, TenantMapEntry>;
using TenantNameEntryPairVec = std::vector<TenantNameEntryPair>;
enum class TenantEntryCacheRefreshReason { INIT = 1, PERIODIC_TASK = 2, CACHE_MISS = 3, REMOVE_ENTRY = 4 };
enum class TenantEntryCacheRefreshMode { PERIODIC_TASK = 1, NONE = 2 };
enum class TenantEntryCacheRefreshReason {
INIT = 1,
PERIODIC_TASK = 2,
CACHE_MISS = 3,
REMOVE_ENTRY = 4,
WATCH_TRIGGER = 5
};
enum class TenantEntryCacheRefreshMode { PERIODIC_TASK = 1, WATCH = 2, NONE = 3 };
template <class T>
struct TenantEntryCachePayload {
@ -62,12 +68,6 @@ using TenantEntryCachePayloadFunc = std::function<TenantEntryCachePayload<T>(con
// 1. Lookup by 'TenantId'
// 2. Lookup by 'TenantPrefix'
// 3. Lookup by 'TenantName'
//
// TODO:
// ----
// The cache allows user to construct the 'cached object' by supplying a callback. The cache implements a periodic
// refresh mechanism, polling underlying database for updates (add/remove tenants), in future we might want to implement
// database range-watch to monitor such updates
template <class T>
class TenantEntryCache : public ReferenceCounted<TenantEntryCache<T>>, NonCopyable {
@ -78,6 +78,10 @@ private:
TenantEntryCacheRefreshMode refreshMode;
Future<Void> refresher;
Future<Void> watchRefresher;
Future<Void> lastTenantIdRefresher;
Promise<Void> setInitialWatch;
Optional<int64_t> lastTenantId;
Map<int64_t, TenantEntryCachePayload<T>> mapByTenantId;
Map<TenantName, TenantEntryCachePayload<T>> mapByTenantName;
@ -87,6 +91,7 @@ private:
Counter refreshByCacheInit;
Counter refreshByCacheMiss;
Counter numRefreshes;
Counter refreshByWatchTrigger;
ACTOR static Future<TenantNameEntryPairVec> getTenantList(Reference<ReadYourWritesTransaction> tr) {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
@ -102,16 +107,166 @@ private:
return tenantList.results;
}
ACTOR static Future<Void> refreshCacheById(int64_t tenantId,
TenantEntryCache<T>* cache,
TenantEntryCacheRefreshReason reason) {
TraceEvent(SevDebug, "TenantEntryCacheIDRefreshStart", cache->id()).detail("Reason", static_cast<int>(reason));
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
state Optional<TenantName> name = wait(TenantMetadata::tenantIdIndex().get(tr, tenantId));
if (name.present()) {
Optional<TenantMapEntry> entry = wait(TenantMetadata::tenantMap().get(tr, name.get()));
if (entry.present()) {
cache->put(std::make_pair(name.get(), entry.get()));
updateCacheRefreshMetrics(cache, reason);
}
}
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
TraceEvent(SevDebug, "TenantEntryCacheIDRefreshEnd", cache->id()).detail("Reason", static_cast<int>(reason));
return Void();
}
ACTOR static Future<Void> refreshCacheByName(TenantName name,
TenantEntryCache<T>* cache,
TenantEntryCacheRefreshReason reason) {
TraceEvent(SevDebug, "TenantEntryCacheNameRefreshStart", cache->id())
.detail("Reason", static_cast<int>(reason));
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
Optional<TenantMapEntry> entry = wait(TenantMetadata::tenantMap().get(tr, name));
if (entry.present()) {
cache->put(std::make_pair(name, entry.get()));
updateCacheRefreshMetrics(cache, reason);
}
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
TraceEvent(SevDebug, "TenantEntryCacheNameRefreshEnd", cache->id()).detail("Reason", static_cast<int>(reason));
return Void();
}
static void updateCacheRefreshMetrics(TenantEntryCache<T>* cache, TenantEntryCacheRefreshReason reason) {
if (reason == TenantEntryCacheRefreshReason::INIT) {
cache->refreshByCacheInit += 1;
} else if (reason == TenantEntryCacheRefreshReason::CACHE_MISS) {
cache->refreshByCacheMiss += 1;
} else if (reason == TenantEntryCacheRefreshReason::WATCH_TRIGGER) {
cache->refreshByWatchTrigger += 1;
}
cache->numRefreshes += 1;
}
ACTOR static Future<Void> refreshCacheUsingWatch(TenantEntryCache<T>* cache, TenantEntryCacheRefreshReason reason) {
TraceEvent(SevDebug, "TenantEntryCacheRefreshUsingWatchStart", cache->id())
.detail("Reason", static_cast<int>(reason));
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Future<Void> tenantModifiedWatch = TenantMetadata::lastTenantModification().watch(tr);
wait(tr->commit());
TraceEvent(SevDebug, "TenantEntryCacheRefreshWatchSet", cache->id());
// setInitialWatch is set to indicate that an initial watch has been set for the lastTenantModification
// key. Currently this is only used in simulation to avoid a race condition where a tenant is created
// before the initial watch is set. However, it can be enabled by passing waitForInitialWatch = true to
// the init() method.
if (cache->setInitialWatch.canBeSet()) {
cache->setInitialWatch.send(Void());
}
wait(tenantModifiedWatch);
// If watch triggered then refresh the cache as tenant metadata was updated
TraceEvent(SevDebug, "TenantEntryCacheRefreshUsingWatchTriggered", cache->id())
.detail("Reason", static_cast<int>(reason));
wait(refreshImpl(cache, reason));
tr->reset();
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent("TenantEntryCacheRefreshUsingWatchError", cache->id())
.errorUnsuppressed(e)
.suppressFor(1.0);
}
wait(tr->onError(e));
// The watch threw an error; refresh the cache in case tenant metadata changed while the watch was down
wait(refreshImpl(cache, reason));
}
}
}
static bool tenantsEnabled(TenantEntryCache<T>* cache) {
// Avoid using the cache if the tenant mode is disabled. However, since we rely on clientInfo, it may not
// be fully up to date (i.e., it may report the tenant mode as disabled when it is in fact required). Thus,
// if at least one tenant has been created on the cluster, use the cache anyway to avoid an incorrect
// miss.
if (cache->getDatabase()->clientInfo->get().tenantMode == TenantMode::DISABLED) {
if (!cache->lastTenantId.present()) {
return false;
}
return cache->lastTenantId.get() > 0;
}
return true;
}
ACTOR static Future<Void> setLastTenantId(TenantEntryCache<T>* cache) {
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<int64_t> lastTenantId = wait(TenantMetadata::lastTenantId().get(tr));
cache->lastTenantId = lastTenantId;
return Void();
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
ACTOR static Future<Void> lastTenantIdWatch(TenantEntryCache<T>* cache) {
TraceEvent(SevDebug, "TenantEntryCacheLastTenantIdWatchStart", cache->id());
// Monitor for any changes to the last tenant id and update it as necessary
state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Future<Void> lastTenantIdWatch = tr->watch(TenantMetadata::lastTenantId().key);
wait(tr->commit());
wait(lastTenantIdWatch);
wait(setLastTenantId(cache));
tr->reset();
} catch (Error& e) {
state Error err(e);
if (err.code() != error_code_actor_cancelled) {
TraceEvent("TenantEntryCacheLastTenantIdWatchError", cache->id())
.errorUnsuppressed(err)
.suppressFor(1.0);
// The watch errored out; refresh lastTenantId in case it changed while the watch was down and we
// missed an update
wait(setLastTenantId(cache));
}
wait(tr->onError(err));
}
}
}
ACTOR static Future<Void> refreshImpl(TenantEntryCache<T>* cache, TenantEntryCacheRefreshReason reason) {
TraceEvent(SevDebug, "TenantEntryCacheRefreshStart", cache->id()).detail("Reason", static_cast<int>(reason));
@ -130,9 +285,7 @@ private:
break;
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent(SevInfo, "TenantEntryCacheRefreshError", cache->id())
.errorUnsuppressed(e)
.suppressFor(1.0);
TraceEvent("TenantEntryCacheRefreshError", cache->id()).errorUnsuppressed(e).suppressFor(1.0);
}
wait(tr->onError(e));
}
@ -151,12 +304,22 @@ private:
return ret;
}
TraceEvent(SevInfo, "TenantEntryCacheGetByIdRefresh").detail("TenantId", tenantId);
if (!tenantsEnabled(cache)) {
// If tenants are disabled on the cluster avoid using the cache
return Optional<TenantEntryCachePayload<T>>();
}
// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
// TODO: Cache will implement a "KeyRange" watch, monitoring notification when a new entry gets added or any
// existing entry gets updated within the KeyRange of interest. Hence, misses would be very rare
wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
TraceEvent("TenantEntryCacheGetByIdRefresh").detail("TenantId", tenantId);
if (cache->refreshMode == TenantEntryCacheRefreshMode::WATCH) {
// Entry not found. Do a point refresh
// TODO: Don't initiate refresh if tenantId < maxTenantId (stored as a system key currently) as we know that
// such a tenant does not exist (it has either never existed or has been deleted)
wait(refreshCacheById(tenantId, cache, TenantEntryCacheRefreshReason::CACHE_MISS));
} else {
// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
}
cache->misses += 1;
return cache->lookupById(tenantId);
@ -170,12 +333,20 @@ private:
return ret;
}
if (!tenantsEnabled(cache)) {
// If tenants are disabled on the cluster avoid using the cache
return Optional<TenantEntryCachePayload<T>>();
}
TraceEvent("TenantEntryCacheGetByNameRefresh").detail("TenantName", name);
// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
// TODO: Cache will implement a "KeyRange" watch, monitoring notification when a new entry gets added or any
// existing entry gets updated within the KeyRange of interest. Hence, misses would be very rare
wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
if (cache->refreshMode == TenantEntryCacheRefreshMode::WATCH) {
// Entry not found. Do a point refresh
wait(refreshCacheByName(name, cache, TenantEntryCacheRefreshReason::CACHE_MISS));
} else {
// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
}
cache->misses += 1;
return cache->lookupByName(name);
@ -272,7 +443,18 @@ public:
hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics),
refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreatedDefaultFunc", uid);
}
TenantEntryCache(Database db, TenantEntryCacheRefreshMode mode)
: uid(deterministicRandom()->randomUniqueID()), db(db), createPayloadFunc(defaultCreatePayload),
refreshMode(mode), metrics("TenantEntryCacheMetrics", uid.toString()), hits("TenantEntryCacheHits", metrics),
misses("TenantEntryCacheMisses", metrics), refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreatedDefaultFunc", uid);
}
@ -282,7 +464,8 @@ public:
hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics),
refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreated", uid);
}
@ -291,7 +474,8 @@ public:
metrics("TenantEntryCacheMetrics", uid.toString()), hits("TenantEntryCacheHits", metrics),
misses("TenantEntryCacheMisses", metrics), refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreated", uid);
}
@ -300,26 +484,36 @@ public:
hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics),
refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
numRefreshes("TenantEntryCacheNumRefreshes", metrics),
refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
TraceEvent("TenantEntryCacheCreated", uid);
}
Future<Void> init() {
Future<Void> init(bool waitForInitialWatch = false) {
TraceEvent("TenantEntryCacheInit", uid);
Future<Void> f = refreshImpl(this, TenantEntryCacheRefreshReason::INIT);
// Launch a background task to periodically refresh the cache by scanning the database KeyRange
TenantEntryCacheRefreshReason reason = TenantEntryCacheRefreshReason::PERIODIC_TASK;
Future<Void> initialWatchFuture = Void();
lastTenantIdRefresher = lastTenantIdWatch(this);
if (refreshMode == TenantEntryCacheRefreshMode::PERIODIC_TASK) {
refresher = recurringAsync([&, reason]() { return refresh(reason); },
CLIENT_KNOBS->TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, /* interval */
true, /* absoluteIntervalDelay */
CLIENT_KNOBS->TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, /* initialDelay */
TaskPriority::Worker);
} else if (refreshMode == TenantEntryCacheRefreshMode::WATCH) {
if (waitForInitialWatch) {
initialWatchFuture = setInitialWatch.getFuture();
}
watchRefresher = refreshCacheUsingWatch(this, TenantEntryCacheRefreshReason::WATCH_TRIGGER);
}
return f;
Future<Void> setLastTenant = setLastTenantId(this);
return f && initialWatchFuture && setLastTenant;
}
Database getDatabase() const { return db; }
@ -341,28 +535,33 @@ public:
}
void put(const TenantNameEntryPair& pair) {
TenantEntryCachePayload<T> payload = createPayloadFunc(pair.first, pair.second);
auto idItr = mapByTenantId.find(pair.second.id);
auto nameItr = mapByTenantName.find(pair.first);
const auto& [name, entry] = pair;
TenantEntryCachePayload<T> payload = createPayloadFunc(name, entry);
auto idItr = mapByTenantId.find(entry.id);
auto nameItr = mapByTenantName.find(name);
Optional<TenantName> existingName;
Optional<int64_t> existingId;
if (nameItr != mapByTenantName.end()) {
existingId = nameItr->value.entry.id;
mapByTenantId.erase(nameItr->value.entry.id);
}
if (idItr != mapByTenantId.end()) {
existingName = idItr->value.name;
mapByTenantName.erase(idItr->value.name);
}
if (existingId.present()) {
mapByTenantId.erase(existingId.get());
}
if (existingName.present()) {
mapByTenantName.erase(existingName.get());
}
mapByTenantId[pair.second.id] = payload;
mapByTenantName[pair.first] = payload;
mapByTenantId[entry.id] = payload;
mapByTenantName[name] = payload;
TraceEvent("TenantEntryCachePut")
.detail("TenantName", pair.first)
.detail("TenantName", name)
.detail("TenantNameExisting", existingName)
.detail("TenantID", pair.second.id)
.detail("TenantID", entry.id)
.detail("TenantIDExisting", existingId)
.detail("TenantPrefix", pair.second.prefix);
@ -384,7 +583,8 @@ public:
Counter::Value numCacheRefreshes() const { return numRefreshes.getValue(); }
Counter::Value numRefreshByMisses() const { return refreshByCacheMiss.getValue(); }
Counter::Value numRefreshByInit() const { return refreshByCacheInit.getValue(); }
Counter::Value numWatchRefreshes() const { return refreshByWatchTrigger.getValue(); }
};
#include "flow/unactorcompiler.h"
#endif // FDBCLIENT_TENANTENTRYCACHE_ACTOR_H
#endif // FDBCLIENT_TENANTENTRYCACHE_ACTOR_H
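Putting the new pieces together, a hedged usage sketch of the watch-based refresh mode from inside an actor (`db` is assumed to be a connected Database; getByName is assumed from the *Impl helpers above; waiting on the initial watch matters mainly in simulation, per the comment in refreshCacheUsingWatch):

state Reference<TenantEntryCache<Void>> cache =
    makeReference<TenantEntryCache<Void>>(db, TenantEntryCacheRefreshMode::WATCH);
wait(cache->init(true /* waitForInitialWatch */));
// A later create/rename/delete bumps lastTenantModification, the watch fires,
// and the cache refreshes itself; point lookups fall back to refreshCacheByName on a miss.
Optional<TenantEntryCachePayload<Void>> entry = wait(cache->getByName("tenant1"_sr));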

View File

@ -178,6 +178,7 @@ Future<std::pair<Optional<TenantMapEntry>, bool>> createTenantTransaction(
TenantMetadata::tenantMap().set(tr, name, tenantEntry);
TenantMetadata::tenantIdIndex().set(tr, tenantEntry.id, name);
TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0);
if (tenantEntry.tenantGroup.present()) {
TenantMetadata::tenantGroupTenantIndex().insert(tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), name));
@ -346,6 +347,7 @@ Future<Void> deleteTenantTransaction(Transaction tr,
TenantMetadata::tenantMap().erase(tr, name);
TenantMetadata::tenantIdIndex().erase(tr, tenantEntry.get().id);
TenantMetadata::tenantCount().atomicOp(tr, -1, MutationRef::AddValue);
TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0);
if (tenantEntry.get().tenantGroup.present()) {
TenantMetadata::tenantGroupTenantIndex().erase(tr,
@ -420,6 +422,7 @@ Future<Void> configureTenantTransaction(Transaction tr,
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
TenantMetadata::tenantMap().set(tr, tenantName, updatedTenantEntry);
TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0);
// If the tenant group was changed, we need to update the tenant group metadata structures
if (originalEntry.tenantGroup != updatedTenantEntry.tenantGroup) {
@ -523,6 +526,7 @@ Future<Void> renameTenantTransaction(Transaction tr,
TenantMetadata::tenantMap().erase(tr, oldName);
TenantMetadata::tenantMap().set(tr, newName, oldEntry.get());
TenantMetadata::tenantIdIndex().set(tr, oldEntry.get().id, newName);
TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0);
// Update the tenant group index to reflect the new tenant name
if (oldEntry.get().tenantGroup.present()) {

View File

@ -202,8 +202,9 @@ description is not currently required but encouraged.
description="Deprecated. Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect."
defaultFor="23"/>
<Option name="transaction_automatic_idempotency" code="506"
description="Set a random idempotency id for all transactions. See the transaction option description for more information."
defaultFor="505"/>
description="Set a random idempotency id for all transactions. See the transaction option description for more information. This feature is in development and not ready for general use."
defaultFor="505"
hidden="true"/>
<Option name="transaction_bypass_unreadable" code="700"
description="Allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. This sets the ``bypass_unreadable`` option of each transaction created by this database. See the transaction option description for more information."
defaultFor="1100"/>
@ -278,9 +279,11 @@ description is not currently required but encouraged.
description="Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit." />
<Option name="idempotency_id" code="504"
paramType="String" paramDescription="Unique ID"
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes." />
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. This feature is in development and not ready for general use."
hidden="true" />
<Option name="automatic_idempotency" code="505"
description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future." />
description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future. This feature is in development and not ready for general use."
hidden="true" />
<Option name="snapshot_ryw_enable" code="600"
description="Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior." />
<Option name="snapshot_ryw_disable" code="601"

View File

@ -48,15 +48,17 @@ public:
ACTOR static Future<Standalone<StringRef>> readBlock(AsyncFileEncrypted* self, uint32_t block) {
state Arena arena;
state unsigned char* encrypted = new (arena) unsigned char[FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE];
int bytes = wait(
self->file->read(encrypted, FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE, FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE * block));
int bytes = wait(uncancellable(holdWhile(arena,
self->file->read(encrypted,
FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE,
FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE * block))));
StreamCipherKey const* cipherKey = StreamCipherKey::getGlobalCipherKey();
DecryptionStreamCipher decryptor(cipherKey, self->getIV(block));
auto decrypted = decryptor.decrypt(encrypted, bytes, arena);
return Standalone<StringRef>(decrypted, arena);
}
ACTOR static Future<int> read(AsyncFileEncrypted* self, void* data, int length, int64_t offset) {
ACTOR static Future<int> read(Reference<AsyncFileEncrypted> self, void* data, int length, int64_t offset) {
state const uint32_t firstBlock = offset / FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE;
state const uint32_t lastBlock = (offset + length - 1) / FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE;
state uint32_t block;
@ -70,7 +72,7 @@ public:
if (cachedBlock.present()) {
plaintext = cachedBlock.get();
} else {
wait(store(plaintext, readBlock(self, block)));
wait(store(plaintext, readBlock(self.getPtr(), block)));
self->readBuffers.insert(block, plaintext);
}
auto start = (block == firstBlock) ? plaintext.begin() + (offset % FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE)
@ -96,7 +98,7 @@ public:
return bytesRead;
}
ACTOR static Future<Void> write(AsyncFileEncrypted* self, void const* data, int length, int64_t offset) {
ACTOR static Future<Void> write(Reference<AsyncFileEncrypted> self, void const* data, int length, int64_t offset) {
ASSERT(self->mode == AsyncFileEncrypted::Mode::APPEND_ONLY);
// All writes must append to the end of the file:
ASSERT_EQ(offset, self->currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE + self->offsetInBlock);
@ -122,7 +124,7 @@ public:
return Void();
}
ACTOR static Future<Void> sync(AsyncFileEncrypted* self) {
ACTOR static Future<Void> sync(Reference<AsyncFileEncrypted> self) {
ASSERT(self->mode == AsyncFileEncrypted::Mode::APPEND_ONLY);
wait(self->writeLastBlockToFile());
wait(self->file->sync());
@ -135,7 +137,7 @@ public:
Arena arena;
auto zeroes = new (arena) unsigned char[length];
memset(zeroes, 0, length);
wait(self->write(zeroes, length, offset));
wait(uncancellable(holdWhile(arena, self->write(zeroes, length, offset))));
return Void();
}
};
@ -159,11 +161,11 @@ void AsyncFileEncrypted::delref() {
}
Future<int> AsyncFileEncrypted::read(void* data, int length, int64_t offset) {
return AsyncFileEncryptedImpl::read(this, data, length, offset);
return AsyncFileEncryptedImpl::read(Reference<AsyncFileEncrypted>::addRef(this), data, length, offset);
}
Future<Void> AsyncFileEncrypted::write(void const* data, int length, int64_t offset) {
return AsyncFileEncryptedImpl::write(this, data, length, offset);
return AsyncFileEncryptedImpl::write(Reference<AsyncFileEncrypted>::addRef(this), data, length, offset);
}
Future<Void> AsyncFileEncrypted::zeroRange(int64_t offset, int64_t length) {
@ -177,7 +179,7 @@ Future<Void> AsyncFileEncrypted::truncate(int64_t size) {
Future<Void> AsyncFileEncrypted::sync() {
ASSERT(mode == Mode::APPEND_ONLY);
return AsyncFileEncryptedImpl::sync(this);
return AsyncFileEncryptedImpl::sync(Reference<AsyncFileEncrypted>::addRef(this));
}
Future<Void> AsyncFileEncrypted::flush() {
@ -217,7 +219,11 @@ StreamCipher::IV AsyncFileEncrypted::getIV(uint32_t block) const {
}
Future<Void> AsyncFileEncrypted::writeLastBlockToFile() {
return file->write(&writeBuffer[0], offsetInBlock, currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE);
// The source buffer for the write is owned by *this so this must be kept alive by reference count until the write
// is finished.
return uncancellable(
holdWhile(Reference<AsyncFileEncrypted>::addRef(this),
file->write(&writeBuffer[0], offsetInBlock, currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE)));
}
size_t AsyncFileEncrypted::RandomCache::evict() {

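The recurring shape of the lifetime fixes in this file, distilled: holdWhile() keeps an owning object alive until a future is ready, and uncancellable() ensures the underlying I/O runs to completion even if the caller is cancelled, so the kernel can never write into freed memory. A hedged sketch:

state Arena arena;
state uint8_t* buf = new (arena) uint8_t[4096];
// Even if this actor is cancelled, the read continues to completion and
// `arena` (and therefore `buf`) stays alive until it finishes.
int n = wait(uncancellable(holdWhile(arena, file->read(buf, 4096, /*offset*/ 0))));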
View File

@ -72,8 +72,9 @@ public:
// Wait for diskDelay before submitting the I/O
// Template types are provided explicitly because they cannot be deduced automatically here.
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<int>(Void)>, int>(
delay(diskDelay), [=](Void _) -> Future<int> { return file->read(data, length, offset); });
delay(diskDelay), [=, file = file](Void _) -> Future<int> { return file->read(data, length, offset); });
}
Future<Void> write(void const* data, int length, int64_t offset) override {
@ -102,9 +103,9 @@ public:
.log();
// increment the metric for bit flips
auto res = g_network->global(INetwork::enChaosMetrics);
if (res) {
ChaosMetrics* chaosMetrics = static_cast<ChaosMetrics*>(res);
auto chaosMetricsPointer = g_network->global(INetwork::enChaosMetrics);
if (chaosMetricsPointer) {
ChaosMetrics* chaosMetrics = static_cast<ChaosMetrics*>(chaosMetricsPointer);
chaosMetrics->bitFlips++;
}
}
@ -112,28 +113,30 @@ public:
}
// Wait for diskDelay before submitting the I/O
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(delay(getDelay()), [=](Void _) -> Future<Void> {
if (pdata) {
// if (g_network->isSimulated())
return map(holdWhile(arena, file->write(pdata, length, offset)), [corruptedBlock, this](auto res) {
if (g_network->isSimulated()) {
g_simulator->corruptedBlocks.template emplace(file->getFilename(), corruptedBlock);
}
return res;
});
}
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(getDelay()), [=, file = file](Void _) -> Future<Void> {
if (pdata) {
return map(
holdWhile(arena, file->write(pdata, length, offset)), [corruptedBlock, file = file](auto res) {
if (g_network->isSimulated()) {
g_simulator->corruptedBlocks.template emplace(file->getFilename(), corruptedBlock);
}
return res;
});
}
return map(file->write(data, length, offset), [this, pdata, offset, length](auto res) {
if (pdata != nullptr || !g_network->isSimulated()) {
return res;
}
g_simulator->corruptedBlocks.erase(
g_simulator->corruptedBlocks.lower_bound(std::make_pair(file->getFilename(), offset / 4096)),
g_simulator->corruptedBlocks.upper_bound(
std::make_pair(file->getFilename(), (offset + length) / 4096)));
return res;
});
});
return map(file->write(data, length, offset), [this, pdata, offset, length, file = file](auto res) {
if (pdata != nullptr || !g_network->isSimulated()) {
return res;
}
g_simulator->corruptedBlocks.erase(
g_simulator->corruptedBlocks.lower_bound(std::make_pair(file->getFilename(), offset / 4096)),
g_simulator->corruptedBlocks.upper_bound(
std::make_pair(file->getFilename(), (offset + length) / 4096)));
return res;
});
});
}
Future<Void> truncate(int64_t size) override {
@ -142,8 +145,9 @@ public:
return file->truncate(size);
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(diskDelay), [this, size](Void _) -> Future<Void> {
delay(diskDelay), [this, size, file = file](Void _) -> Future<Void> {
constexpr auto maxBlockValue =
std::numeric_limits<decltype(g_simulator->corruptedBlocks)::key_type::second_type>::max();
auto firstDeletedBlock =
@ -161,8 +165,9 @@ public:
return file->sync();
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
delay(diskDelay), [=](Void _) -> Future<Void> { return file->sync(); });
delay(diskDelay), [=, file = file](Void _) -> Future<Void> { return file->sync(); });
}
Future<int64_t> size() const override {
@ -171,8 +176,9 @@ public:
return file->size();
// Wait for diskDelay before submitting the I/O
// Capture file by value in case this is destroyed during the delay
return mapAsync<Void, std::function<Future<int64_t>(Void)>, int64_t>(
delay(diskDelay), [=](Void _) -> Future<int64_t> { return file->size(); });
delay(diskDelay), [=, file = file](Void _) -> Future<int64_t> { return file->size(); });
}
int64_t debugFD() const override { return file->debugFD(); }
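The capture changes above all address the same lifetime bug, sketched here in isolation: a default [=] capture copies `this`, so reading the `file` member after the delay dereferences a possibly-destroyed wrapper. Copying the Reference into the lambda pins the file instead (hedged sketch):

struct Wrapper : ReferenceCounted<Wrapper> {
	Reference<IAsyncFile> file;
	double diskDelay;
	Future<int64_t> sizeLater() {
		// BUG: [=] captures `this`; if *this is destroyed during the delay, `file` dangles.
		// FIX: [file = file] copies the reference-counted handle, which outlives the wrapper.
		return mapAsync<Void, std::function<Future<int64_t>(Void)>, int64_t>(
		    delay(diskDelay), [file = file](Void) -> Future<int64_t> { return file->size(); });
	}
};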

View File

@ -46,12 +46,17 @@ ACTOR Future<Void> sendErrorOnProcess(ISimulator::ProcessInfo* process,
TaskPriority taskID);
ACTOR template <class T>
Future<T> sendErrorOnShutdown(Future<T> in) {
choose {
when(wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()))) {
throw io_error().asInjectedFault();
Future<T> sendErrorOnShutdown(Future<T> in, bool assertOnCancel = false) {
try {
choose {
when(wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()))) {
throw io_error().asInjectedFault();
}
when(T rep = wait(in)) { return rep; }
}
when(T rep = wait(in)) { return rep; }
} catch (Error& e) {
ASSERT(e.code() != error_code_actor_cancelled || !assertOnCancel);
throw;
}
}
@ -59,9 +64,12 @@ class AsyncFileDetachable final : public IAsyncFile, public ReferenceCounted<Asy
private:
Reference<IAsyncFile> file;
Future<Void> shutdown;
bool assertOnReadWriteCancel;
public:
explicit AsyncFileDetachable(Reference<IAsyncFile> file) : file(file) { shutdown = doShutdown(this); }
explicit AsyncFileDetachable(Reference<IAsyncFile> file) : file(file), assertOnReadWriteCancel(true) {
shutdown = doShutdown(this);
}
ACTOR Future<Void> doShutdown(AsyncFileDetachable* self) {
wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()));
@ -84,13 +92,13 @@ public:
Future<int> read(void* data, int length, int64_t offset) override {
if (!file.getPtr() || g_simulator->getCurrentProcess()->shutdownSignal.getFuture().isReady())
return io_error().asInjectedFault();
return sendErrorOnShutdown(file->read(data, length, offset));
return sendErrorOnShutdown(file->read(data, length, offset), assertOnReadWriteCancel);
}
Future<Void> write(void const* data, int length, int64_t offset) override {
if (!file.getPtr() || g_simulator->getCurrentProcess()->shutdownSignal.getFuture().isReady())
return io_error().asInjectedFault();
return sendErrorOnShutdown(file->write(data, length, offset));
return sendErrorOnShutdown(file->write(data, length, offset), assertOnReadWriteCancel);
}
Future<Void> truncate(int64_t size) override {

View File

@ -52,7 +52,7 @@ public:
state Reference<CacheBlock> block(new CacheBlock(length));
try {
int len = wait(f->m_f->read(block->data, length, offset));
int len = wait(uncancellable(holdWhile(block, f->m_f->read(block->data, length, offset))));
block->len = len;
} catch (Error& e) {
f->m_max_concurrent_reads.release(1);

View File

@ -32,14 +32,18 @@ public:
// For read() and write(), the data buffer must remain valid until the future is ready
Future<int> read(void* data, int length, int64_t offset) override {
return map(m_f->read(data, length, offset), [=](int r) {
updateChecksumHistory(false, offset, r, (uint8_t*)data);
// Lambda must hold a reference to this to keep it alive until after the read
auto self = Reference<AsyncFileWriteChecker>::addRef(this);
return map(m_f->read(data, length, offset), [self, data, offset](int r) {
self->updateChecksumHistory(false, offset, r, (uint8_t*)data);
return r;
});
}
Future<Void> readZeroCopy(void** data, int* length, int64_t offset) override {
return map(m_f->readZeroCopy(data, length, offset), [=](Void r) {
updateChecksumHistory(false, offset, *length, (uint8_t*)data);
// Lambda must hold a reference to this to keep it alive until after the read
auto self = Reference<AsyncFileWriteChecker>::addRef(this);
return map(m_f->readZeroCopy(data, length, offset), [self, data, length, offset](Void r) {
self->updateChecksumHistory(false, offset, *length, (uint8_t*)data);
return r;
});
}
@ -50,12 +54,14 @@ public:
}
Future<Void> truncate(int64_t size) override {
return map(m_f->truncate(size), [=](Void r) {
// Lambda must hold a reference to this to keep it alive until after the truncate
auto self = Reference<AsyncFileWriteChecker>::addRef(this);
return map(m_f->truncate(size), [self, size](Void r) {
// Truncate the page checksum history if it is in use
if ((size / checksumHistoryPageSize) < checksumHistory.size()) {
int oldCapacity = checksumHistory.capacity();
checksumHistory.resize(size / checksumHistoryPageSize);
checksumHistoryBudget.get() -= (checksumHistory.capacity() - oldCapacity);
if ((size / checksumHistoryPageSize) < self->checksumHistory.size()) {
int oldCapacity = self->checksumHistory.capacity();
self->checksumHistory.resize(size / checksumHistoryPageSize);
checksumHistoryBudget.get() -= (self->checksumHistory.capacity() - oldCapacity);
}
return r;
});

View File

@ -239,7 +239,7 @@ public:
// Sets endpoint to be a new local endpoint which delivers messages to the given receiver
void addEndpoint(Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID);
void addEndpoints(std::vector<std::pair<struct FlowReceiver*, TaskPriority>> const& streams);
void addEndpoints(std::vector<std::pair<class FlowReceiver*, TaskPriority>> const& streams);
// The given local endpoint no longer delivers messages to the given receiver or uses resources
void removeEndpoint(const Endpoint&, NetworkMessageReceiver*);

View File

@ -42,6 +42,8 @@ struct TenantInfo {
// Is set during deserialization. It will be set to true if the tenant
// name is set and the client is authorized to use this tenant.
bool tenantAuthorized = false;
// Number of storage bytes currently used by this tenant.
int64_t storageUsage = 0;
// Helper function for most endpoints that read/write data. This returns true iff
// the client is either a) a trusted peer or b) is accessing keyspace belonging to a tenant,

View File

@ -28,9 +28,14 @@
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/networksender.actor.h"
struct FlowReceiver : public NetworkMessageReceiver {
// Common endpoint code for NetSAV<> and NetNotifiedQueue<>
// Common endpoint code for NetSAV<> and NetNotifiedQueue<>
class FlowReceiver : public NetworkMessageReceiver, public NonCopyable {
Optional<PeerCompatibilityPolicy> peerCompatibilityPolicy_;
Endpoint endpoint;
bool m_isLocalEndpoint;
bool m_stream;
protected:
FlowReceiver() : m_isLocalEndpoint(false), m_stream(false) {}
FlowReceiver(Endpoint const& remoteEndpoint, bool stream)
@ -46,8 +51,17 @@ struct FlowReceiver : public NetworkMessageReceiver {
}
}
bool isLocalEndpoint() { return m_isLocalEndpoint; }
bool isRemoteEndpoint() { return endpoint.isValid() && !m_isLocalEndpoint; }
public:
bool isLocalEndpoint() const { return m_isLocalEndpoint; }
bool isRemoteEndpoint() const { return endpoint.isValid() && !m_isLocalEndpoint; }
void setRemoteEndpoint(Endpoint const& remoteEndpoint, bool stream) {
ASSERT(!m_isLocalEndpoint);
ASSERT(!endpoint.isValid());
endpoint = remoteEndpoint;
m_stream = stream;
FlowTransport::transport().addPeerReference(endpoint, m_stream);
}
// If already a remote endpoint, returns that. Otherwise makes this
// a local endpoint and returns that.
@ -80,12 +94,6 @@ struct FlowReceiver : public NetworkMessageReceiver {
}
const Endpoint& getRawEndpoint() { return endpoint; }
private:
Optional<PeerCompatibilityPolicy> peerCompatibilityPolicy_;
Endpoint endpoint;
bool m_isLocalEndpoint;
bool m_stream;
};
template <class T>
@ -363,8 +371,9 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue<T>,
this->sendError(message.getError());
} else {
if (message.get().asUnderlyingType().acknowledgeToken.present()) {
acknowledgements = AcknowledgementReceiver(
FlowTransport::transport().loadedEndpoint(message.get().asUnderlyingType().acknowledgeToken.get()));
acknowledgements.setRemoteEndpoint(
FlowTransport::transport().loadedEndpoint(message.get().asUnderlyingType().acknowledgeToken.get()),
false);
if (onConnect.isValid() && onConnect.canBeSet()) {
onConnect.send(Void());
}

View File

@ -120,6 +120,7 @@ public:
bool excludeFromRestarts = false;
std::vector<ProcessInfo*> childs;
bool drProcess = false;
ProcessInfo(const char* name,
LocalityData locality,

View File

@ -1253,6 +1253,7 @@ public:
PromiseTask* task = self->taskQueue.getReadyTask();
self->taskQueue.popReadyTask();
self->execTask(*task);
delete task;
self->yielded = false;
}
}
@ -2275,7 +2276,7 @@ public:
}
// Implementation
struct PromiseTask final {
struct PromiseTask final : public FastAllocated<PromiseTask> {
Promise<Void> promise;
ProcessInfo* machine;
explicit PromiseTask(ProcessInfo* machine) : machine(machine) {}

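For context on the PromiseTask change: deriving from FastAllocated<T> gives the class its own operator new/delete backed by flow's fixed-size-block allocator, which pays off for objects allocated and freed on every run-loop iteration, as PromiseTask now is. A hedged sketch:

struct MyTask : public FastAllocated<MyTask> {
	int payload = 0;
};

MyTask* t = new MyTask(); // served from the per-size-class fast allocator, not malloc
delete t;                 // returned to the same free list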
View File

@ -687,6 +687,9 @@ struct DDQueue : public IDDRelocationQueue {
Reference<EventCacheHolder> movedKeyServersEventHolder;
int moveReusePhysicalShard;
int moveCreateNewPhysicalShard;
void startRelocation(int priority, int healthPriority) {
// Although PRIORITY_TEAM_REDUNDANT has lower priority than split and merge shard movement,
// we must count it into unhealthyRelocations, because the team remover relies on unhealthyRelocations to
@ -750,7 +753,8 @@ struct DDQueue : public IDDRelocationQueue {
output(output), input(input), getShardMetrics(getShardMetrics), getTopKMetrics(getTopKMetrics), lastInterval(0),
suppressIntervals(0), rawProcessingUnhealthy(new AsyncVar<bool>(false)),
rawProcessingWiggle(new AsyncVar<bool>(false)), unhealthyRelocations(0),
movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")) {}
movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")), moveReusePhysicalShard(0),
moveCreateNewPhysicalShard(0) {}
DDQueue() = default;
void validate() {
@ -1676,6 +1680,11 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
// when !rd.isRestore(), dataMoveId is just decided as physicalShardIDCandidate
// thus, update the physicalShardIDCandidate to related data structures
ASSERT(physicalShardIDCandidate != UID().first());
if (self->physicalShardCollection->physicalShardExists(physicalShardIDCandidate)) {
self->moveReusePhysicalShard++;
} else {
self->moveCreateNewPhysicalShard++;
}
rd.dataMoveId = newShardId(physicalShardIDCandidate, AssignEmptyRange::False);
auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
inFlightRange.value().dataMoveId = rd.dataMoveId;
@ -2472,6 +2481,14 @@ ACTOR Future<Void> dataDistributionQueue(Reference<IDDTxnProcessor> db,
.trackLatest("MovingData"); // This trace event's trackLatest lifetime is controlled by
// DataDistributor::movingDataEventHolder. The track latest
// key we use here must match the key used in the holder.
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
TraceEvent("PhysicalShardMoveStats")
.detail("MoveCreateNewPhysicalShard", self.moveCreateNewPhysicalShard)
.detail("MoveReusePhysicalShard", self.moveReusePhysicalShard);
self.moveCreateNewPhysicalShard = 0;
self.moveReusePhysicalShard = 0;
}
}
when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator
when(wait(waitForAll(ddQueueFutures))) {}

View File

@ -2081,6 +2081,10 @@ void PhysicalShardCollection::logPhysicalShardCollection() {
}
}
bool PhysicalShardCollection::physicalShardExists(uint64_t physicalShardID) {
return physicalShardInstances.find(physicalShardID) != physicalShardInstances.end();
}
// FIXME: complete this test with non-empty range
TEST_CASE("/DataDistributor/Tracker/FetchTopK") {
state DataDistributionTracker self;

View File

@ -25,6 +25,8 @@
#include "fdbclient/DatabaseContext.h"
#include "flow/actorcompiler.h" // This must be the last #include.
FDB_DEFINE_BOOLEAN_PARAM(SkipDDModeCheck);
class DDTxnProcessorImpl {
friend class DDTxnProcessor;
@ -240,7 +242,8 @@ class DDTxnProcessorImpl {
UID distributorId,
MoveKeysLock moveKeysLock,
std::vector<Optional<Key>> remoteDcIds,
const DDEnabledState* ddEnabledState) {
const DDEnabledState* ddEnabledState,
SkipDDModeCheck skipDDModeCheck) {
state Reference<InitialDataDistribution> result = makeReference<InitialDataDistribution>();
state Key beginKey = allKeys.begin;
@ -253,6 +256,7 @@ class DDTxnProcessorImpl {
state std::vector<std::pair<StorageServerInterface, ProcessClass>> tss_servers;
state int numDataMoves = 0;
CODE_PROBE((bool)skipDDModeCheck, "DD Mode won't prevent read initial data distribution.");
// Get the server list in its own try/catch block since it modifies result. We don't want a subsequent failure
// causing entries to be duplicated
loop {
@ -285,7 +289,7 @@ class DDTxnProcessorImpl {
BinaryReader rd(mode.get(), Unversioned());
rd >> result->mode;
}
if (!result->mode || !ddEnabledState->isDDEnabled()) {
if ((!skipDDModeCheck && !result->mode) || !ddEnabledState->isDDEnabled()) {
// DD can be disabled persistently (result->mode = 0) or transiently (isDDEnabled() = 0)
TraceEvent(SevDebug, "GetInitialDataDistribution_DisabledDD").log();
return result;
@ -620,8 +624,10 @@ Future<Reference<InitialDataDistribution>> DDTxnProcessor::getInitialDataDistrib
const UID& distributorId,
const MoveKeysLock& moveKeysLock,
const std::vector<Optional<Key>>& remoteDcIds,
const DDEnabledState* ddEnabledState) {
return DDTxnProcessorImpl::getInitialDataDistribution(cx, distributorId, moveKeysLock, remoteDcIds, ddEnabledState);
const DDEnabledState* ddEnabledState,
SkipDDModeCheck skipDDModeCheck) {
return DDTxnProcessorImpl::getInitialDataDistribution(
cx, distributorId, moveKeysLock, remoteDcIds, ddEnabledState, skipDDModeCheck);
}
Future<Void> DDTxnProcessor::waitForDataDistributionEnabled(const DDEnabledState* ddEnabledState) const {
@ -681,6 +687,33 @@ Future<std::vector<ProcessData>> DDTxnProcessor::getWorkers() const {
return ::getWorkers(cx);
}
Future<Void> DDTxnProcessor::rawStartMovement(MoveKeysParams& params,
std::map<UID, StorageServerInterface>& tssMapping) {
return ::rawStartMovement(cx, params, tssMapping);
}
Future<Void> DDTxnProcessor::rawFinishMovement(MoveKeysParams& params,
const std::map<UID, StorageServerInterface>& tssMapping) {
return ::rawFinishMovement(cx, params, tssMapping);
}
struct DDMockTxnProcessorImpl {
ACTOR static Future<Void> moveKeys(DDMockTxnProcessor* self, MoveKeysParams params) {
state std::map<UID, StorageServerInterface> tssMapping;
self->rawStartMovement(params, tssMapping);
ASSERT(tssMapping.empty());
if (BUGGIFY_WITH_PROB(0.5)) {
wait(delayJittered(5.0));
}
self->rawFinishMovement(params, tssMapping);
if (!params.dataMovementComplete.isSet())
params.dataMovementComplete.send(Void());
return Void();
}
};
Future<ServerWorkerInfos> DDMockTxnProcessor::getServerListAndProcessClasses() {
ServerWorkerInfos res;
for (auto& [_, mss] : mgs->allServers) {
@ -757,7 +790,8 @@ Future<Reference<InitialDataDistribution>> DDMockTxnProcessor::getInitialDataDis
const UID& distributorId,
const MoveKeysLock& moveKeysLock,
const std::vector<Optional<Key>>& remoteDcIds,
const DDEnabledState* ddEnabledState) {
const DDEnabledState* ddEnabledState,
SkipDDModeCheck skipDDModeCheck) {
// FIXME: now we just ignore ddEnabledState and moveKeysLock, will fix it in the future
Reference<InitialDataDistribution> res = makeReference<InitialDataDistribution>();
@ -817,9 +851,10 @@ void DDMockTxnProcessor::setupMockGlobalState(Reference<InitialDataDistribution>
mgs->shardMapping->setCheckMode(ShardsAffectedByTeamFailure::CheckMode::Normal);
}
// FIXME: finish moveKeys implementation
Future<Void> DDMockTxnProcessor::moveKeys(const MoveKeysParams& params) {
UNREACHABLE();
// Not support location metadata yet
ASSERT(!SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
return DDMockTxnProcessorImpl::moveKeys(this, params);
}
// FIXME: finish implementation
@ -851,3 +886,48 @@ Future<std::pair<Optional<StorageMetrics>, int>> DDMockTxnProcessor::waitStorage
Future<std::vector<ProcessData>> DDMockTxnProcessor::getWorkers() const {
return Future<std::vector<ProcessData>>();
}
void DDMockTxnProcessor::rawStartMovement(MoveKeysParams& params, std::map<UID, StorageServerInterface>& tssMapping) {
FlowLock::Releaser releaser(*params.startMoveKeysParallelismLock);
// A wait(take()) here would always return immediately because there won't be parallel rawStart or rawFinish
// calls in the mock world: the *mock* transaction code below always finishes without a coroutine switch.
ASSERT(params.startMoveKeysParallelismLock->take().isReady());
std::vector<ShardsAffectedByTeamFailure::Team> destTeams;
destTeams.emplace_back(params.destinationTeam, true);
mgs->shardMapping->moveShard(params.keys, destTeams);
for (auto& id : params.destinationTeam) {
mgs->allServers.at(id).setShardStatus(params.keys, MockShardStatus::INFLIGHT, mgs->restrictSize);
}
}
void DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params,
const std::map<UID, StorageServerInterface>& tssMapping) {
FlowLock::Releaser releaser(*params.finishMoveKeysParallelismLock);
// A wait(take) here would always return immediately because there won't be parallel rawStart or rawFinish calls
// in the mock world: the following *mock* transaction code always finishes without a coroutine switch.
ASSERT(params.finishMoveKeysParallelismLock->take().isReady());
// get source and dest teams
auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsFor(params.keys);
ASSERT_EQ(destTeams.size(), 1);
if (destTeams.front() != ShardsAffectedByTeamFailure::Team{ params.destinationTeam, true }) {
TraceEvent(SevError, "MockRawFinishMovementError")
.detail("Reason", "InconsistentDestinations")
.detail("ShardMappingDest", describe(destTeams.front().servers))
.detail("ParamDest", describe(params.destinationTeam));
ASSERT(false); // This shouldn't happen because the overlapped key range movement won't be executed in parallel
}
for (auto& id : params.destinationTeam) {
mgs->allServers.at(id).setShardStatus(params.keys, MockShardStatus::COMPLETED, mgs->restrictSize);
}
ASSERT_EQ(srcTeams.size(), 1);
for (auto& id : srcTeams.front().servers) {
mgs->allServers.at(id).removeShard(params.keys);
}
mgs->shardMapping->finishMove(params.keys);
}
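The Releaser-then-ASSERT(take().isReady()) pattern in both raw methods asserts non-contention rather than actually queueing: in the mock world the body runs without a coroutine switch, so the permit must be free. A simplified RAII sketch of that pattern (toy code; Flow's FlowLock is more general):
#include <cassert>
// Toy counting lock standing in for Flow's FlowLock (illustration only).
struct ToyFlowLock {
	int permits = 1;
	bool tryTake() {
		if (permits == 0)
			return false;
		--permits;
		return true;
	}
	void release() { ++permits; }
};
// RAII releaser, analogous in spirit to FlowLock::Releaser.
struct ToyReleaser {
	ToyFlowLock& lock;
	explicit ToyReleaser(ToyFlowLock& l) : lock(l) {}
	~ToyReleaser() { lock.release(); } // runs on scope exit, even on throw
};
int main() {
	ToyFlowLock parallelismLock;
	ToyReleaser releaser(parallelismLock);
	assert(parallelismLock.tryTake()); // mirrors ASSERT(take().isReady())
	return 0;
}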

View File

@ -316,7 +316,8 @@ public:
ddId,
lock,
configuration.usableRegions > 1 ? remoteDcIds : std::vector<Optional<Key>>(),
context->ddEnabledState.get()));
context->ddEnabledState.get(),
SkipDDModeCheck::False));
}
void initDcInfo() {
@ -692,6 +693,10 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
"DDTenantCacheMonitor",
self->ddId,
&normalDDQueueErrors()));
actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageUsage(),
"StorageUsageTracker",
self->ddId,
&normalDDQueueErrors()));
}
std::vector<DDTeamCollection*> teamCollectionsPtrs;

View File

@ -429,7 +429,7 @@ public:
waitfor.push_back(self->files[1].f->write(pageData.begin(), pageData.size(), self->writingPos));
self->writingPos += pageData.size();
return waitForAll(waitfor);
return waitForAllReadyThenThrow(waitfor);
}
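The switch from waitForAll to waitForAllReadyThenThrow changes error timing: every outstanding write is allowed to settle before the first error is rethrown, instead of an error propagating while sibling disk I/O is still in flight. A standalone sketch of that semantics using std::future (illustrative only; Flow futures differ):
#include <exception>
#include <future>
#include <vector>
// Block until every future has settled, then rethrow the first stored
// error, if any. A naive loop over get() would throw on the first
// failure and abandon the remaining in-flight operations.
void waitForAllReadyThenThrowSketch(std::vector<std::future<void>>& futures) {
	std::exception_ptr firstError;
	for (auto& f : futures) {
		try {
			f.get(); // waits for readiness; throws the stored exception, if any
		} catch (...) {
			if (!firstError)
				firstError = std::current_exception();
		}
	}
	if (firstError)
		std::rethrow_exception(firstError);
}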
// Write the given data (pageData) to the queue files of self, sync data to disk, and delete the memory (pageMem)
@ -655,7 +655,7 @@ public:
for (int i = 0; i < 2; i++)
if (self->files[i].size > 0)
reads.push_back(self->files[i].f->read(self->firstPages[i], sizeof(Page), 0));
wait(waitForAll(reads));
wait(waitForAllReadyThenThrow(reads));
// Determine which file comes first
if (compare(self->firstPages[1], self->firstPages[0])) {
@ -743,7 +743,10 @@ public:
}
// Read nPages from pageOffset*sizeof(Page) offset in file self->files[file]
ACTOR static Future<Standalone<StringRef>> read(RawDiskQueue_TwoFiles* self, int file, int pageOffset, int nPages) {
ACTOR static UNCANCELLABLE Future<Standalone<StringRef>> read(RawDiskQueue_TwoFiles* self,
int file,
int pageOffset,
int nPages) {
state TrackMe trackMe(self);
state const size_t bytesRequested = nPages * sizeof(Page);
state Standalone<StringRef> result = makeAlignedString(sizeof(Page), bytesRequested);

View File

@ -31,6 +31,7 @@
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/ServerDBInfo.h"
#include "flow/Arena.h"
#include "flow/CodeProbe.h"
#include "flow/EncryptUtils.h"
#include "flow/Error.h"
#include "flow/EventTypes.actor.h"
@ -387,6 +388,15 @@ ACTOR Future<Void> getCipherKeysByBaseCipherKeyIds(Reference<EncryptKeyProxyData
try {
KmsConnLookupEKsByKeyIdsReq keysByIdsReq;
for (const auto& item : lookupCipherInfoMap) {
// TODO: Currently getEncryptCipherKeys does not pass the domain name; once that is fixed, we can remove
// the check for an empty domain name
if (!item.second.domainName.empty()) {
if (item.second.domainId == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (item.second.domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
}
keysByIdsReq.encryptKeyInfos.emplace_back_deep(
keysByIdsReq.arena, item.second.domainId, item.second.baseCipherId, item.second.domainName);
}
@ -452,6 +462,8 @@ ACTOR Future<Void> getCipherKeysByBaseCipherKeyIds(Reference<EncryptKeyProxyData
keyIdsReply.numHits = cachedCipherDetails.size();
keysByIds.reply.send(keyIdsReply);
CODE_PROBE(!lookupCipherInfoMap.empty(), "EKP fetch cipherKeys by KeyId from KMS");
return Void();
}
@ -475,13 +487,13 @@ ACTOR Future<Void> getLatestCipherKeys(Reference<EncryptKeyProxyData> ekpProxyDa
// Dedup the requested domainIds.
// TODO: endpoint serialization of std::unordered_set isn't working at the moment
std::unordered_map<EncryptCipherDomainId, EKPGetLatestCipherKeysRequestInfo> dedupedDomainInfos;
for (const auto info : req.encryptDomainInfos) {
for (const auto& info : req.encryptDomainInfos) {
dedupedDomainInfos.emplace(info.domainId, info);
}
if (dbgTrace.present()) {
dbgTrace.get().detail("NKeys", dedupedDomainInfos.size());
for (const auto info : dedupedDomainInfos) {
for (const auto& info : dedupedDomainInfos) {
// log encryptDomainIds queried
dbgTrace.get().detail(
getEncryptDbgTraceKey(ENCRYPT_DBG_TRACE_QUERY_PREFIX, info.first, info.second.domainName), "");
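The dedup loop above relies on unordered_map::emplace keeping the first-seen entry for a duplicate domainId; later duplicates are ignored. A quick standalone check of that semantics:
#include <cassert>
#include <string>
#include <unordered_map>
int main() {
	std::unordered_map<long long, std::string> dedup;
	dedup.emplace(1, "first");
	dedup.emplace(1, "second"); // no-op: key 1 is already present
	assert(dedup.at(1) == "first");
	return 0;
}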
@ -524,6 +536,11 @@ ACTOR Future<Void> getLatestCipherKeys(Reference<EncryptKeyProxyData> ekpProxyDa
try {
KmsConnLookupEKsByDomainIdsReq keysByDomainIdReq;
for (const auto& item : lookupCipherDomains) {
if (item.second.domainId == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
} else if (item.second.domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(item.second.domainName == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
keysByDomainIdReq.encryptDomainInfos.emplace_back_deep(
keysByDomainIdReq.arena, item.second.domainId, item.second.domainName);
}
@ -588,6 +605,8 @@ ACTOR Future<Void> getLatestCipherKeys(Reference<EncryptKeyProxyData> ekpProxyDa
latestCipherReply.numHits = cachedCipherDetails.size();
latestKeysReq.reply.send(latestCipherReply);
CODE_PROBE(!lookupCipherDomains.empty(), "EKP fetch latest cipherKeys from KMS");
return Void();
}
@ -610,7 +629,7 @@ bool isBlobMetadataEligibleForRefresh(const BlobMetadataDetailsRef& blobMetadata
return nextRefreshCycleTS > blobMetadata.expireAt || nextRefreshCycleTS > blobMetadata.refreshAt;
}
ACTOR Future<Void> refreshEncryptionKeysCore(Reference<EncryptKeyProxyData> ekpProxyData,
ACTOR Future<Void> refreshEncryptionKeysImpl(Reference<EncryptKeyProxyData> ekpProxyData,
KmsConnectorInterface kmsConnectorInf) {
state UID debugId = deterministicRandom()->randomUniqueID();
@ -672,6 +691,7 @@ ACTOR Future<Void> refreshEncryptionKeysCore(Reference<EncryptKeyProxyData> ekpP
ekpProxyData->baseCipherKeysRefreshed += rep.cipherKeyDetails.size();
t.detail("NumKeys", rep.cipherKeyDetails.size());
CODE_PROBE(!rep.cipherKeyDetails.empty(), "EKP refresh cipherKeys");
} catch (Error& e) {
if (!canReplyWith(e)) {
TraceEvent(SevWarn, "RefreshEKsError").error(e);
@ -685,7 +705,7 @@ ACTOR Future<Void> refreshEncryptionKeysCore(Reference<EncryptKeyProxyData> ekpP
}
Future<Void> refreshEncryptionKeys(Reference<EncryptKeyProxyData> ekpProxyData, KmsConnectorInterface kmsConnectorInf) {
return refreshEncryptionKeysCore(ekpProxyData, kmsConnectorInf);
return refreshEncryptionKeysImpl(ekpProxyData, kmsConnectorInf);
}
ACTOR Future<Void> getLatestBlobMetadata(Reference<EncryptKeyProxyData> ekpProxyData,
@ -775,7 +795,7 @@ ACTOR Future<Void> refreshBlobMetadataCore(Reference<EncryptKeyProxyData> ekpPro
state UID debugId = deterministicRandom()->randomUniqueID();
state double startTime;
state TraceEvent t("RefreshBlobMetadata_Start", ekpProxyData->myId);
state TraceEvent t("RefreshBlobMetadataStart", ekpProxyData->myId);
t.setMaxEventLength(SERVER_KNOBS->ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH);
t.detail("KmsConnInf", kmsConnectorInf.id());
t.detail("DebugId", debugId);
@ -817,7 +837,7 @@ ACTOR Future<Void> refreshBlobMetadataCore(Reference<EncryptKeyProxyData> ekpPro
t.detail("nKeys", rep.metadataDetails.size());
} catch (Error& e) {
if (!canReplyWith(e)) {
TraceEvent("RefreshBlobMetadata_Error").error(e);
TraceEvent("RefreshBlobMetadataError").error(e);
throw e;
}
TraceEvent("RefreshBlobMetadata").detail("ErrorCode", e.code());
@ -832,24 +852,25 @@ void refreshBlobMetadata(Reference<EncryptKeyProxyData> ekpProxyData, KmsConnect
}
void activateKmsConnector(Reference<EncryptKeyProxyData> ekpProxyData, KmsConnectorInterface kmsConnectorInf) {
if (g_network->isSimulated() || (SERVER_KNOBS->KMS_CONNECTOR_TYPE.compare(FDB_PREF_KMS_CONNECTOR_TYPE_STR) == 0)) {
ekpProxyData->kmsConnector = std::make_unique<SimKmsConnector>();
if (g_network->isSimulated()) {
ekpProxyData->kmsConnector = std::make_unique<SimKmsConnector>(FDB_SIM_KMS_CONNECTOR_TYPE_STR);
} else if (SERVER_KNOBS->KMS_CONNECTOR_TYPE.compare(FDB_PREF_KMS_CONNECTOR_TYPE_STR) == 0) {
ekpProxyData->kmsConnector = std::make_unique<SimKmsConnector>(FDB_PREF_KMS_CONNECTOR_TYPE_STR);
} else if (SERVER_KNOBS->KMS_CONNECTOR_TYPE.compare(REST_KMS_CONNECTOR_TYPE_STR) == 0) {
ekpProxyData->kmsConnector = std::make_unique<RESTKmsConnector>();
ekpProxyData->kmsConnector = std::make_unique<RESTKmsConnector>(REST_KMS_CONNECTOR_TYPE_STR);
} else {
throw not_implemented();
}
TraceEvent("EKPActiveKmsConnector", ekpProxyData->myId)
.detail("ConnectorType",
g_network->isSimulated() ? FDB_SIM_KMS_CONNECTOR_TYPE_STR : SERVER_KNOBS->KMS_CONNECTOR_TYPE)
.detail("ConnectorType", ekpProxyData->kmsConnector->getConnectorStr())
.detail("InfId", kmsConnectorInf.id());
ekpProxyData->addActor.send(ekpProxyData->kmsConnector->connectorCore(kmsConnectorInf));
}
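The dispatch above now stores the connector-type string inside the connector itself, so the trace event reports whatever was actually constructed. A hypothetical factory sketch of that shape (names are illustrative, not FDB's):
#include <memory>
#include <stdexcept>
#include <string>
// Illustrative stand-in for a KMS connector that remembers its type.
struct ToyKmsConnector {
	std::string typeStr;
	explicit ToyKmsConnector(std::string t) : typeStr(std::move(t)) {}
	const std::string& getConnectorStr() const { return typeStr; }
};
std::unique_ptr<ToyKmsConnector> makeConnector(bool simulated, const std::string& knobType) {
	if (simulated)
		return std::make_unique<ToyKmsConnector>("SimKmsConnector");
	if (knobType == "RESTKmsConnector")
		return std::make_unique<ToyKmsConnector>(knobType);
	throw std::runtime_error("not_implemented"); // mirrors throw not_implemented()
}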
ACTOR Future<Void> encryptKeyProxyServer(EncryptKeyProxyInterface ekpInterface, Reference<AsyncVar<ServerDBInfo>> db) {
state Reference<EncryptKeyProxyData> self(new EncryptKeyProxyData(ekpInterface.id()));
state Reference<EncryptKeyProxyData> self = makeReference<EncryptKeyProxyData>(ekpInterface.id());
state Future<Void> collection = actorCollection(self->addActor.getFuture());
self->addActor.send(traceRole(Role::ENCRYPT_KEY_PROXY, ekpInterface.id()));

View File

@ -53,7 +53,11 @@ struct KeyValueStoreCompressTestData final : IKeyValueStore {
void set(KeyValueRef keyValue, const Arena* arena = nullptr) override {
store->set(KeyValueRef(keyValue.key, pack(keyValue.value)), arena);
}
void clear(KeyRangeRef range, const Arena* arena = nullptr) override { store->clear(range, arena); }
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) override {
store->clear(range, storageMetrics, arena);
}
Future<Void> commit(bool sequential = false) override { return store->commit(sequential); }
Future<Optional<Value>> readValue(KeyRef key, Optional<ReadOptions> options) override {

View File

@ -130,7 +130,7 @@ public:
}
}
void clear(KeyRangeRef range, const Arena* arena) override {
void clear(KeyRangeRef range, const StorageServerMetrics* storageMetrics, const Arena* arena) override {
// A commit that occurs with no available space returns Never, so we can throw out all modifications
if (getAvailableSize() <= 0)
return;

View File

@ -1846,22 +1846,52 @@ struct RocksDBKeyValueStore : IKeyValueStore {
void set(KeyValueRef kv, const Arena*) override {
if (writeBatch == nullptr) {
writeBatch.reset(new rocksdb::WriteBatch());
keysSet.clear();
}
ASSERT(defaultFdbCF != nullptr);
writeBatch->Put(defaultFdbCF, toSlice(kv.key), toSlice(kv.value));
if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE) {
keysSet.insert(kv.key);
}
}
void clear(KeyRangeRef keyRange, const Arena*) override {
void clear(KeyRangeRef keyRange, const StorageServerMetrics* storageMetrics, const Arena*) override {
if (writeBatch == nullptr) {
writeBatch.reset(new rocksdb::WriteBatch());
keysSet.clear();
}
ASSERT(defaultFdbCF != nullptr);
if (keyRange.singleKeyRange()) {
writeBatch->Delete(defaultFdbCF, toSlice(keyRange.begin));
} else {
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE && storageMetrics != nullptr &&
storageMetrics->byteSample.getEstimate(keyRange) <
SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT) {
rocksdb::ReadOptions options = sharedState->getReadOptions();
auto beginSlice = toSlice(keyRange.begin);
auto endSlice = toSlice(keyRange.end);
options.iterate_lower_bound = &beginSlice;
options.iterate_upper_bound = &endSlice;
auto cursor = std::unique_ptr<rocksdb::Iterator>(db->NewIterator(options, defaultFdbCF));
cursor->Seek(toSlice(keyRange.begin));
while (cursor->Valid() && toStringRef(cursor->key()) < keyRange.end) {
writeBatch->Delete(defaultFdbCF, cursor->key());
cursor->Next();
}
if (!cursor->status().ok()) {
// If the read-range iteration fails, fall back to a DeleteRange.
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
} else {
auto it = keysSet.lower_bound(keyRange.begin);
while (it != keysSet.end() && *it < keyRange.end) {
writeBatch->Delete(defaultFdbCF, toSlice(*it));
it++;
}
}
} else {
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
}
}
}
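The motivation for the branch above: RocksDB's DeleteRange writes a range tombstone, which is cheap to write but slows iterators until compaction; for small ranges (as estimated by the byte sample) issuing point deletes avoids tombstone buildup, and keysSet additionally covers keys staged in the current, not-yet-committed batch that the iterator cannot see. A hypothetical helper capturing the decision rule (names are illustrative):
#include <cstdint>
// Returns true when a clear-range should be lowered to point deletes.
// Point deletes avoid range tombstones but require enumerating the keys,
// so they only pay off for small ranges.
bool shouldUseSingleKeyDeletes(bool knobEnabled,
                               bool haveByteSample,
                               uint64_t estimatedBytesInRange,
                               uint64_t singleKeyDeleteBytesLimit) {
	return knobEnabled && haveByteSample && estimatedBytesInRange < singleKeyDeleteBytesLimit;
}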
@ -1890,6 +1920,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
}
auto a = new Writer::CommitAction();
a->batchToCommit = std::move(writeBatch);
keysSet.clear();
auto res = a->done.getFuture();
writeThread->post(a);
return res;
@ -2083,6 +2114,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Promise<Void> closePromise;
Future<Void> openFuture;
std::unique_ptr<rocksdb::WriteBatch> writeBatch;
std::set<Key> keysSet;
Optional<Future<Void>> metrics;
FlowLock readSemaphore;
int numReadWaiters;

View File

@ -1603,7 +1603,9 @@ public:
StorageBytes getStorageBytes() const override;
void set(KeyValueRef keyValue, const Arena* arena = nullptr) override;
void clear(KeyRangeRef range, const Arena* arena = nullptr) override;
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = nullptr) override;
Future<Void> commit(bool sequential = false) override;
Future<Optional<Value>> readValue(KeyRef key, Optional<ReadOptions> options) override;
@ -2222,7 +2224,7 @@ void KeyValueStoreSQLite::set(KeyValueRef keyValue, const Arena* arena) {
++writesRequested;
writeThread->post(new Writer::SetAction(keyValue));
}
void KeyValueStoreSQLite::clear(KeyRangeRef range, const Arena* arena) {
void KeyValueStoreSQLite::clear(KeyRangeRef range, const StorageServerMetrics* storageMetrics, const Arena* arena) {
++writesRequested;
writeThread->post(new Writer::ClearAction(range));
}
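The interface change threaded through these stores: clear() gains a defaulted StorageServerMetrics pointer, so engines that can exploit the byte sample (RocksDB) receive it while the others (SQLite, memory) simply ignore it and old call sites keep compiling. A simplified sketch of that signature evolution (toy types, not the real IKeyValueStore):
struct ToyMetrics {}; // stand-in for StorageServerMetrics
struct ToyArena {}; // stand-in for Arena
struct ToyRange {}; // stand-in for KeyRangeRef
struct ToyKeyValueStore {
	// Defaulted parameters keep existing two-argument call sites valid.
	virtual void clear(ToyRange range, const ToyMetrics* metrics = nullptr, const ToyArena* arena = nullptr) = 0;
	virtual ~ToyKeyValueStore() = default;
};
struct ToySqliteStore final : ToyKeyValueStore {
	// Engines that cannot use the metrics simply ignore the new parameter.
	void clear(ToyRange, const ToyMetrics*, const ToyArena*) override {}
};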

View File

@ -49,6 +49,7 @@ static_assert((ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR == 27) ? ROCKSDB_PATCH >= 3 :
"Unsupported rocksdb version. Update the rocksdb to 6.27.3 version");
const std::string rocksDataFolderSuffix = "-data";
const std::string METADATA_SHARD_ID = "kvs-metadata";
const KeyRef shardMappingPrefix("\xff\xff/ShardMapping/"_sr);
// TODO: move constants to a header file.
const StringRef ROCKSDBSTORAGE_HISTOGRAM_GROUP = "RocksDBStorage"_sr;
@ -304,13 +305,12 @@ rocksdb::ReadOptions getReadOptions() {
}
struct ReadIterator {
rocksdb::ColumnFamilyHandle* cf;
uint64_t index; // incrementing counter to uniquely identify read iterator.
bool inUse;
std::shared_ptr<rocksdb::Iterator> iter;
double creationTime;
ReadIterator(rocksdb::ColumnFamilyHandle* cf, uint64_t index, rocksdb::DB* db, rocksdb::ReadOptions& options)
: cf(cf), index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {}
: index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {}
};
/*
@ -475,13 +475,26 @@ struct PhysicalShard {
}
~PhysicalShard() {
if (!deletePending)
return;
logShardEvent(id, ShardOp::CLOSE);
isInitialized.store(false);
readIterPool.reset();
// Destroy CF
auto s = db->DropColumnFamily(cf);
// Deleting default column family is not allowed.
if (id == "default") {
return;
}
if (deletePending) {
auto s = db->DropColumnFamily(cf);
if (!s.ok()) {
logRocksDBError(s, "DestroyShard");
logShardEvent(id, ShardOp::DESTROY, SevError, s.ToString());
return;
}
}
auto s = db->DestroyColumnFamilyHandle(cf);
if (!s.ok()) {
logRocksDBError(s, "DestroyShard");
logRocksDBError(s, "DestroyCFHandle");
logShardEvent(id, ShardOp::DESTROY, SevError, s.ToString());
return;
}
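For reference, a minimal sketch (assuming an open rocksdb::DB*; simplified from the destructor above) of the teardown order: optionally drop the column family's data, then always destroy the handle; RocksDB forbids dropping the default column family.
#include <rocksdb/db.h>
void closeColumnFamilySketch(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf, bool dropData) {
	if (cf->GetName() == "default")
		return; // dropping the default column family is not allowed
	if (dropData) {
		rocksdb::Status s = db->DropColumnFamily(cf); // marks the CF's data for deletion
		if (!s.ok())
			return; // keep the handle alive if the drop failed, matching the code above
	}
	db->DestroyColumnFamilyHandle(cf); // always release the handle itself
}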
@ -628,7 +641,7 @@ public:
std::vector<rocksdb::ColumnFamilyDescriptor> descriptors;
bool foundMetadata = false;
for (const auto& name : columnFamilies) {
if (name == "kvs-metadata") {
if (name == METADATA_SHARD_ID) {
foundMetadata = true;
}
descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, cfOptions });
@ -652,19 +665,19 @@ public:
TraceEvent(SevInfo, "ShardedRocksInitLoadPhysicalShards", this->logId)
.detail("PhysicalShardCount", handles.size());
std::shared_ptr<PhysicalShard> metadataShard = nullptr;
for (auto handle : handles) {
if (handle->GetName() == "kvs-metadata") {
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata", handle);
} else {
physicalShards[handle->GetName()] = std::make_shared<PhysicalShard>(db, handle->GetName(), handle);
auto shard = std::make_shared<PhysicalShard>(db, handle->GetName(), handle);
if (shard->id == METADATA_SHARD_ID) {
metadataShard = shard;
}
physicalShards[shard->id] = shard;
columnFamilyMap[handle->GetID()] = handle;
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId)
.detail("PhysicalShard", handle->GetName());
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId).detail("PhysicalShard", shard->id);
}
std::set<std::string> unusedShards(columnFamilies.begin(), columnFamilies.end());
unusedShards.erase("kvs-metadata");
unusedShards.erase(METADATA_SHARD_ID);
unusedShards.erase("default");
KeyRange keyRange = prefixRange(shardMappingPrefix);
@ -746,9 +759,11 @@ public:
defaultShard->dataShards[specialKeys.begin.toString()] = std::move(dataShard);
physicalShards[defaultShard->id] = defaultShard;
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata");
// Create metadata shard.
auto metadataShard = std::make_shared<PhysicalShard>(db, METADATA_SHARD_ID);
metadataShard->init();
columnFamilyMap[metadataShard->cf->GetID()] = metadataShard->cf;
physicalShards[METADATA_SHARD_ID] = metadataShard;
// Write special key range metadata.
writeBatch = std::make_unique<rocksdb::WriteBatch>();
@ -763,7 +778,6 @@ public:
TraceEvent(SevInfo, "ShardedRocksInitializeMetaDataShard", this->logId)
.detail("MetadataShardCF", metadataShard->cf->GetID());
}
physicalShards["kvs-metadata"] = metadataShard;
writeBatch = std::make_unique<rocksdb::WriteBatch>();
dirtyShards = std::make_unique<std::set<PhysicalShard*>>();
@ -910,6 +924,9 @@ public:
std::vector<std::shared_ptr<PhysicalShard>> getPendingDeletionShards(double cleanUpDelay) {
std::vector<std::shared_ptr<PhysicalShard>> emptyShards;
double currentTime = now();
TraceEvent(SevInfo, "ShardedRocksDB", logId)
.detail("PendingDeletionShardQueueSize", pendingDeletionShards.size());
while (!pendingDeletionShards.empty()) {
const auto& id = pendingDeletionShards.front();
auto it = physicalShards.find(id);
@ -976,6 +993,10 @@ public:
.detail("Info", "RangeToPersist")
.detail("BeginKey", range.begin)
.detail("EndKey", range.end);
auto it = physicalShards.find(METADATA_SHARD_ID);
ASSERT(it != physicalShards.end());
auto metadataShard = it->second;
writeBatch->DeleteRange(metadataShard->cf,
getShardMappingKey(range.begin, shardMappingPrefix),
getShardMappingKey(range.end, shardMappingPrefix));
@ -1043,24 +1064,30 @@ public:
}
void closeAllShards() {
for (auto& [_, shard] : physicalShards) {
shard->readIterPool.reset();
}
columnFamilyMap.clear();
physicalShards.clear();
// Close DB.
auto s = db->Close();
if (!s.ok()) {
logRocksDBError(s, "Close");
return;
}
TraceEvent("ShardedRocksDB", this->logId).detail("Info", "DBClosed");
}
void destroyAllShards() {
closeAllShards();
std::vector<rocksdb::ColumnFamilyDescriptor> cfs;
for (const auto& [key, _] : physicalShards) {
cfs.push_back(rocksdb::ColumnFamilyDescriptor{ key, getCFOptions() });
columnFamilyMap.clear();
for (auto& [_, shard] : physicalShards) {
shard->deletePending = true;
}
auto s = rocksdb::DestroyDB(path, getOptions(), cfs);
physicalShards.clear();
// Close DB.
auto s = db->Close();
if (!s.ok()) {
logRocksDBError(s, "Close");
return;
}
s = rocksdb::DestroyDB(path, getOptions());
if (!s.ok()) {
logRocksDBError(s, "DestroyDB");
}
@ -1121,7 +1148,6 @@ private:
std::unique_ptr<rocksdb::WriteBatch> writeBatch;
std::unique_ptr<std::set<PhysicalShard*>> dirtyShards;
KeyRangeMap<DataShard*> dataShardMap;
std::shared_ptr<PhysicalShard> metadataShard = nullptr;
std::deque<std::string> pendingDeletionShards;
};
@ -2240,6 +2266,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
// TODO: Adapt the simulation framework to not advance time quickly when background reads/writes are
// occurring.
if (g_network->isSimulated()) {
TraceEvent(SevDebug, "ShardedRocksDB").detail("Info", "Use Coro threads in simulation.");
writeThread = CoroThreadPool::createThreadPool();
readThreads = CoroThreadPool::createThreadPool();
} else {
@ -2316,7 +2343,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
void set(KeyValueRef kv, const Arena*) override { shardManager.put(kv.key, kv.value); }
void clear(KeyRangeRef range, const Arena*) override {
void clear(KeyRangeRef range, const StorageServerMetrics*, const Arena*) override {
if (range.singleKeyRange()) {
shardManager.clear(range.begin);
} else {

View File

@ -31,6 +31,106 @@ bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus s
return true;
}
void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status, bool restrictSize) {
auto ranges = serverKeys.intersectingRanges(range);
ASSERT(!ranges.empty());
if (ranges.begin().range().contains(range)) {
CODE_PROBE(true, "Implicitly split single shard to 3 pieces");
threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize);
return;
}
if (ranges.begin().begin() < range.begin) {
CODE_PROBE(true, "Implicitly split begin range to 2 pieces");
twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize);
}
if (ranges.end().end() > range.end) {
CODE_PROBE(true, "Implicitly split end range to 2 pieces");
twoWayShardSplitting(ranges.end().range(), range.end, ranges.end().cvalue().shardSize, restrictSize);
}
ranges = serverKeys.containedRanges(range);
// now the boundaries must be aligned
ASSERT(ranges.begin().begin() == range.begin);
ASSERT(ranges.end().end() == range.end);
uint64_t newSize = 0;
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
newSize += it->cvalue().shardSize;
}
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
auto oldStatus = it.value().status;
if (isStatusTransitionValid(oldStatus, status)) {
it.value() = ShardInfo{ status, newSize };
} else if (oldStatus == MockShardStatus::COMPLETED && status == MockShardStatus::INFLIGHT) {
CODE_PROBE(true, "Shard already on server");
} else {
TraceEvent(SevError, "MockShardStatusTransitionError")
.detail("From", oldStatus)
.detail("To", status)
.detail("ID", id)
.detail("KeyBegin", range.begin.toHexString())
.detail("KeyEnd", range.begin.toHexString());
}
}
serverKeys.coalesce(range);
}
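A standalone illustration (std::map as a toy range map; FDB's KeyRangeMap differs) of the boundary alignment that setShardStatus performs before mutating statuses: both range.begin and range.end must become split points.
#include <cassert>
#include <iterator>
#include <map>
#include <string>
// Toy range map: key = start of a range, value = status; an entry for ""
// is assumed to always exist, covering the whole keyspace.
using ToyRangeMap = std::map<std::string, int>;
void alignBoundary(ToyRangeMap& m, const std::string& point) {
	auto it = std::prev(m.upper_bound(point)); // range containing `point`
	if (it->first != point)
		m.emplace(point, it->second); // split: a new range starts at `point`
}
int main() {
	ToyRangeMap m{ { "", 0 } };
	alignBoundary(m, "b");
	alignBoundary(m, "d");
	assert(m.count("b") == 1 && m.count("d") == 1); // boundaries now aligned
	return 0;
}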
// Split the outer range [a, d) at the inner range's boundaries [b, c). The result is [a, b), [b, c), [c, d). The
// sizes of the new shards are randomly derived from the old size of [a, d)
void MockStorageServer::threeWayShardSplitting(KeyRangeRef outerRange,
KeyRangeRef innerRange,
uint64_t outerRangeSize,
bool restrictSize) {
ASSERT(outerRange.contains(innerRange));
Key left = outerRange.begin;
// Randomly generate 3 shard sizes; the caller guarantees that the min and max parameters are always valid.
int leftSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - 2 * SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
int midSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - leftSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
int rightSize =
restrictSize ? outerRangeSize - leftSize - midSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
serverKeys.insert(innerRange, { serverKeys[left].status, (uint64_t)midSize });
serverKeys[left].shardSize = leftSize;
serverKeys[innerRange.end].shardSize = rightSize;
}
// Split the range [a, c) at split point b. The result is [a, b), [b, c). The
// sizes of the new shards are randomly derived from the old size of [a, c)
void MockStorageServer::twoWayShardSplitting(KeyRangeRef range,
KeyRef splitPoint,
uint64_t rangeSize,
bool restrictSize) {
Key left = range.begin;
// Randomly generate 2 shard sizes; the caller guarantees that the min and max parameters are always valid.
int leftSize = deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? rangeSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1
: SERVER_KNOBS->MAX_SHARD_BYTES);
int rightSize =
restrictSize ? rangeSize - leftSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
serverKeys.rawInsert(splitPoint, { serverKeys[left].status, (uint64_t)rightSize });
serverKeys[left].shardSize = leftSize;
}
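A standalone arithmetic check of the size invariant behind the restrictSize path in both splitting helpers: the randomly chosen pieces always partition the old shard size, with every piece at least MIN_SHARD_BYTES (toy constants below; FDB's knobs differ).
#include <cassert>
#include <cstdint>
#include <random>
int main() {
	std::mt19937_64 rng(42);
	const uint64_t minShard = 50, oldSize = 1000; // illustrative byte counts
	// Three-way split under restrictSize, mirroring threeWayShardSplitting.
	std::uniform_int_distribution<uint64_t> leftDist(minShard, oldSize - 2 * minShard);
	uint64_t left = leftDist(rng);
	std::uniform_int_distribution<uint64_t> midDist(minShard, oldSize - left - minShard);
	uint64_t mid = midDist(rng);
	uint64_t right = oldSize - left - mid; // remainder becomes the right piece
	assert(left + mid + right == oldSize);
	assert(left >= minShard && mid >= minShard && right >= minShard);
	return 0;
}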
void MockStorageServer::removeShard(KeyRangeRef range) {
auto ranges = serverKeys.containedRanges(range);
ASSERT(ranges.begin().range() == range);
serverKeys.rawErase(range);
}
uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const {
auto ranges = serverKeys.intersectingRanges(range);
uint64_t totalSize = 0;
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
totalSize += it->cvalue().shardSize;
}
return totalSize;
}
void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) {
ASSERT(conf.storageTeamSize > 0);
configuration = conf;
@ -104,8 +204,78 @@ TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") {
auto id = MockGlobalState::indexToUID(i);
std::cout << "Check server " << i << "\n";
ASSERT(mgs->serverIsSourceForShard(id, allKeys));
ASSERT(mgs->allServers.at(id).serverKeys.sumRange(allKeys.begin, allKeys.end) == 0);
ASSERT(mgs->allServers.at(id).sumRangeSize(allKeys) == 0);
}
return Void();
}
struct MockGlobalStateTester {
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, x2), [x2, r0.end)
void testThreeWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.ranges().begin();
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
MockShardStatus oldStatus = it.cvalue().status;
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
Key x1 = keyAfter(it->range().begin);
Key x2 = keyAfter(x1);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.threeWayShardSplitting(outerRange, KeyRangeRef(x1, x2), oldSize, false);
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, x2));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x2, outerRange.end));
ranges.pop_front();
ASSERT(ranges.empty());
}
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, r0.end)
void testTwoWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.nthRange(0);
MockShardStatus oldStatus = it.cvalue().status;
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
Key x1 = keyAfter(it->range().begin);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.twoWayShardSplitting(it->range(), x1, oldSize, false);
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, outerRange.end));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.empty());
}
};
TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 1;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
MockGlobalStateTester tester;
auto& mss = mgs->allServers.at(MockGlobalState::indexToUID(1));
std::cout << "Test 3-way splitting...\n";
tester.testThreeWaySplitFirstRange(mss);
std::cout << "Test 2-way splitting...\n";
mss.serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 }); // reset to empty
tester.testTwoWaySplitFirstRange(mss);
return Void();
}

Some files were not shown because too many files have changed in this diff