diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index e0a1fc31bb..d095353daf 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -274,85 +274,21 @@ if(NOT WIN32) @CLUSTER_FILE@ ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so ) - add_fdbclient_test( - NAME fdb_c_api_tests - DISABLE_LOG_DUMP - COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py - --cluster-file - @CLUSTER_FILE@ - --tester-binary - $<TARGET_FILE:fdb_c_api_tester> - --external-client-library - ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - --test-dir - ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests - --tmp-dir - @TMP_DIR@ - --log-dir - @LOG_DIR@ - ) - add_fdbclient_test( - NAME fdb_c_api_tests_local_only - DISABLE_LOG_DUMP - COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py - --cluster-file - @CLUSTER_FILE@ - --tester-binary - $<TARGET_FILE:fdb_c_api_tester> - --test-dir - ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/local_tests - --tmp-dir - @TMP_DIR@ - --log-dir - @LOG_DIR@ - ) - - add_fdbclient_test( - NAME fdb_c_api_tests_blob_granule - DISABLE_LOG_DUMP - API_TEST_BLOB_GRANULES_ENABLED - COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py - --cluster-file - @CLUSTER_FILE@ - --tester-binary - $<TARGET_FILE:fdb_c_api_tester> - --external-client-library - ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - --test-dir - ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/blobgranuletests - --blob-granule-local-file-path - @DATA_DIR@/fdbblob/ - --tmp-dir - @TMP_DIR@ - --log-dir - @LOG_DIR@ - ) - - add_fdbclient_test( - NAME fdb_c_api_tests_with_tls - DISABLE_LOG_DUMP - TLS_ENABLED - COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py - --cluster-file - @CLUSTER_FILE@ - --tester-binary - $<TARGET_FILE:fdb_c_api_tester> - --external-client-library - ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - --test-dir - ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests - --tmp-dir - @TMP_DIR@ - --log-dir - @LOG_DIR@ - --tls-cert-file - @CLIENT_CERT_FILE@ - --tls-key-file - @CLIENT_KEY_FILE@ - --tls-ca-file - @SERVER_CA_FILE@ - ) + file(GLOB API_TEST_FILES "${CMAKE_CURRENT_SOURCE_DIR}/test/apitester/tests/*.toml") + foreach(test_file ${API_TEST_FILES}) + get_filename_component(file_name "${test_file}" NAME_WE) + set(test_name "fdb_c_api_test_${file_name}") + add_test(NAME "${test_name}" + COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py + --build-dir ${CMAKE_BINARY_DIR} + --api-tester-bin $<TARGET_FILE:fdb_c_api_tester> + --external-client-library ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + --test-file ${test_file} + --knob delete-native-lib-after-loading=false + ) + set_tests_properties("${test_name}" PROPERTIES TIMEOUT 300) + endforeach() add_test(NAME fdb_c_upgrade_to_future_version COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleCorrectnessSingleThr.toml b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleCorrectnessSingleThr.toml deleted file mode 100644 index b88fc8f694..0000000000 --- a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleCorrectnessSingleThr.toml +++ /dev/null @@ -1,15 +0,0 @@ -[[test]] -title = 'Blob Granule API Correctness Single Threaded' -minClients = 1 -maxClients = 3 -multiThreaded = false - - [[test.workload]] - name = 'ApiBlobGranuleCorrectness' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - 
maxKeysPerTransaction = 50
-    initialSize = 100
-    numRandomOperations = 100
\ No newline at end of file
diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml
deleted file mode 100644
index 85e78975f6..0000000000
--- a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml
+++ /dev/null
@@ -1,15 +0,0 @@
-[[test]]
-title = 'Blob Granule Errors Single Threaded'
-minClients = 1
-maxClients = 3
-multiThreaded = false
-
-    [[test.workload]]
-    name = 'BlobGranuleErrors'
-    minKeyLength = 1
-    maxKeyLength = 64
-    minValueLength = 1
-    maxValueLength = 1000
-    maxKeysPerTransaction = 50
-    initialSize = 100
-    numRandomOperations = 100
\ No newline at end of file
diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp
index d7d828a756..6fb0148b37 100644
--- a/bindings/c/test/apitester/fdb_c_api_tester.cpp
+++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp
@@ -279,9 +279,9 @@ bool parseArgs(TesterOptions& options, int argc, char** argv) {
 	return true;
 }
 
-void fdb_check(fdb::Error e) {
-	if (e) {
-		fmt::print(stderr, "Unexpected FDB error: {}({})\n", e.code(), e.what());
+void fdb_check(fdb::Error e, std::string_view msg, fdb::Error::CodeType expectedError = error_code_success) {
+	if (e.code() != expectedError) {
+		fmt::print(stderr, "{}, Error: {}({})\n", msg, e.code(), e.what());
 		std::abort();
 	}
 }
@@ -453,13 +453,13 @@ int main(int argc, char** argv) {
 		applyNetworkOptions(options);
 		fdb::network::setup();
 
-		std::thread network_thread{ &fdb::network::run };
+		std::thread network_thread{ [] { fdb_check(fdb::network::run(), "FDB network thread failed"); } };
 
 		if (!runWorkloads(options)) {
 			retCode = 1;
 		}
-		fdb_check(fdb::network::stop());
+		fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
 		network_thread.join();
 	} catch (const std::exception& err) {
 		fmt::print(stderr, "ERROR: {}\n", err.what());
diff --git a/bindings/c/test/apitester/local_tests/CApiCorrectnessSingleThr.toml b/bindings/c/test/apitester/local_tests/CApiCorrectnessSingleThr.toml
deleted file mode 100644
index 9e6fc350ea..0000000000
--- a/bindings/c/test/apitester/local_tests/CApiCorrectnessSingleThr.toml
+++ /dev/null
@@ -1,29 +0,0 @@
-[[test]]
-title = 'API Correctness Single Threaded'
-minClients = 1
-maxClients = 3
-minDatabases = 1
-maxDatabases = 3
-multiThreaded = false
-disableClientBypass = true
-
-    [[test.workload]]
-    name = 'ApiCorrectness'
-    minKeyLength = 1
-    maxKeyLength = 64
-    minValueLength = 1
-    maxValueLength = 1000
-    maxKeysPerTransaction = 50
-    initialSize = 100
-    numRandomOperations = 100
-    readExistingKeysRatio = 0.9
-
-    [[test.workload]]
-    name = 'AtomicOpsCorrectness'
-    initialSize = 0
-    numRandomOperations = 100
-
-    [[test.workload]]
-    name = 'WatchAndWait'
-    initialSize = 0
-    numRandomOperations = 10
diff --git a/bindings/c/test/apitester/run_c_api_tests.py b/bindings/c/test/apitester/run_c_api_tests.py
index 4756117c07..b98a18d65a 100755
--- a/bindings/c/test/apitester/run_c_api_tests.py
+++ b/bindings/c/test/apitester/run_c_api_tests.py
@@ -29,31 +29,39 @@ from pathlib import Path
 import glob
 import random
 import string
+import toml
+
+sys.path[:0] = [os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "tests", "TestRunner")]
+
+# fmt: off
+from tmp_cluster import TempCluster
+from local_cluster import TLSConfig
+# fmt: on
 
 TESTER_STATS_INTERVAL_SEC = 5
 
 
 def random_string(len):
-    return 
''.join(random.choice(string.ascii_letters + string.digits) for i in range(len)) + return "".join(random.choice(string.ascii_letters + string.digits) for i in range(len)) def get_logger(): - return logging.getLogger('foundationdb.run_c_api_tests') + return logging.getLogger("foundationdb.run_c_api_tests") def initialize_logger_level(logging_level): logger = get_logger() - assert logging_level in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] + assert logging_level in ["DEBUG", "INFO", "WARNING", "ERROR"] - logging.basicConfig(format='%(message)s') - if logging_level == 'DEBUG': + logging.basicConfig(format="%(message)s") + if logging_level == "DEBUG": logger.setLevel(logging.DEBUG) - elif logging_level == 'INFO': + elif logging_level == "INFO": logger.setLevel(logging.INFO) - elif logging_level == 'WARNING': + elif logging_level == "WARNING": logger.setLevel(logging.WARNING) - elif logging_level == 'ERROR': + elif logging_level == "ERROR": logger.setLevel(logging.ERROR) @@ -65,35 +73,52 @@ def dump_client_logs(log_dir): print(">>>>>>>>>>>>>>>>>>>> End of {}:".format(log_file)) -def run_tester(args, test_file): - cmd = [args.tester_binary, - "--cluster-file", args.cluster_file, - "--test-file", test_file, - "--stats-interval", str(TESTER_STATS_INTERVAL_SEC*1000)] +def run_tester(args, cluster, test_file): + build_dir = Path(args.build_dir).resolve() + tester_binary = Path(args.api_tester_bin).resolve() + external_client_library = build_dir.joinpath("bindings", "c", "libfdb_c_external.so") + log_dir = Path(cluster.log).joinpath("client") + log_dir.mkdir(exist_ok=True) + cmd = [ + tester_binary, + "--cluster-file", + cluster.cluster_file, + "--test-file", + test_file, + "--stats-interval", + str(TESTER_STATS_INTERVAL_SEC * 1000), + "--tmp-dir", + cluster.tmp_dir, + "--log", + "--log-dir", + str(log_dir), + ] + if args.external_client_library is not None: - cmd += ["--external-client-library", args.external_client_library] - if args.tmp_dir is not None: - cmd += ["--tmp-dir", args.tmp_dir] - log_dir = None - if args.log_dir is not None: - log_dir = Path(args.log_dir).joinpath(random_string(8)) - log_dir.mkdir(exist_ok=True) - cmd += ['--log', "--log-dir", str(log_dir)] + external_client_library = Path(args.external_client_library).resolve() + cmd += ["--external-client-library", external_client_library] - if args.blob_granule_local_file_path is not None: - cmd += ["--blob-granule-local-file-path", - args.blob_granule_local_file_path] + if cluster.blob_granules_enabled: + cmd += [ + "--blob-granule-local-file-path", + str(cluster.data.joinpath("fdbblob")) + os.sep, + ] - if args.tls_ca_file is not None: - cmd += ["--tls-ca-file", args.tls_ca_file] + if cluster.tls_config is not None: + cmd += [ + "--tls-ca-file", + cluster.server_ca_file, + "--tls-key-file", + cluster.client_key_file, + "--tls-cert-file", + cluster.client_cert_file, + ] - if args.tls_key_file is not None: - cmd += ["--tls-key-file", args.tls_key_file] + for knob in args.knobs: + knob_name, knob_value = knob.split("=") + cmd += ["--knob-" + knob_name, knob_value] - if args.tls_cert_file is not None: - cmd += ["--tls-cert-file", args.tls_cert_file] - - get_logger().info('\nRunning tester \'%s\'...' % ' '.join(cmd)) + get_logger().info("\nRunning tester '%s'..." 
% " ".join(map(str, cmd))) proc = Popen(cmd, stdout=sys.stdout, stderr=sys.stderr) timed_out = False ret_code = 1 @@ -103,34 +128,76 @@ def run_tester(args, test_file): proc.kill() timed_out = True except Exception as e: - raise Exception('Unable to run tester (%s)' % e) + raise Exception("Unable to run tester (%s)" % e) if ret_code != 0: if timed_out: - reason = 'timed out after %d seconds' % args.timeout + reason = "timed out after %d seconds" % args.timeout elif ret_code < 0: reason = signal.Signals(-ret_code).name else: - reason = 'exit code: %d' % ret_code - get_logger().error('\n\'%s\' did not complete succesfully (%s)' % - (cmd[0], reason)) - if (log_dir is not None): + reason = "exit code: %d" % ret_code + get_logger().error("\n'%s' did not complete succesfully (%s)" % (cmd[0], reason)) + if log_dir is not None: dump_client_logs(log_dir) - get_logger().info('') + get_logger().info("") return ret_code +class TestConfig: + def __init__(self, test_file): + config = toml.load(test_file) + server_config = config.get("server", [{}])[0] + self.tenants_enabled = server_config.get("tenants_enabled", True) + self.blob_granules_enabled = server_config.get("blob_granules_enabled", False) + self.tls_enabled = server_config.get("tls_enabled", False) + self.client_chain_len = server_config.get("tls_client_chain_len", 2) + self.server_chain_len = server_config.get("tls_server_chain_len", 3) + self.min_num_processes = server_config.get("min_num_processes", 1) + self.max_num_processes = server_config.get("max_num_processes", 3) + self.num_processes = random.randint(self.min_num_processes, self.max_num_processes) + + +def run_test(args, test_file): + config = TestConfig(test_file) + + tls_config = None + if config.tls_enabled: + tls_config = TLSConfig( + server_chain_len=config.client_chain_len, + client_chain_len=config.server_chain_len, + ) + + with TempCluster( + args.build_dir, + config.num_processes, + enable_tenants=config.tenants_enabled, + blob_granules_enabled=config.blob_granules_enabled, + tls_config=tls_config, + ) as cluster: + ret_code = run_tester(args, cluster, test_file) + if not cluster.check_cluster_logs(): + ret_code = 1 if ret_code == 0 else ret_code + return ret_code + + def run_tests(args): num_failed = 0 - test_files = [f for f in os.listdir(args.test_dir) if os.path.isfile( - os.path.join(args.test_dir, f)) and f.endswith(".toml")] + if args.test_file is not None: + test_files = [Path(args.test_file).resolve()] + else: + test_files = [ + f + for f in os.listdir(args.test_dir) + if os.path.isfile(os.path.join(args.test_dir, f)) and f.endswith(".toml") + ] for test_file in test_files: - get_logger().info('=========================================================') - get_logger().info('Running test %s' % test_file) - get_logger().info('=========================================================') - ret_code = run_tester(args, os.path.join(args.test_dir, test_file)) + get_logger().info("=========================================================") + get_logger().info("Running test %s" % test_file) + get_logger().info("=========================================================") + ret_code = run_test(args, os.path.join(args.test_dir, test_file)) if ret_code != 0: num_failed += 1 @@ -138,32 +205,49 @@ def run_tests(args): def parse_args(argv): - parser = argparse.ArgumentParser(description='FoundationDB C API Tester') - - parser.add_argument('--cluster-file', type=str, default="fdb.cluster", - help='The cluster file for the cluster being connected to. 
(default: fdb.cluster)') - parser.add_argument('--tester-binary', type=str, default="fdb_c_api_tester", - help='Path to the fdb_c_api_tester executable. (default: fdb_c_api_tester)') - parser.add_argument('--external-client-library', type=str, default=None, - help='Path to the external client library. (default: None)') - parser.add_argument('--test-dir', type=str, default="./", - help='Path to a directory with test definitions. (default: ./)') - parser.add_argument('--timeout', type=int, default=300, - help='The timeout in seconds for running each individual test. (default 300)') - parser.add_argument('--log-dir', type=str, default=None, - help='The directory for storing logs (default: None)') - parser.add_argument('--logging-level', type=str, default='INFO', - choices=['ERROR', 'WARNING', 'INFO', 'DEBUG'], help='Specifies the level of detail in the tester output (default=\'INFO\').') - parser.add_argument('--tmp-dir', type=str, default=None, - help='The directory for storing temporary files (default: None)') - parser.add_argument('--blob-granule-local-file-path', type=str, default=None, - help='Enable blob granule tests if set, value is path to local blob granule files') - parser.add_argument('--tls-ca-file', type=str, default=None, - help='Path to client\'s TLS CA file: i.e. certificate of CA that signed the server certificate') - parser.add_argument('--tls-cert-file', type=str, default=None, - help='Path to client\'s TLS certificate file') - parser.add_argument('--tls-key-file', type=str, default=None, - help='Path to client\'s TLS private key file') + parser = argparse.ArgumentParser(description="FoundationDB C API Tester") + parser.add_argument("--build-dir", "-b", type=str, required=True, help="FDB build directory") + parser.add_argument("--api-tester-bin", type=str, help="Path to the fdb_c_api_tester executable.", required=True) + parser.add_argument("--external-client-library", type=str, help="Path to the external client library.") + parser.add_argument( + "--cluster-file", + type=str, + default="fdb.cluster", + help="The cluster file for the cluster being connected to. (default: fdb.cluster)", + ) + parser.add_argument( + "--test-dir", + type=str, + default="./", + help="Path to a directory with test definitions. (default: ./)", + ) + parser.add_argument( + "--test-file", + type=str, + default=None, + help="Path to a single test definition to be executed, overrides --test-dir if set.", + ) + parser.add_argument( + "--timeout", + type=int, + default=300, + help="The timeout in seconds for running each individual test. 
(default 300)", + ) + parser.add_argument( + "--logging-level", + type=str, + default="INFO", + choices=["ERROR", "WARNING", "INFO", "DEBUG"], + help="Specifies the level of detail in the tester output (default='INFO').", + ) + parser.add_argument( + "--knob", + type=str, + default=[], + action="append", + dest="knobs", + help="[lowercase-knob-name]=[knob-value] (there may be multiple --knob options)", + ) return parser.parse_args(argv) @@ -174,5 +258,5 @@ def main(argv): return run_tests(args) -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main(sys.argv[1:])) diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleCorrectnessBlocking.toml b/bindings/c/test/apitester/tests/CApiBlobGranuleCorrectnessBlocking.toml similarity index 51% rename from bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleCorrectnessBlocking.toml rename to bindings/c/test/apitester/tests/CApiBlobGranuleCorrectnessBlocking.toml index 930c8dd0df..c686d956f8 100644 --- a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleCorrectnessBlocking.toml +++ b/bindings/c/test/apitester/tests/CApiBlobGranuleCorrectnessBlocking.toml @@ -12,13 +12,15 @@ maxClientThreads = 8 minClients = 2 maxClients = 8 +[[server]] +blob_granules_enabled = true - [[test.workload]] - name = 'ApiBlobGranuleCorrectness' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 \ No newline at end of file +[[test.workload]] +name = 'ApiBlobGranuleCorrectness' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleCorrectnessMultiThr.toml b/bindings/c/test/apitester/tests/CApiBlobGranuleCorrectnessMultiThr.toml similarity index 50% rename from bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleCorrectnessMultiThr.toml rename to bindings/c/test/apitester/tests/CApiBlobGranuleCorrectnessMultiThr.toml index fdd4a0349c..637d15df38 100644 --- a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleCorrectnessMultiThr.toml +++ b/bindings/c/test/apitester/tests/CApiBlobGranuleCorrectnessMultiThr.toml @@ -11,13 +11,15 @@ maxClientThreads = 8 minClients = 2 maxClients = 8 - [[test.workload]] - name = 'ApiBlobGranuleCorrectness' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 +[[server]] +blob_granules_enabled = true +[[test.workload]] +name = 'ApiBlobGranuleCorrectness' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 diff --git a/bindings/c/test/apitester/tests/CApiBlobGranuleCorrectnessSingleThr.toml b/bindings/c/test/apitester/tests/CApiBlobGranuleCorrectnessSingleThr.toml new file mode 100644 index 0000000000..45af323685 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiBlobGranuleCorrectnessSingleThr.toml @@ -0,0 +1,18 @@ +[[test]] +title = 'Blob Granule API Correctness Single Threaded' +minClients = 1 +maxClients = 3 +multiThreaded = false + +[[server]] +blob_granules_enabled = true + +[[test.workload]] +name = 'ApiBlobGranuleCorrectness' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 diff --git 
a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml b/bindings/c/test/apitester/tests/CApiBlobGranuleErrorsMultiThr.toml similarity index 50% rename from bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml rename to bindings/c/test/apitester/tests/CApiBlobGranuleErrorsMultiThr.toml index 788bd04d85..598469327e 100644 --- a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml +++ b/bindings/c/test/apitester/tests/CApiBlobGranuleErrorsMultiThr.toml @@ -11,12 +11,15 @@ maxClientThreads = 8 minClients = 2 maxClients = 8 - [[test.workload]] - name = 'BlobGranuleErrors' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 \ No newline at end of file +[[server]] +blob_granules_enabled = true + +[[test.workload]] +name = 'BlobGranuleErrors' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsOnExternalThread.toml b/bindings/c/test/apitester/tests/CApiBlobGranuleErrorsOnExternalThread.toml similarity index 50% rename from bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsOnExternalThread.toml rename to bindings/c/test/apitester/tests/CApiBlobGranuleErrorsOnExternalThread.toml index 788bd04d85..598469327e 100644 --- a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsOnExternalThread.toml +++ b/bindings/c/test/apitester/tests/CApiBlobGranuleErrorsOnExternalThread.toml @@ -11,12 +11,15 @@ maxClientThreads = 8 minClients = 2 maxClients = 8 - [[test.workload]] - name = 'BlobGranuleErrors' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 \ No newline at end of file +[[server]] +blob_granules_enabled = true + +[[test.workload]] +name = 'BlobGranuleErrors' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 diff --git a/bindings/c/test/apitester/tests/CApiBlobGranuleErrorsSingleThr.toml b/bindings/c/test/apitester/tests/CApiBlobGranuleErrorsSingleThr.toml new file mode 100644 index 0000000000..542f238b7b --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiBlobGranuleErrorsSingleThr.toml @@ -0,0 +1,18 @@ +[[test]] +title = 'Blob Granule Errors Single Threaded' +minClients = 1 +maxClients = 3 +multiThreaded = false + +[[server]] +blob_granules_enabled = true + +[[test.workload]] +name = 'BlobGranuleErrors' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 diff --git a/bindings/c/test/apitester/tests/CApiCancelTransactionBlocking.toml b/bindings/c/test/apitester/tests/CApiCancelTransactionBlocking.toml index 9f153645e7..df92c89498 100644 --- a/bindings/c/test/apitester/tests/CApiCancelTransactionBlocking.toml +++ b/bindings/c/test/apitester/tests/CApiCancelTransactionBlocking.toml @@ -12,13 +12,13 @@ maxClientThreads = 8 minClients = 2 maxClients = 8 - [[test.workload]] - name = 'CancelTransaction' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 - readExistingKeysRatio = 0.9 \ No newline at end 
of file +[[test.workload]] +name = 'CancelTransaction' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 +readExistingKeysRatio = 0.9 diff --git a/bindings/c/test/apitester/tests/CApiCancelTransactionCB.toml b/bindings/c/test/apitester/tests/CApiCancelTransactionCB.toml index 96108c69b1..c03b65eea8 100644 --- a/bindings/c/test/apitester/tests/CApiCancelTransactionCB.toml +++ b/bindings/c/test/apitester/tests/CApiCancelTransactionCB.toml @@ -11,13 +11,13 @@ maxClientThreads = 8 minClients = 2 maxClients = 8 - [[test.workload]] - name = 'CancelTransaction' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 - readExistingKeysRatio = 0.9 \ No newline at end of file +[[test.workload]] +name = 'CancelTransaction' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 +readExistingKeysRatio = 0.9 diff --git a/bindings/c/test/apitester/tests/CApiCancelTransactionDBPerTX.toml b/bindings/c/test/apitester/tests/CApiCancelTransactionDBPerTX.toml index ae40fbf696..9381c4ef52 100644 --- a/bindings/c/test/apitester/tests/CApiCancelTransactionDBPerTX.toml +++ b/bindings/c/test/apitester/tests/CApiCancelTransactionDBPerTX.toml @@ -12,13 +12,13 @@ maxClientThreads = 8 minClients = 2 maxClients = 8 - [[test.workload]] - name = 'CancelTransaction' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 - readExistingKeysRatio = 0.9 \ No newline at end of file +[[test.workload]] +name = 'CancelTransaction' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 +readExistingKeysRatio = 0.9 diff --git a/bindings/c/test/apitester/tests/CApiCancelTransactionDBPerTX_TLS.toml b/bindings/c/test/apitester/tests/CApiCancelTransactionDBPerTX_TLS.toml new file mode 100644 index 0000000000..aa714a9cd4 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiCancelTransactionDBPerTX_TLS.toml @@ -0,0 +1,28 @@ +[[test]] +title = 'Cancel Transaction with Database per Transaction with TLS' +multiThreaded = true +buggify = true +databasePerTransaction = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + +[[server]] +tls_enabled = true +max_num_processes = 1 + +[[test.workload]] +name = 'CancelTransaction' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 +readExistingKeysRatio = 0.9 diff --git a/bindings/c/test/apitester/tests/CApiCancelTransactionWithTimeout.toml b/bindings/c/test/apitester/tests/CApiCancelTransactionWithTimeout.toml index 5a9e4f9b8f..ac4a3e1d4d 100644 --- a/bindings/c/test/apitester/tests/CApiCancelTransactionWithTimeout.toml +++ b/bindings/c/test/apitester/tests/CApiCancelTransactionWithTimeout.toml @@ -11,15 +11,15 @@ maxClientThreads = 8 minClients = 2 maxClients = 8 - [[test.workload]] - name = 'CancelTransaction' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 - 
readExistingKeysRatio = 0.9 - minTxTimeoutMs = 10 - maxTxTimeoutMs = 10000 \ No newline at end of file +[[test.workload]] +name = 'CancelTransaction' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 +readExistingKeysRatio = 0.9 +minTxTimeoutMs = 10 +maxTxTimeoutMs = 10000 diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml b/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml index 10f6630d79..4ba1c84e4f 100644 --- a/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml +++ b/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml @@ -12,23 +12,23 @@ maxClientThreads = 8 minClients = 2 maxClients = 8 - [[test.workload]] - name = 'ApiCorrectness' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 - readExistingKeysRatio = 0.9 +[[test.workload]] +name = 'ApiCorrectness' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 +readExistingKeysRatio = 0.9 - [[test.workload]] - name = 'AtomicOpsCorrectness' - initialSize = 0 - numRandomOperations = 100 +[[test.workload]] +name = 'AtomicOpsCorrectness' +initialSize = 0 +numRandomOperations = 100 - [[test.workload]] - name = 'WatchAndWait' - initialSize = 0 - numRandomOperations = 10 +[[test.workload]] +name = 'WatchAndWait' +initialSize = 0 +numRandomOperations = 10 diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessCallbacksOnExtThr.toml b/bindings/c/test/apitester/tests/CApiCorrectnessCallbacksOnExtThr.toml index bd6a437fdb..cd7d71d5a3 100644 --- a/bindings/c/test/apitester/tests/CApiCorrectnessCallbacksOnExtThr.toml +++ b/bindings/c/test/apitester/tests/CApiCorrectnessCallbacksOnExtThr.toml @@ -12,23 +12,23 @@ maxClientThreads = 8 minClients = 2 maxClients = 8 - [[test.workload]] - name = 'ApiCorrectness' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 - readExistingKeysRatio = 0.9 +[[test.workload]] +name = 'ApiCorrectness' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 +readExistingKeysRatio = 0.9 - [[test.workload]] - name = 'AtomicOpsCorrectness' - initialSize = 0 - numRandomOperations = 100 +[[test.workload]] +name = 'AtomicOpsCorrectness' +initialSize = 0 +numRandomOperations = 100 - [[test.workload]] - name = 'WatchAndWait' - initialSize = 0 - numRandomOperations = 10 +[[test.workload]] +name = 'WatchAndWait' +initialSize = 0 +numRandomOperations = 10 diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessDBPerTX.toml b/bindings/c/test/apitester/tests/CApiCorrectnessDBPerTX.toml index fe5181642d..20ffba3f37 100644 --- a/bindings/c/test/apitester/tests/CApiCorrectnessDBPerTX.toml +++ b/bindings/c/test/apitester/tests/CApiCorrectnessDBPerTX.toml @@ -12,23 +12,23 @@ maxClientThreads = 8 minClients = 2 maxClients = 8 - [[test.workload]] - name = 'ApiCorrectness' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 - readExistingKeysRatio = 0.9 +[[test.workload]] +name = 'ApiCorrectness' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 
+maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 +readExistingKeysRatio = 0.9 - [[test.workload]] - name = 'AtomicOpsCorrectness' - initialSize = 0 - numRandomOperations = 100 +[[test.workload]] +name = 'AtomicOpsCorrectness' +initialSize = 0 +numRandomOperations = 100 - [[test.workload]] - name = 'WatchAndWait' - initialSize = 0 - numRandomOperations = 10 +[[test.workload]] +name = 'WatchAndWait' +initialSize = 0 +numRandomOperations = 10 diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessDisableBypass.toml b/bindings/c/test/apitester/tests/CApiCorrectnessDisableBypass.toml new file mode 100644 index 0000000000..622f507495 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiCorrectnessDisableBypass.toml @@ -0,0 +1,29 @@ +[[test]] +title = 'API Correctness Single Threaded' +minClients = 1 +maxClients = 3 +minDatabases = 1 +maxDatabases = 3 +multiThreaded = false +disableClientBypass = true + +[[test.workload]] +name = 'ApiCorrectness' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 +readExistingKeysRatio = 0.9 + +[[test.workload]] +name = 'AtomicOpsCorrectness' +initialSize = 0 +numRandomOperations = 100 + +[[test.workload]] +name = 'WatchAndWait' +initialSize = 0 +numRandomOperations = 10 diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml b/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml index 1fd0cafd15..0ba47110cd 100644 --- a/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml +++ b/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml @@ -11,23 +11,23 @@ maxClientThreads = 8 minClients = 2 maxClients = 8 - [[test.workload]] - name = 'ApiCorrectness' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 - readExistingKeysRatio = 0.9 +[[test.workload]] +name = 'ApiCorrectness' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 +readExistingKeysRatio = 0.9 - [[test.workload]] - name = 'AtomicOpsCorrectness' - initialSize = 0 - numRandomOperations = 100 +[[test.workload]] +name = 'AtomicOpsCorrectness' +initialSize = 0 +numRandomOperations = 100 - [[test.workload]] - name = 'WatchAndWait' - initialSize = 0 - numRandomOperations = 10 +[[test.workload]] +name = 'WatchAndWait' +initialSize = 0 +numRandomOperations = 10 diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml b/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml index cb7929f74d..4f41f28c87 100644 --- a/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml +++ b/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml @@ -4,23 +4,23 @@ minClients = 1 maxClients = 3 multiThreaded = false - [[test.workload]] - name = 'ApiCorrectness' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 - readExistingKeysRatio = 0.9 +[[test.workload]] +name = 'ApiCorrectness' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 +readExistingKeysRatio = 0.9 - [[test.workload]] - name = 'AtomicOpsCorrectness' - initialSize = 0 - numRandomOperations = 100 +[[test.workload]] +name = 
'AtomicOpsCorrectness' +initialSize = 0 +numRandomOperations = 100 - [[test.workload]] - name = 'WatchAndWait' - initialSize = 0 - numRandomOperations = 10 +[[test.workload]] +name = 'WatchAndWait' +initialSize = 0 +numRandomOperations = 10 diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessTLS.toml b/bindings/c/test/apitester/tests/CApiCorrectnessTLS.toml new file mode 100644 index 0000000000..5f496692d0 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiCorrectnessTLS.toml @@ -0,0 +1,37 @@ +[[test]] +title = 'API Correctness with TLS' +multiThreaded = true +buggify = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + +[[server]] +tls_enabled = true +max_num_processes = 1 + +[[test.workload]] +name = 'ApiCorrectness' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 +readExistingKeysRatio = 0.9 + +[[test.workload]] +name = 'AtomicOpsCorrectness' +initialSize = 0 +numRandomOperations = 100 + +[[test.workload]] +name = 'WatchAndWait' +initialSize = 0 +numRandomOperations = 10 diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessWithTimeout.toml b/bindings/c/test/apitester/tests/CApiCorrectnessWithTimeout.toml index 4da54431b1..bce2bbf5df 100644 --- a/bindings/c/test/apitester/tests/CApiCorrectnessWithTimeout.toml +++ b/bindings/c/test/apitester/tests/CApiCorrectnessWithTimeout.toml @@ -11,23 +11,22 @@ maxClientThreads = 8 minClients = 2 maxClients = 8 - [[test.workload]] - name = 'ApiCorrectness' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - numRandomOperations = 100 - readExistingKeysRatio = 0.9 - minTxTimeoutMs = 100 - maxTxTimeoutMs = 10000 - - [[test.workload]] - name = 'AtomicOpsCorrectness' - initialSize = 0 - numRandomOperations = 100 - minTxTimeoutMs = 100 - maxTxTimeoutMs = 10000 +[[test.workload]] +name = 'ApiCorrectness' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 50 +initialSize = 100 +numRandomOperations = 100 +readExistingKeysRatio = 0.9 +minTxTimeoutMs = 100 +maxTxTimeoutMs = 10000 +[[test.workload]] +name = 'AtomicOpsCorrectness' +initialSize = 0 +numRandomOperations = 100 +minTxTimeoutMs = 100 +maxTxTimeoutMs = 10000 diff --git a/bindings/c/test/apitester/tests/CApiMultiTenantCorrectnessMultiThr.toml b/bindings/c/test/apitester/tests/CApiMultiTenantCorrectnessMultiThr.toml index 2a5a0d30e1..f789a3cabc 100644 --- a/bindings/c/test/apitester/tests/CApiMultiTenantCorrectnessMultiThr.toml +++ b/bindings/c/test/apitester/tests/CApiMultiTenantCorrectnessMultiThr.toml @@ -9,13 +9,13 @@ maxClients = 8 minTenants = 2 maxTenants = 5 - [[test.workload]] - name = 'ApiCorrectness' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 5 - initialSize = 100 - numRandomOperations = 200 - readExistingKeysRatio = 0.9 \ No newline at end of file +[[test.workload]] +name = 'ApiCorrectness' +minKeyLength = 1 +maxKeyLength = 64 +minValueLength = 1 +maxValueLength = 1000 +maxKeysPerTransaction = 5 +initialSize = 100 +numRandomOperations = 200 +readExistingKeysRatio = 0.9 diff --git a/bindings/c/test/apitester/tests/CApiMultiTenantCorrectnessTLS.toml b/bindings/c/test/apitester/tests/CApiMultiTenantCorrectnessTLS.toml new file mode 100644 index 0000000000..6bfd3bae2a 
--- /dev/null
+++ b/bindings/c/test/apitester/tests/CApiMultiTenantCorrectnessTLS.toml
@@ -0,0 +1,25 @@
+[[test]]
+title = 'Multi-tenant API Correctness with TLS'
+multiThreaded = true
+buggify = true
+minFdbThreads = 2
+maxFdbThreads = 8
+minClients = 2
+maxClients = 8
+minTenants = 2
+maxTenants = 5
+
+[[server]]
+tls_enabled = true
+max_num_processes = 1
+
+[[test.workload]]
+name = 'ApiCorrectness'
+minKeyLength = 1
+maxKeyLength = 64
+minValueLength = 1
+maxValueLength = 1000
+maxKeysPerTransaction = 5
+initialSize = 100
+numRandomOperations = 200
+readExistingKeysRatio = 0.9
diff --git a/bindings/c/test/apitester/tests/CApiTamperClusterFile.toml b/bindings/c/test/apitester/tests/CApiTamperClusterFile.toml
index 60a9715bd8..fa014bc174 100644
--- a/bindings/c/test/apitester/tests/CApiTamperClusterFile.toml
+++ b/bindings/c/test/apitester/tests/CApiTamperClusterFile.toml
@@ -12,13 +12,13 @@ maxClientThreads = 4
 minClients = 2
 maxClients = 4
 
-    [[test.workload]]
-    name = 'ApiCorrectness'
-    minKeyLength = 1
-    maxKeyLength = 64
-    minValueLength = 1
-    maxValueLength = 1000
-    maxKeysPerTransaction = 50
-    initialSize = 100
-    numRandomOperations = 100
-    readExistingKeysRatio = 0.9
\ No newline at end of file
+[[test.workload]]
+name = 'ApiCorrectness'
+minKeyLength = 1
+maxKeyLength = 64
+minValueLength = 1
+maxValueLength = 1000
+maxKeysPerTransaction = 50
+initialSize = 100
+numRandomOperations = 100
+readExistingKeysRatio = 0.9
diff --git a/bindings/c/test/apitester/tests/CApiTamperClusterFileTLS.toml b/bindings/c/test/apitester/tests/CApiTamperClusterFileTLS.toml
new file mode 100644
index 0000000000..b0eb1777fc
--- /dev/null
+++ b/bindings/c/test/apitester/tests/CApiTamperClusterFileTLS.toml
@@ -0,0 +1,28 @@
+[[test]]
+title = 'Test tampering the cluster file with TLS'
+multiThreaded = true
+buggify = true
+tamperClusterFile = true
+minFdbThreads = 2
+maxFdbThreads = 4
+minDatabases = 2
+maxDatabases = 4
+minClientThreads = 2
+maxClientThreads = 4
+minClients = 2
+maxClients = 4
+
+[[server]]
+tls_enabled = true
+max_num_processes = 1
+
+[[test.workload]]
+name = 'ApiCorrectness'
+minKeyLength = 1
+maxKeyLength = 64
+minValueLength = 1
+maxValueLength = 1000
+maxKeysPerTransaction = 50
+initialSize = 100
+numRandomOperations = 100
+readExistingKeysRatio = 0.9
diff --git a/bindings/c/test/client_memory_test.cpp b/bindings/c/test/client_memory_test.cpp
index 3ea2f74a8a..4cc669ad79 100644
--- a/bindings/c/test/client_memory_test.cpp
+++ b/bindings/c/test/client_memory_test.cpp
@@ -46,7 +46,7 @@ int main(int argc, char** argv) {
 	}
 	fdb_check(fdb_select_api_version(FDB_API_VERSION));
 	fdb_check(fdb_setup_network());
-	std::thread network_thread{ &fdb_run_network };
+	std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
 	fdb_check(
 	    fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, reinterpret_cast<const uint8_t*>(""), 0));
diff --git a/bindings/c/test/mako/mako.cpp b/bindings/c/test/mako/mako.cpp
index 858dd1dc8f..f9f3827a70 100644
--- a/bindings/c/test/mako/mako.cpp
+++ b/bindings/c/test/mako/mako.cpp
@@ -321,7 +321,16 @@ int populate(Database db,
 	const auto key_begin = insertBegin(args.rows, worker_id, thread_id, args.num_processes, args.num_threads);
 	const auto key_end = insertEnd(args.rows, worker_id, thread_id, args.num_processes, args.num_threads);
 	auto key_checkpoint = key_begin; // in case of commit failure, restart from this key
+	double required_keys = (key_end - key_begin + 1) * args.load_factor;
 	for (auto i = key_begin; i <= key_end; i++) {
+		// Choose required_keys out of the (key_end - i + 1) keys still remaining, i.e. keep each key with
+		// probability required_keys / (key_end - i + 1). Generate a random number in range [0, 1); if it is
+		// less than or equal to this probability, choose the key. Each key is thus selected with overall
+		// probability load_factor.
+		double r = rand() / (1.0 + RAND_MAX);
+		if (r > required_keys / (key_end - i + 1)) {
+			continue;
+		}
+		--required_keys;
 		/* sequential keys */
 		genKey(keystr.data(), KEY_PREFIX, args, i);
 		/* random values */
@@ -984,6 +993,7 @@ int initArguments(Arguments& args) {
 	args.async_xacts = 0;
 	args.mode = MODE_INVALID;
 	args.rows = 100000;
+	args.load_factor = 1.0;
 	args.row_digits = digits(args.rows);
 	args.seconds = 30;
 	args.iteration = 0;
@@ -1166,6 +1176,7 @@ void usage() {
 	printf("%-24s %s\n", "-t, --threads=THREADS", "Specify number of worker threads");
 	printf("%-24s %s\n", "    --async_xacts", "Specify number of concurrent transactions to be run in async mode");
 	printf("%-24s %s\n", "-r, --rows=ROWS", "Specify number of records");
+	printf("%-24s %s\n", "-l, --load_factor=LOAD_FACTOR", "Specify the fraction of keys to populate, in range (0, 1]");
 	printf("%-24s %s\n", "-s, --seconds=SECONDS", "Specify the test duration in seconds\n");
 	printf("%-24s %s\n", "", "This option cannot be specified with --iteration.");
 	printf("%-24s %s\n", "-i, --iteration=ITERS", "Specify the number of iterations.\n");
@@ -1228,6 +1239,7 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
 		{ "threads", required_argument, NULL, 't' },
 		{ "async_xacts", required_argument, NULL, ARG_ASYNC },
 		{ "rows", required_argument, NULL, 'r' },
+		{ "load_factor", required_argument, NULL, 'l' },
 		{ "seconds", required_argument, NULL, 's' },
 		{ "iteration", required_argument, NULL, 'i' },
 		{ "keylen", required_argument, NULL, ARG_KEYLEN },
@@ -1304,6 +1316,9 @@
 			args.rows = atoi(optarg);
 			args.row_digits = digits(args.rows);
 			break;
+		case 'l':
+			args.load_factor = atof(optarg);
+			break;
 		case 's':
 			args.seconds = atoi(optarg);
 			break;
@@ -1523,6 +1538,10 @@ int validateArguments(Arguments const& args) {
 		logr.error("--rows must be a positive integer");
 		return -1;
 	}
+	if (args.load_factor <= 0 || args.load_factor > 1) {
+		logr.error("--load_factor must be in range (0, 1]");
+		return -1;
+	}
 	if (args.key_length < 0) {
 		logr.error("--keylen must be a positive integer");
 		return -1;
 	}
@@ -2118,6 +2137,7 @@ int statsProcessMain(Arguments const& args,
 	fmt::fprintf(fp, "\"async_xacts\": %d,", args.async_xacts);
 	fmt::fprintf(fp, "\"mode\": %d,", args.mode);
 	fmt::fprintf(fp, "\"rows\": %d,", args.rows);
+	fmt::fprintf(fp, "\"load_factor\": %lf,", args.load_factor);
 	fmt::fprintf(fp, "\"seconds\": %d,", args.seconds);
 	fmt::fprintf(fp, "\"iteration\": %d,", args.iteration);
 	fmt::fprintf(fp, "\"tpsmax\": %d,", args.tpsmax);
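The selection loop added to populate() above keeps each remaining key with probability
(keys still needed) / (keys still to be scanned), so each key is kept with probability
load_factor and the total selected stays within one key of required_keys. A minimal Python
sketch of the same scheme (illustrative only; sample_fraction is a hypothetical helper,
not part of this patch):

    import random

    def sample_fraction(keys, load_factor):
        # Mirror populate(): accept each key with probability
        # (keys still needed) / (keys still remaining).
        required = len(keys) * load_factor
        chosen = []
        for idx, key in enumerate(keys):
            remaining = len(keys) - idx
            if random.random() <= required / remaining:
                chosen.append(key)
                required -= 1
        return chosen

    # Roughly half of 100000 keys are selected:
    print(len(sample_fraction(list(range(100000)), 0.5)))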
diff --git a/bindings/c/test/mako/mako.hpp b/bindings/c/test/mako/mako.hpp
index bafe65b546..952cffc7fa 100644
--- a/bindings/c/test/mako/mako.hpp
+++ b/bindings/c/test/mako/mako.hpp
@@ -138,6 +138,7 @@ struct Arguments {
 	int async_xacts;
 	int mode;
 	int rows; /* is 2 billion enough? */
+	double load_factor;
 	int row_digits;
 	int seconds;
 	int iteration;
diff --git a/bindings/c/test/shim_lib_tester.cpp b/bindings/c/test/shim_lib_tester.cpp
index c8813c45d0..31dc17ce5c 100644
--- a/bindings/c/test/shim_lib_tester.cpp
+++ b/bindings/c/test/shim_lib_tester.cpp
@@ -233,7 +233,7 @@ int main(int argc, char** argv) {
 	applyNetworkOptions(options);
 	fdb::network::setup();
 
-	std::thread network_thread{ &fdb::network::run };
+	std::thread network_thread{ [] { fdb_check(fdb::network::run(), "FDB network thread failed"); } };
 
 	// Try calling some basic functionality that is available
 	// in all recent API versions
diff --git a/bindings/c/test/unit/disconnected_timeout_tests.cpp b/bindings/c/test/unit/disconnected_timeout_tests.cpp
index 7d006faa23..3684e9bc7a 100644
--- a/bindings/c/test/unit/disconnected_timeout_tests.cpp
+++ b/bindings/c/test/unit/disconnected_timeout_tests.cpp
@@ -271,7 +271,7 @@ int main(int argc, char** argv) {
 	context.applyCommandLine(argc, argv);
 
 	fdb_check(fdb_setup_network());
-	std::thread network_thread{ &fdb_run_network };
+	std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
 
 	db = fdb_open_database(argv[1]);
 	timeoutDb = fdb_open_database(argv[1]);
diff --git a/bindings/c/test/unit/setup_tests.cpp b/bindings/c/test/unit/setup_tests.cpp
index 2e96eb00b9..d4f809b052 100644
--- a/bindings/c/test/unit/setup_tests.cpp
+++ b/bindings/c/test/unit/setup_tests.cpp
@@ -66,7 +66,7 @@ TEST_CASE("setup") {
 	                   },
 	                   &context));
 
-	std::thread network_thread{ &fdb_run_network };
+	std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
 
 	CHECK(!context.called);
 	fdb_check(fdb_stop_network());
diff --git a/bindings/c/test/unit/trace_partial_file_suffix_test.cpp b/bindings/c/test/unit/trace_partial_file_suffix_test.cpp
index 73dc8132a5..101d913fff 100644
--- a/bindings/c/test/unit/trace_partial_file_suffix_test.cpp
+++ b/bindings/c/test/unit/trace_partial_file_suffix_test.cpp
@@ -68,7 +68,7 @@ int main(int argc, char** argv) {
 	set_net_opt(FDBNetworkOption::FDB_NET_OPTION_TRACE_PARTIAL_FILE_SUFFIX, trace_partial_file_suffix);
 
 	fdb_check(fdb_setup_network());
-	std::thread network_thread{ &fdb_run_network };
+	std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
 
 	// Apparently you need to open a database to initialize logging
 	FDBDatabase* out;
diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp
index a45221f606..ea77613f3f 100644
--- a/bindings/c/test/unit/unit_tests.cpp
+++ b/bindings/c/test/unit/unit_tests.cpp
@@ -2998,7 +2998,7 @@ int main(int argc, char** argv) {
 	context.applyCommandLine(argc, argv);
 
 	fdb_check(fdb_setup_network());
-	std::thread network_thread{ &fdb_run_network };
+	std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
 
 	db = fdb_open_database(argv[1]);
 	clusterFilePath = std::string(argv[1]);
diff --git a/bindings/c/test/unit/unit_tests_version_510.cpp b/bindings/c/test/unit/unit_tests_version_510.cpp
index ff369316ee..4160fce780 100644
--- a/bindings/c/test/unit/unit_tests_version_510.cpp
+++ b/bindings/c/test/unit/unit_tests_version_510.cpp
@@ -88,7 +88,7 @@ int main(int argc, char** argv) {
 	context.applyCommandLine(argc, argv);
 
 	fdb_check(fdb_setup_network());
-	std::thread network_thread{ &fdb_run_network };
+	std::thread network_thread{ [] { fdb_check(fdb_run_network()); } };
 
 	{
 		FDBCluster* cluster;
diff --git a/bindings/go/src/fdb/generated.go b/bindings/go/src/fdb/generated.go
index 9cd8f09fe6..b765e09508 100644
--- a/bindings/go/src/fdb/generated.go
+++ 
b/bindings/go/src/fdb/generated.go @@ -392,11 +392,6 @@ func (o DatabaseOptions) SetTransactionIncludePortInAddress() error { return o.setOpt(505, nil) } -// Set a random idempotency id for all transactions. See the transaction option description for more information. -func (o DatabaseOptions) SetTransactionAutomaticIdempotency() error { - return o.setOpt(506, nil) -} - // Allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. This sets the ``bypass_unreadable`` option of each transaction created by this database. See the transaction option description for more information. func (o DatabaseOptions) SetTransactionBypassUnreadable() error { return o.setOpt(700, nil) @@ -556,18 +551,6 @@ func (o TransactionOptions) SetSizeLimit(param int64) error { return o.setOpt(503, int64ToBytes(param)) } -// Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. -// -// Parameter: Unique ID -func (o TransactionOptions) SetIdempotencyId(param string) error { - return o.setOpt(504, []byte(param)) -} - -// Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future. -func (o TransactionOptions) SetAutomaticIdempotency() error { - return o.setOpt(505, nil) -} - // Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior. func (o TransactionOptions) SetSnapshotRywEnable() error { return o.setOpt(600, nil) diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index f6c298ddfe..5fc6849d67 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -320,11 +320,11 @@ function(create_long_running_correctness_package) add_custom_command( OUTPUT ${tar_file} DEPENDS ${package_files} - ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh - ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh + ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTest.sh + ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTest.sh ${out_dir}/joshua_test - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh ${out_dir}/joshua_timeout COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${package_files} ${out_dir}/joshua_test diff --git a/contrib/Joshua/scripts/longRunningCorrectnessTest.sh b/contrib/Joshua/scripts/longRunningCorrectnessTest.sh new file mode 100755 index 0000000000..66551644af --- /dev/null +++ b/contrib/Joshua/scripts/longRunningCorrectnessTest.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +# Simulation currently has memory leaks. We need to investigate before we can enable leak detection in joshua. 
+export ASAN_OPTIONS="detect_leaks=0" + +OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}" +#mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false + +python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} --long-running diff --git a/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh b/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh new file mode 100755 index 0000000000..d7bf3ba81f --- /dev/null +++ b/contrib/Joshua/scripts/longRunningCorrectnessTimeout.sh @@ -0,0 +1,3 @@ +#!/bin/bash -u + +python3 -m test_harness.timeout --long-running diff --git a/contrib/TestHarness2/test_harness/config.py b/contrib/TestHarness2/test_harness/config.py index 191fab629d..dcde82cf87 100644 --- a/contrib/TestHarness2/test_harness/config.py +++ b/contrib/TestHarness2/test_harness/config.py @@ -184,6 +184,8 @@ class Config: self.reproduce_prefix: str | None = None self.reproduce_prefix_args = {'type': str, 'required': False, 'help': 'When printing the results, prepend this string to the command'} + self.long_running: bool = False + self.long_running_args = {'action': 'store_true'} self._env_names: Dict[str, str] = {} self._config_map = self._build_map() self._read_env() diff --git a/contrib/TestHarness2/test_harness/run.py b/contrib/TestHarness2/test_harness/run.py index 2cd24575fb..d9c7238d1b 100644 --- a/contrib/TestHarness2/test_harness/run.py +++ b/contrib/TestHarness2/test_harness/run.py @@ -303,6 +303,7 @@ class TestRun: self.stats: str | None = stats self.expected_unseed: int | None = expected_unseed self.use_valgrind: bool = config.use_valgrind + self.long_running: bool = config.long_running self.old_binary_path: Path = config.old_binaries_path self.buggify_enabled: bool = buggify_enabled self.fault_injection_enabled: bool = True @@ -375,7 +376,7 @@ class TestRun: process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path, text=True, env=env) did_kill = False - timeout = 20 * config.kill_seconds if self.use_valgrind else config.kill_seconds + timeout = 20 * config.kill_seconds if self.use_valgrind or self.long_running else config.kill_seconds err_out: str try: _, err_out = process.communicate(timeout=timeout) diff --git a/contrib/TestHarness2/test_harness/summarize.py b/contrib/TestHarness2/test_harness/summarize.py index 54b2f799b5..39eae8803e 100644 --- a/contrib/TestHarness2/test_harness/summarize.py +++ b/contrib/TestHarness2/test_harness/summarize.py @@ -384,6 +384,7 @@ class Summary: child.attributes['Severity'] = '40' child.attributes['ErrorCount'] = str(self.errors) self.out.append(child) + self.error = True if self.was_killed: child = SummaryTree('ExternalTimeout') child.attributes['Severity'] = '40' @@ -420,6 +421,7 @@ class Summary: child = SummaryTree('TestUnexpectedlyNotFinished') child.attributes['Severity'] = '40' self.out.append(child) + self.error = True if self.error_out is not None and len(self.error_out) > 0: lines = self.error_out.splitlines() stderr_bytes = 0 diff --git a/documentation/sphinx/source/backups.rst b/documentation/sphinx/source/backups.rst index f55a34cf4f..bb2275b622 100644 --- a/documentation/sphinx/source/backups.rst +++ b/documentation/sphinx/source/backups.rst @@ -524,6 +524,12 @@ The ``start`` command will start a new restore on the specified (or default) tag ``--inconsistent-snapshot-only`` Ignore mutation log files during the restore to speedup the process. 
Because only range files are restored, this option gives an inconsistent snapshot in most cases and is not recommended to use.
 
+``--user-data``
+  Restore only the user keyspace. This option should NOT be used alongside --system-metadata (below) and CANNOT be used alongside other specified key ranges.
+
+``--system-metadata``
+  Restore only the relevant system keyspace. This option should NOT be used alongside --user-data (above) and CANNOT be used alongside other specified key ranges.
+
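+For example, a restore of only the user keyspace might be started as follows (an illustrative
+invocation: the backup URL is a placeholder, and ``--dest-cluster-file`` and ``-r`` are assumed
+to be the usual restore arguments):
+
+.. code-block:: bash
+
+   user@host$ fdbrestore start --dest-cluster-file fdb.cluster -r <backup_url> --user-data --waitfordone
+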
code-block:: bash + + ctest -R 'fdb_c_api_test_.*TLS' -j20 --output_on_failure diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 24601308e1..a55a6f83df 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -47,6 +47,7 @@ #include "fdbclient/IKnobCollection.h" #include "fdbclient/RunTransaction.actor.h" #include "fdbclient/S3BlobStore.h" +#include "fdbclient/SystemData.h" #include "fdbclient/json_spirit/json_spirit_writer_template.h" #include "flow/Platform.h" @@ -155,6 +156,11 @@ enum { OPT_RESTORE_CLUSTERFILE_ORIG, OPT_RESTORE_BEGIN_VERSION, OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY, + // The two restore options below allow callers of fdbrestore to divide a normal restore into one which restores just + // the system keyspace and another that restores just the user key space. This is unlike the backup command where + // all keys (both system and user) will be backed up together + OPT_RESTORE_USER_DATA, + OPT_RESTORE_SYSTEM_DATA, // Shared constants OPT_CLUSTERFILE, @@ -696,6 +702,8 @@ CSimpleOpt::SOption g_rgRestoreOptions[] = { { OPT_BACKUPKEYS, "--keys", SO_REQ_SEP }, { OPT_WAITFORDONE, "-w", SO_NONE }, { OPT_WAITFORDONE, "--waitfordone", SO_NONE }, + { OPT_RESTORE_USER_DATA, "--user-data", SO_NONE }, + { OPT_RESTORE_SYSTEM_DATA, "--system-metadata", SO_NONE }, { OPT_RESTORE_VERSION, "--version", SO_REQ_SEP }, { OPT_RESTORE_VERSION, "-v", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, @@ -1187,6 +1195,13 @@ static void printRestoreUsage(bool devhelp) { printf(" The cluster file for the original database from which the backup was created. The " "original database\n"); printf(" is only needed to convert a --timestamp argument to a database version.\n"); + printf(" --user-data\n" + " Restore only the user keyspace. This option should NOT be used alongside " + "--system-metadata (below) and CANNOT be used alongside other specified key ranges.\n"); + printf( + " --system-metadata\n" + " Restore only the relevant system keyspace. 
This option " + "should NOT be used alongside --user-data (above) and CANNOT be used alongside other specified key ranges.\n"); if (devhelp) { #ifdef _WIN32 @@ -3367,6 +3382,8 @@ int main(int argc, char* argv[]) { bool trace = false; bool quietDisplay = false; bool dryRun = false; + bool restoreSystemKeys = false; + bool restoreUserKeys = false; // TODO (Nim): Set this value when we add optional encrypt_files CLI argument to backup agent start bool encryptionEnabled = true; std::string traceDir = ""; @@ -3691,6 +3708,14 @@ int main(int argc, char* argv[]) { restoreVersion = ver; break; } + case OPT_RESTORE_USER_DATA: { + restoreUserKeys = true; + break; + } + case OPT_RESTORE_SYSTEM_DATA: { + restoreSystemKeys = true; + break; + } case OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY: { inconsistentSnapshotOnly.set(true); break; @@ -3838,6 +3863,11 @@ int main(int argc, char* argv[]) { } } + if (restoreSystemKeys && restoreUserKeys) { + fprintf(stderr, "ERROR: Please only specify one of --user-data or --system-metadata, not both\n"); + return FDB_EXIT_ERROR; + } + if (trace) { if (!traceLogGroup.empty()) setNetworkOption(FDBNetworkOptions::TRACE_LOG_GROUP, StringRef(traceLogGroup)); @@ -3938,10 +3968,30 @@ int main(int argc, char* argv[]) { // The fastrestore tool does not yet support multiple ranges and is incompatible with tenants // or other features that back up data in the system keys - if (backupKeys.empty() && programExe != ProgramExe::FASTRESTORE_TOOL) { + if (!restoreSystemKeys && !restoreUserKeys && backupKeys.empty() && + programExe != ProgramExe::FASTRESTORE_TOOL) { addDefaultBackupRanges(backupKeys); } + if ((restoreSystemKeys || restoreUserKeys) && programExe == ProgramExe::FASTRESTORE_TOOL) { + fprintf(stderr, "ERROR: Options: --user-data and --system-metadata are not supported with fastrestore\n"); + return FDB_EXIT_ERROR; + } + + if ((restoreUserKeys || restoreSystemKeys) && !backupKeys.empty()) { + fprintf(stderr, + "ERROR: Cannot specify additional ranges when using --user-data or --system-metadata " + "options\n"); + return FDB_EXIT_ERROR; + } + if (restoreUserKeys) { + backupKeys.push_back_deep(backupKeys.arena(), normalKeys); + } else if (restoreSystemKeys) { + for (const auto& r : getSystemBackupRanges()) { + backupKeys.push_back_deep(backupKeys.arena(), r); + } + } + switch (programExe) { case ProgramExe::AGENT: if (!initCluster()) diff --git a/fdbcli/QuotaCommand.actor.cpp b/fdbcli/QuotaCommand.actor.cpp index 5f4f17418c..79d5fa5301 100644 --- a/fdbcli/QuotaCommand.actor.cpp +++ b/fdbcli/QuotaCommand.actor.cpp @@ -93,8 +93,12 @@ ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy } else if (limitType == LimitType::RESERVED) { quota.reservedQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1; } + if (!quota.isValid()) { + throw invalid_throttle_quota_value(); + } ThrottleApi::setTagQuota(tr, tag, quota.reservedQuota, quota.totalQuota); wait(safeThreadFutureToFuture(tr->commit())); + fmt::print("Successfully updated quota.\n"); return Void(); } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); @@ -109,6 +113,7 @@ ACTOR Future<Void> clearQuota(Reference<IDatabase> db, TransactionTag tag) { try { tr->clear(ThrottleApi::getTagQuotaKey(tag)); wait(safeThreadFutureToFuture(tr->commit())); + fmt::print("Successfully cleared quota.\n"); return Void(); } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index c672ac52fb..c2473e1f59 100644 --- 
a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -1480,6 +1480,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo if (isCommitDesc && tokens.size() == 1) { // prompt for description and add to txn state Optional<std::string> raw; + warn.cancel(); while (!raw.present() || raw.get().empty()) { fprintf(stdout, "Please set a description for the change. Description must be non-empty.\n"); @@ -1490,6 +1491,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo std::string line = raw.get(); config_tr->set("\xff\xff/description"_sr, line); } + warn = + checkStatus(timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb); if (transtype == TransType::Db) { wait(commitTransaction(tr)); } else { @@ -1821,6 +1824,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo if (!intrans) { // prompt for description and add to txn state Optional<std::string> raw_desc; + warn.cancel(); while (!raw_desc.present() || raw_desc.get().empty()) { fprintf(stdout, "Please set a description for the change. Description must be non-empty\n"); @@ -1830,6 +1834,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo } std::string line = raw_desc.get(); config_tr->set("\xff\xff/description"_sr, line); + warn = checkStatus( + timeWarning(5.0, "\nWARNING: Long delay (Ctrl-C to interrupt)\n"), db, localDb); wait(commitTransaction(config_tr)); } else { isCommitDesc = true; diff --git a/fdbcli/tests/fdbcli_tests.py b/fdbcli/tests/fdbcli_tests.py index 9dc159cf19..d4451f9f8d 100755 --- a/fdbcli/tests/fdbcli_tests.py +++ b/fdbcli/tests/fdbcli_tests.py @@ -109,7 +109,7 @@ def quota(logger): command = 'quota clear green' output = run_fdbcli_command(command) logger.debug(command + ' : ' + output) - assert output == '' + assert output == 'Successfully cleared quota.' command = 'quota get green total_throughput' output = run_fdbcli_command(command) @@ -120,17 +120,17 @@ def quota(logger): command = 'quota set red total_throughput 49152' output = run_fdbcli_command(command) logger.debug(command + ' : ' + output) - assert output == '' + assert output == 'Successfully updated quota.' command = 'quota set green total_throughput 32768' output = run_fdbcli_command(command) logger.debug(command + ' : ' + output) - assert output == '' + assert output == 'Successfully updated quota.' command = 'quota set green reserved_throughput 16384' output = run_fdbcli_command(command) logger.debug(command + ' : ' + output) - assert output == '' + assert output == 'Successfully updated quota.' command = 'quota get green total_throughput' output = run_fdbcli_command(command) @@ -145,7 +145,7 @@ def quota(logger): command = 'quota clear green' output = run_fdbcli_command(command) logger.debug(command + ' : ' + output) - assert output == '' + assert output == 'Successfully cleared quota.' 
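+
+    # Editorial sketch (hedged, not part of the original patch): the new quota.isValid() check is assumed
+    # to reject a reserved throughput larger than the total throughput; only the absence of the success
+    # message is asserted, since the exact fdbcli error text is not shown in this change.
+    command = 'quota set green reserved_throughput 65536'
+    output = run_fdbcli_command(command)
+    logger.debug(command + ' : ' + output)
+    assert 'Successfully updated quota.' not in output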
command = 'quota get green total_throughput' output = run_fdbcli_command(command) diff --git a/fdbclient/BackupContainerLocalDirectory.actor.cpp b/fdbclient/BackupContainerLocalDirectory.actor.cpp index 51abc24678..0849f7f701 100644 --- a/fdbclient/BackupContainerLocalDirectory.actor.cpp +++ b/fdbclient/BackupContainerLocalDirectory.actor.cpp @@ -63,7 +63,7 @@ public: m_buffer = Standalone<VectorRef<uint8_t>>(old.slice(size, old.size())); // Write the old buffer to the underlying file and update the write offset - Future<Void> r = holdWhile(old, m_file->write(old.begin(), size, m_writeOffset)); + Future<Void> r = uncancellable(holdWhile(old, m_file->write(old.begin(), size, m_writeOffset))); m_writeOffset += size; return r; diff --git a/fdbclient/BlobGranuleFiles.cpp b/fdbclient/BlobGranuleFiles.cpp index 66be4c1462..e90b75a67f 100644 --- a/fdbclient/BlobGranuleFiles.cpp +++ b/fdbclient/BlobGranuleFiles.cpp @@ -1057,6 +1057,9 @@ ParsedDeltaBoundaryRef deltaAtVersion(const DeltaBoundaryRef& delta, Version beg beginVersion <= delta.clearVersion.get(); if (delta.values.empty()) { return ParsedDeltaBoundaryRef(delta.key, clearAfter); + } else if (readVersion >= delta.values.back().version && beginVersion <= delta.values.back().version) { + // for all but zero or one delta files, readVersion >= the entire delta file. optimize this case + return ParsedDeltaBoundaryRef(delta.key, clearAfter, delta.values.back()); } auto valueAtVersion = std::lower_bound(delta.values.begin(), delta.values.end(), @@ -1338,6 +1341,10 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk, std::set<int16_t, std::greater<int16_t>> activeClears; int16_t maxActiveClear = -1; + // trade off memory for cpu performance by assuming all inserts + RangeResult result; + int maxExpectedSize = 0; + // check if a given stream is actively clearing bool clearActive[streams.size()]; for (int16_t i = 0; i < streams.size(); i++) { @@ -1355,14 +1362,12 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk, item.streamIdx = i; item.dataIdx = 0; next.push(item); + maxExpectedSize += streams[i].size(); + result.arena().dependsOn(streams[i].arena()); } } + result.reserve(result.arena(), maxExpectedSize); - if (chunk.snapshotFile.present()) { - stats.snapshotRows += streams[0].size(); - } - - RangeResult result; std::vector<MergeStreamNext> cur; cur.reserve(streams.size()); while (!next.empty()) { @@ -1397,7 +1402,7 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk, if (v.isSet() && maxActiveClear < it.streamIdx) { KeyRef finalKey = chunk.tenantPrefix.present() ? v.key.removePrefix(chunk.tenantPrefix.get()) : v.key; - result.push_back_deep(result.arena(), KeyValueRef(finalKey, v.value)); + result.push_back(result.arena(), KeyValueRef(finalKey, v.value)); if (!includesSnapshot) { stats.rowsInserted++; } else if (it.streamIdx > 0) { @@ -1426,11 +1431,39 @@ static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk, } } + // FIXME: if memory assumption was wrong and result is significantly smaller than total input size, could copy it + // with push_back_deep to a new result. 
This is rare though + stats.outputBytes += result.expectedSize(); return result; } +RangeResult materializeJustSnapshot(const BlobGranuleChunkRef& chunk, + Optional<StringRef> snapshotData, + const KeyRange& requestRange, + GranuleMaterializeStats& stats) { + stats.inputBytes += snapshotData.get().size(); + + Standalone<VectorRef<ParsedDeltaBoundaryRef>> snapshotRows = loadSnapshotFile( + chunk.snapshotFile.get().filename, snapshotData.get(), requestRange, chunk.snapshotFile.get().cipherKeysCtx); + RangeResult result; + if (!snapshotRows.empty()) { + result.arena().dependsOn(snapshotRows.arena()); + result.reserve(result.arena(), snapshotRows.size()); + for (auto& it : snapshotRows) { + // TODO REMOVE validation + ASSERT(it.op == MutationRef::Type::SetValue); + KeyRef finalKey = chunk.tenantPrefix.present() ? it.key.removePrefix(chunk.tenantPrefix.get()) : it.key; + result.push_back(result.arena(), KeyValueRef(finalKey, it.value)); + } + stats.outputBytes += result.expectedSize(); + stats.snapshotRows += result.size(); + } + + return result; +} + RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, KeyRangeRef keyRange, Version beginVersion, @@ -1454,6 +1487,11 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, requestRange = keyRange; } + // fast case for only-snapshot read + if (chunk.snapshotFile.present() && chunk.deltaFiles.empty() && chunk.newDeltas.empty()) { + return materializeJustSnapshot(chunk, snapshotData, requestRange, stats); + } + std::vector<Standalone<VectorRef<ParsedDeltaBoundaryRef>>> streams; std::vector<bool> startClears; // +1 for possible snapshot, +1 for possible memory deltas @@ -1471,7 +1509,10 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, streams.push_back(snapshotRows); startClears.push_back(false); arena.dependsOn(streams.back().arena()); + stats.snapshotRows += snapshotRows.size(); } + } else { + ASSERT(!chunk.snapshotFile.present()); } if (BG_READ_DEBUG) { @@ -2675,6 +2716,14 @@ struct CommonPrefixStats { int totalKeys = 0; int minKeySize = 1000000000; int maxKeySize = 0; + int64_t logicalBytes = 0; + int64_t totalLogicalBytes = 0; + + int deltas = 0; + int deltasSet = 0; + int deltasClear = 0; + int deltasNoOp = 0; + int deltasClearAfter = 0; void addKey(const KeyRef& k) { if (len == -1) { @@ -2689,7 +2738,38 @@ struct CommonPrefixStats { maxKeySize = std::max(maxKeySize, k.size()); } + void addKeyValue(const KeyRef& k, const ValueRef& v) { + addKey(k); + logicalBytes += k.size(); + logicalBytes += v.size(); + } + + void addBoundary(const ParsedDeltaBoundaryRef& d) { + addKey(d.key); + + deltas++; + if (d.isSet()) { + deltasSet++; + logicalBytes += d.value.size(); + } else if (d.isClear()) { + deltasClear++; + } else { + ASSERT(d.isNoOp()); + deltasNoOp++; + } + if (d.clearAfter) { + deltasClearAfter++; + } + } + + void doneFile() { + totalLogicalBytes += logicalBytes; + fmt::print("Logical Size: {0}\n", logicalBytes); + logicalBytes = 0; + } + Key done() { + doneFile(); ASSERT(len >= 0); fmt::print("Common prefix: {0}\nCommon Prefix Length: {1}\nAverage Key Size: {2}\nMin Key Size: {3}, Max Key " "Size: {4}\n", @@ -2698,11 +2778,21 @@ struct CommonPrefixStats { totalKeySize / totalKeys, minKeySize, maxKeySize); + + if (deltas > 0) { + fmt::print("Delta stats: {0} deltas, {1} sets, {2} clears, {3} noops, {4} clearAfters\n", + deltas, + deltasSet, + deltasClear, + deltasNoOp, + deltasClearAfter); + } + fmt::print("Logical Size: {0}\n", totalLogicalBytes); return key.substr(0, len); } }; -FileSet 
loadFileSet(std::string basePath, const std::vector<std::string>& filenames) { +FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filenames, bool newFormat) { FileSet files; CommonPrefixStats stats; for (int i = 0; i < filenames.size(); i++) { @@ -2713,40 +2803,66 @@ FileSet loadFileSet(std::string basePath, const std::vector<std::string>& filena std::string fpath = basePath + filenames[i]; Value data = loadFileData(fpath); - Arena arena; - GranuleSnapshot file; - ObjectReader dataReader(data.begin(), Unversioned()); - dataReader.deserialize(FileIdentifierFor<GranuleSnapshot>::value, file, arena); - Standalone<GranuleSnapshot> parsed(file, arena); + Standalone<GranuleSnapshot> parsed; + if (!newFormat) { + Arena arena; + GranuleSnapshot file; + ObjectReader dataReader(data.begin(), Unversioned()); + dataReader.deserialize(FileIdentifierFor<GranuleSnapshot>::value, file, arena); + parsed = Standalone<GranuleSnapshot>(file, arena); + fmt::print("Loaded {0} rows from snapshot file\n", parsed.size()); + + for (auto& it : parsed) { + stats.addKeyValue(it.key, it.value); + } + } else { + Standalone<VectorRef<ParsedDeltaBoundaryRef>> res = loadSnapshotFile(""_sr, data, normalKeys, {}); + fmt::print("Loaded {0} rows from snapshot file\n", res.size()); + for (auto& it : res) { + stats.addKeyValue(it.key, it.value); + } + } - fmt::print("Loaded {0} rows from snapshot file\n", parsed.size()); files.snapshotFile = { filenames[i], version, data, parsed }; - for (auto& it : parsed) { - stats.addKey(it.key); - } } else { std::string fpath = basePath + filenames[i]; Value data = loadFileData(fpath); - Arena arena; - GranuleDeltas file; - ObjectReader dataReader(data.begin(), Unversioned()); - dataReader.deserialize(FileIdentifierFor<GranuleDeltas>::value, file, arena); - Standalone<GranuleDeltas> parsed(file, arena); + if (!newFormat) { + Arena arena; + GranuleDeltas file; + ObjectReader dataReader(data.begin(), Unversioned()); + dataReader.deserialize(FileIdentifierFor<GranuleDeltas>::value, file, arena); + Standalone<GranuleDeltas> parsed(file, arena); - fmt::print("Loaded {0} deltas from delta file\n", parsed.size()); - files.deltaFiles.push_back({ filenames[i], version, data, parsed }); + fmt::print("Loaded {0} deltas from delta file\n", parsed.size()); + files.deltaFiles.push_back({ filenames[i], version, data, parsed }); - for (auto& it : parsed) { - for (auto& it2 : it.mutations) { - stats.addKey(it2.param1); - if (it2.type == MutationRef::Type::ClearRange) { - stats.addKey(it2.param2); + for (auto& it : parsed) { + for (auto& it2 : it.mutations) { + stats.addKey(it2.param1); + if (it2.type == MutationRef::Type::ClearRange) { + stats.addKey(it2.param2); + } } } + } else { + bool startClear = false; + Standalone<VectorRef<ParsedDeltaBoundaryRef>> res = + loadChunkedDeltaFile(""_sr, data, normalKeys, 0, version, {}, startClear); + ASSERT(!startClear); + + Standalone<GranuleDeltas> parsed; + fmt::print("Loaded {0} boundaries from delta file\n", res.size()); + files.deltaFiles.push_back({ filenames[i], version, data, parsed }); + + for (auto& it : res) { + stats.addBoundary(it); + } } } + stats.doneFile(); } files.commonPrefix = stats.done(); @@ -2804,6 +2920,28 @@ std::pair<int64_t, double> doDeltaWriteBench(const Standalone<GranuleDeltas>& da return { serializedBytes, elapsed }; } +void chunkFromFileSet(const FileSet& fileSet, + Standalone<BlobGranuleChunkRef>& chunk, + StringRef* deltaPtrs, + Version readVersion, + Optional<BlobGranuleCipherKeysCtx> keys, + int numDeltaFiles) 
{ + size_t snapshotSize = std::get<3>(fileSet.snapshotFile).size(); + chunk.snapshotFile = + BlobFilePointerRef(chunk.arena(), std::get<0>(fileSet.snapshotFile), 0, snapshotSize, snapshotSize, keys); + + for (int i = 0; i < numDeltaFiles; i++) { + size_t deltaSize = std::get<3>(fileSet.deltaFiles[i]).size(); + chunk.deltaFiles.emplace_back_deep( + chunk.arena(), std::get<0>(fileSet.deltaFiles[i]), 0, deltaSize, deltaSize, keys); + deltaPtrs[i] = std::get<2>(fileSet.deltaFiles[i]); + } + + chunk.keyRange = fileSet.range; + chunk.includedVersion = readVersion; + chunk.snapshotVersion = std::get<1>(fileSet.snapshotFile); +} + FileSet rewriteChunkedFileSet(const FileSet& fileSet, Optional<BlobGranuleCipherKeysCtx> keys, Optional<CompressionFilter> compressionFilter) { @@ -2830,41 +2968,30 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet, KeyRange readRange, bool clearAllAtEnd, Optional<BlobGranuleCipherKeysCtx> keys, - Optional<CompressionFilter> compressionFilter) { + int numDeltaFiles, + bool printStats = false) { Version readVersion = std::get<1>(fileSet.deltaFiles.back()); Standalone<BlobGranuleChunkRef> chunk; GranuleMaterializeStats stats; - StringRef deltaPtrs[fileSet.deltaFiles.size()]; + ASSERT(numDeltaFiles >= 0 && numDeltaFiles <= fileSet.deltaFiles.size()); + StringRef deltaPtrs[numDeltaFiles]; MutationRef clearAllAtEndMutation; if (clearAllAtEnd) { clearAllAtEndMutation = MutationRef(MutationRef::Type::ClearRange, readRange.begin, readRange.end); } if (chunked) { - size_t snapshotSize = std::get<3>(fileSet.snapshotFile).size(); - chunk.snapshotFile = - BlobFilePointerRef(chunk.arena(), std::get<0>(fileSet.snapshotFile), 0, snapshotSize, snapshotSize, keys); - - for (int i = 0; i < fileSet.deltaFiles.size(); i++) { - size_t deltaSize = std::get<3>(fileSet.deltaFiles[i]).size(); - chunk.deltaFiles.emplace_back_deep( - chunk.arena(), std::get<0>(fileSet.deltaFiles[i]), 0, deltaSize, deltaSize, keys); - deltaPtrs[i] = std::get<2>(fileSet.deltaFiles[i]); - } - + chunkFromFileSet(fileSet, chunk, deltaPtrs, readVersion, keys, numDeltaFiles); if (clearAllAtEnd) { readVersion++; MutationsAndVersionRef lastDelta; lastDelta.version = readVersion; lastDelta.mutations.push_back(chunk.arena(), clearAllAtEndMutation); + chunk.includedVersion = readVersion; chunk.newDeltas.push_back_deep(chunk.arena(), lastDelta); } - - chunk.keyRange = fileSet.range; - chunk.includedVersion = readVersion; - chunk.snapshotVersion = std::get<1>(fileSet.snapshotFile); } int64_t serializedBytes = 0; @@ -2897,15 +3024,16 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet, elapsed /= READ_RUNS; serializedBytes /= READ_RUNS; - // TODO REMOVE - fmt::print("Materialize stats:\n"); - fmt::print(" Input bytes: {0}\n", stats.inputBytes); - fmt::print(" Output bytes: {0}\n", stats.outputBytes); - fmt::print(" Write Amp: {0}\n", (1.0 * stats.inputBytes) / stats.outputBytes); - fmt::print(" Snapshot Rows: {0}\n", stats.snapshotRows); - fmt::print(" Rows Cleared: {0}\n", stats.rowsCleared); - fmt::print(" Rows Inserted: {0}\n", stats.rowsInserted); - fmt::print(" Rows Updated: {0}\n", stats.rowsUpdated); + if (printStats) { + fmt::print("Materialize stats:\n"); + fmt::print(" Input bytes: {0}\n", stats.inputBytes / READ_RUNS); + fmt::print(" Output bytes: {0}\n", stats.outputBytes / READ_RUNS); + fmt::print(" Write Amp: {0}\n", (1.0 * stats.inputBytes) / stats.outputBytes); + fmt::print(" Snapshot Rows: {0}\n", stats.snapshotRows / READ_RUNS); + fmt::print(" Rows Cleared: {0}\n", stats.rowsCleared 
/ READ_RUNS); + fmt::print(" Rows Inserted: {0}\n", stats.rowsInserted / READ_RUNS); + fmt::print(" Rows Updated: {0}\n", stats.rowsUpdated / READ_RUNS); + } return { serializedBytes, elapsed }; } @@ -2937,7 +3065,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") { int64_t logicalSnapshotSize = 0; int64_t logicalDeltaSize = 0; for (auto& it : fileSetNames) { - FileSet fileSet = loadFileSet(basePath, it); + FileSet fileSet = loadFileSet(basePath, it, false); fileSets.push_back(fileSet); logicalSnapshotSize += std::get<3>(fileSet.snapshotFile).expectedSize(); for (auto& deltaFile : fileSet.deltaFiles) { @@ -2968,7 +3096,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") { if (encrypt) { name += "ENC"; } - if (compressionFilter.present()) { + if (compressionFilter.present() && compressionFilter.get() != CompressionFilter::NONE) { name += "CMP"; } if (name.empty()) { @@ -3024,9 +3152,16 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") { std::vector<std::string> readRunNames = {}; std::vector<std::pair<int64_t, double>> readMetrics; - bool doEdgeCaseReadTests = true; + bool doEdgeCaseReadTests = false; + bool doVaryingDeltaTests = false; std::vector<double> clearAllReadMetrics; std::vector<double> readSingleKeyMetrics; + std::vector<std::vector<std::pair<int64_t, double>>> varyingDeltaMetrics; + + size_t maxDeltaFiles = 100000; + for (auto& f : fileSets) { + maxDeltaFiles = std::min(maxDeltaFiles, f.deltaFiles.size()); + } for (bool chunk : chunkModes) { for (bool encrypt : encryptionModes) { @@ -3049,7 +3184,7 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") { if (encrypt) { name += "ENC"; } - if (compressionFilter.present()) { + if (compressionFilter.present() && compressionFilter.get() != CompressionFilter::NONE) { name += "CMP"; } if (name.empty()) { @@ -3062,6 +3197,10 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") { double totalElapsed = 0.0; double totalElapsedClearAll = 0.0; double totalElapsedSingleKey = 0.0; + std::vector<std::pair<int64_t, double>> varyingDeltas; + for (int i = 0; i <= maxDeltaFiles; i++) { + varyingDeltas.push_back({ 0, 0.0 }); + } for (auto& fileSet : fileSets) { FileSet newFileSet; if (!chunk) { @@ -3070,24 +3209,38 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") { newFileSet = rewriteChunkedFileSet(fileSet, keys, compressionFilter); } - auto res = doReadBench(newFileSet, chunk, fileSet.range, false, keys, compressionFilter); + auto res = doReadBench(newFileSet, chunk, fileSet.range, false, keys, newFileSet.deltaFiles.size()); totalBytesRead += res.first; totalElapsed += res.second; if (doEdgeCaseReadTests) { totalElapsedClearAll += - doReadBench(newFileSet, chunk, fileSet.range, true, keys, compressionFilter).second; + doReadBench(newFileSet, chunk, fileSet.range, true, keys, newFileSet.deltaFiles.size()) + .second; Key k = std::get<3>(fileSet.snapshotFile).front().key; KeyRange singleKeyRange(KeyRangeRef(k, keyAfter(k))); totalElapsedSingleKey += - doReadBench(newFileSet, chunk, singleKeyRange, false, keys, compressionFilter).second; + doReadBench(newFileSet, chunk, singleKeyRange, false, keys, newFileSet.deltaFiles.size()) + .second; + } + + if (doVaryingDeltaTests && chunk) { + for (int i = 0; i <= maxDeltaFiles; i++) { + auto r = doReadBench(newFileSet, chunk, fileSet.range, false, keys, i); + varyingDeltas[i].first += r.first; + varyingDeltas[i].second += r.second; + } } } readMetrics.push_back({ totalBytesRead, totalElapsed }); + if (doEdgeCaseReadTests) { clearAllReadMetrics.push_back(totalElapsedClearAll); 
readSingleKeyMetrics.push_back(totalElapsedSingleKey); } + if (doVaryingDeltaTests) { + varyingDeltaMetrics.push_back(varyingDeltas); + } } } } @@ -3121,6 +3274,25 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") { } } + if (doVaryingDeltaTests) { + ASSERT(readRunNames.size() == varyingDeltaMetrics.size()); + fmt::print("\n\nVarying Deltas Read Results:\nDF#\t"); + for (int i = 0; i <= maxDeltaFiles; i++) { + fmt::print("{0}\t", i); + } + fmt::print("\n"); + + for (int i = 0; i < readRunNames.size(); i++) { + fmt::print("{0}", readRunNames[i]); + + for (auto& it : varyingDeltaMetrics[i]) { + double MBperCPUsec = (it.first / 1024.0 / 1024.0) / it.second; + fmt::print("\t{:.6}", MBperCPUsec); + } + fmt::print("\n"); + } + } + fmt::print("\n\nCombined Results:\n"); ASSERT(readRunNames.size() == runNames.size() - 1); for (int i = 0; i < readRunNames.size(); i++) { @@ -3137,3 +3309,22 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") { return Void(); } + +TEST_CASE("!/blobgranule/files/repeatFromFiles") { + std::string basePath = "SET_ME"; + std::vector<std::vector<std::string>> fileSetNames = { { "SET_ME" } }; + + int64_t totalBytesRead = 0; + double totalElapsed = 0.0; + for (auto& it : fileSetNames) { + FileSet fileSet = loadFileSet(basePath, it, true); + auto res = doReadBench(fileSet, true, fileSet.range, false, {}, fileSet.deltaFiles.size(), true); + totalBytesRead += res.first; + totalElapsed += res.second; + } + + double MBperCPUsec = (totalBytesRead / 1024.0 / 1024.0) / totalElapsed; + fmt::print("Read Results: {:.6} MB/cpusec\n", MBperCPUsec); + + return Void(); +} diff --git a/fdbclient/BlobGranuleReader.actor.cpp b/fdbclient/BlobGranuleReader.actor.cpp index 9ba1ccffdb..09c2f5a050 100644 --- a/fdbclient/BlobGranuleReader.actor.cpp +++ b/fdbclient/BlobGranuleReader.actor.cpp @@ -142,7 +142,6 @@ bool isRangeFullyCovered(KeyRange range, Standalone<VectorRef<BlobGranuleChunkRe for (const BlobGranuleChunkRef& chunk : blobChunks) { blobRanges.push_back(chunk.keyRange); } - return range.isCovered(blobRanges); } @@ -194,7 +193,7 @@ TEST_CASE("/fdbserver/blobgranule/isRangeCoveredByBlob") { testAddChunkRange("key_a1"_sr, "key_a9"_sr, continuedChunks); testAddChunkRange("key_a9"_sr, "key_b1"_sr, continuedChunks); testAddChunkRange("key_b1"_sr, "key_b9"_sr, continuedChunks); - ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks) == false); + ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks)); } return Void(); } diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index cc2b0935fd..52bb607d8d 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -1040,13 +1040,10 @@ private: Key lastValue; }; -ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader, - Standalone<VectorRef<KeyValueRef>>* results, - bool encryptedBlock, - Optional<Database> cx) { +void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>* results) { // Read begin key, if this fails then block was invalid. 
- state uint32_t kLen = reader->consumeNetworkUInt32(); - state const uint8_t* k = reader->consume(kLen); + uint32_t kLen = reader->consumeNetworkUInt32(); + const uint8_t* k = reader->consume(kLen); results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); // Read kv pairs and end key @@ -1075,7 +1072,6 @@ ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader, for (auto b : reader->remainder()) if (b != 0xFF) throw restore_corrupted_data_padding(); - return Void(); } ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file, @@ -1083,7 +1079,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference< int len, Optional<Database> cx) { state Standalone<StringRef> buf = makeString(len); - int rLen = wait(file->read(mutateString(buf), len, offset)); + int rLen = wait(uncancellable(holdWhile(buf, file->read(mutateString(buf), len, offset)))); if (rLen != len) throw restore_bad_read(); @@ -1098,7 +1094,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference< // BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION int32_t file_version = reader.consume<int32_t>(); if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) { - wait(decodeKVPairs(&reader, &results, false, cx)); + decodeKVPairs(&reader, &results); } else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) { CODE_PROBE(true, "decoding encrypted block"); ASSERT(cx.present()); @@ -1121,7 +1117,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference< StringRef decryptedData = wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena())); reader = StringRefReader(decryptedData, restore_corrupted_data()); - wait(decodeKVPairs(&reader, &results, true, cx)); + decodeKVPairs(&reader, &results); } else { throw restore_unsupported_file_version(); } @@ -1704,7 +1700,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { state std::unique_ptr<IRangeFileWriter> rangeFile; state BackupConfig backup(task); state Arena arena; - state Reference<TenantEntryCache<Void>> tenantCache = makeReference<TenantEntryCache<Void>>(cx); + state Reference<TenantEntryCache<Void>> tenantCache; // Don't need to check keepRunning(task) here because we will do that while finishing each output file, but // if bc is false then clearly the backup is no longer in progress @@ -1798,6 +1794,10 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { // Initialize range file writer and write begin key if (encryptionEnabled) { CODE_PROBE(true, "using encrypted snapshot file writer"); + if (!tenantCache.isValid()) { + tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH); + wait(tenantCache->init()); + } rangeFile = std::make_unique<EncryptedRangeFileWriter>(cx, &arena, tenantCache, outFile, blockSize); } else { rangeFile = std::make_unique<RangeFileWriter>(outFile, blockSize); diff --git a/fdbclient/IdempotencyId.cpp b/fdbclient/IdempotencyId.cpp index 69f4a9a136..eaba38ed34 100644 --- a/fdbclient/IdempotencyId.cpp +++ b/fdbclient/IdempotencyId.cpp @@ -122,6 +122,7 @@ IdempotencyIdRef generate(Arena& arena) { TEST_CASE("/fdbclient/IdempotencyId/basic") { Arena arena; uint16_t firstBatchIndex = deterministicRandom()->randomUInt32(); + firstBatchIndex &= 0xff7f; // ensure firstBatchIndex+5 won't change the higher order byte uint16_t batchIndex = firstBatchIndex; Version commitVersion = deterministicRandom()->randomInt64(0, 
std::numeric_limits<Version>::max()); std::vector<IdempotencyIdRef> idVector; // Reference diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 665fbd9274..d4382ee086 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -2559,19 +2559,19 @@ bool schemaMatch(json_spirit::mValue const& schemaValue, } } -void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota) { +void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota) { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); auto key = storageQuotaKey(tenantName); - tr.set(key, BinaryWriter::toValue<uint64_t>(quota, Unversioned())); + tr.set(key, BinaryWriter::toValue<int64_t>(quota, Unversioned())); } -ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) { +ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantName))); if (!v.present()) { - return Optional<uint64_t>(); + return Optional<int64_t>(); } - return BinaryReader::fromStringRef<uint64_t>(v.get(), Unversioned()); + return BinaryReader::fromStringRef<int64_t>(v.get(), Unversioned()); } std::string ManagementAPI::generateErrorMessage(const CoordinatorsResult& res) { diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index bb860a1781..8dda15b584 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -39,11 +39,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ENABLE_VERSION_VECTOR, false ); init( ENABLE_VERSION_VECTOR_TLOG_UNICAST, false ); - bool buggifyShortReadWindow = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR; + bool buggifyShortReadWindow = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR; init( MAX_READ_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_READ_TRANSACTION_LIFE_VERSIONS = VERSIONS_PER_SECOND; else if (buggifyShortReadWindow) MAX_READ_TRANSACTION_LIFE_VERSIONS = std::max<int>(1, 0.1 * VERSIONS_PER_SECOND); else if( randomize && BUGGIFY ) MAX_READ_TRANSACTION_LIFE_VERSIONS = 10 * VERSIONS_PER_SECOND; init( MAX_WRITE_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_WRITE_TRANSACTION_LIFE_VERSIONS=std::max<int>(1, 1 * VERSIONS_PER_SECOND); init( MAX_COMMIT_BATCH_INTERVAL, 2.0 ); if( randomize && BUGGIFY ) MAX_COMMIT_BATCH_INTERVAL = 0.5; // Each commit proxy generates a CommitTransactionBatchRequest at least this often, so that versions always advance smoothly MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_READ_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_READ_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough + MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_WRITE_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_WRITE_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough init( MAX_VERSION_RATE_MODIFIER, 0.1 ); init( MAX_VERSION_RATE_OFFSET, VERSIONS_PER_SECOND ); // If the calculated version is more than this amount away from the expected version, it will be clamped to this value. This prevents huge version jumps. 
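+	// Worked example (editorial, hedged): with VERSIONS_PER_SECOND = 1e6 and the defaults
+	// MAX_READ_TRANSACTION_LIFE_VERSIONS = MAX_WRITE_TRANSACTION_LIFE_VERSIONS = 5e6, both clamps above evaluate to
+	// min(2.0, 5e6 / (2 * 1e6)) = 2.0s, leaving MAX_COMMIT_BATCH_INTERVAL unchanged; under the BUGGIFY short read
+	// window (MAX_READ_TRANSACTION_LIFE_VERSIONS = 0.1 * VERSIONS_PER_SECOND), the read clamp lowers it to 0.05s,
+	// still guaranteeing at least two commits per read-version window.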
init( ENABLE_VERSION_VECTOR_HA_OPTIMIZATION, false ); @@ -296,7 +297,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120; init( DD_TENANT_AWARENESS_ENABLED, false ); init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); - + init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); + init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); // TeamRemover init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true @@ -420,6 +422,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // Enable this knob only for experminatal purpose, never enable this in production. // If enabled, all the committed in-memory memtable writes are lost on a crash. init( ROCKSDB_DISABLE_WAL_EXPERIMENTAL, false ); + // If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable ENABLE_CLEAR_RANGE_EAGER_READS knob. + // These knobs have contrary functionality. + init( ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE, false ); if( randomize && BUGGIFY ) ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE = deterministicRandom()->coinflip() ? false : true; + init( ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, 200000 ); // 200KB // Can commit will delay ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD seconds for // ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD times, if rocksdb overloaded. // Set ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD to 0, to disable @@ -787,7 +793,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1; init( CHANGEFEEDSTREAM_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) CHANGEFEEDSTREAM_LIMIT_BYTES = 1; init( BLOBWORKERSTATUSSTREAM_LIMIT_BYTES, 1e4 ); if( randomize && BUGGIFY ) BLOBWORKERSTATUSSTREAM_LIMIT_BYTES = 1; - init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); + init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip() ? 
false : true; init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 ); init( QUICK_GET_VALUE_FALLBACK, true ); init( QUICK_GET_KEY_VALUES_FALLBACK, true ); diff --git a/fdbclient/include/fdbclient/BlobMetadataUtils.h b/fdbclient/include/fdbclient/BlobMetadataUtils.h index 62655b781c..3f486b0bb6 100644 --- a/fdbclient/include/fdbclient/BlobMetadataUtils.h +++ b/fdbclient/include/fdbclient/BlobMetadataUtils.h @@ -62,8 +62,8 @@ struct BlobMetadataDetailsRef { BlobMetadataDomainNameRef domainName, Optional<StringRef> base, VectorRef<StringRef> partitions, - int64_t refreshAt, - int64_t expireAt) + double refreshAt, + double expireAt) : domainId(domainId), domainName(ar, domainName), partitions(ar, partitions), refreshAt(refreshAt), expireAt(expireAt) { if (base.present()) { diff --git a/fdbclient/include/fdbclient/FDBTypes.h b/fdbclient/include/fdbclient/FDBTypes.h index 596f6be1e2..1e837bb33f 100644 --- a/fdbclient/include/fdbclient/FDBTypes.h +++ b/fdbclient/include/fdbclient/FDBTypes.h @@ -336,12 +336,13 @@ struct KeyRangeRef { bool isCovered(std::vector<KeyRangeRef>& ranges) { ASSERT(std::is_sorted(ranges.begin(), ranges.end(), KeyRangeRef::ArbitraryOrder())); KeyRangeRef clone(begin, end); + for (auto r : ranges) { - if (begin < r.begin) + if (clone.begin < r.begin) return false; // uncovered gap between clone.begin and r.begin - if (end <= r.end) + if (clone.end <= r.end) return true; // range is fully covered - if (end > r.begin) + if (clone.end > r.begin) // {clone.begin, r.end} is covered. need to check coverage for {r.end, clone.end} clone = KeyRangeRef(r.end, clone.end); } @@ -1402,6 +1403,25 @@ struct TenantMode { serializer(ar, mode); } + // This does not go back-and-forth cleanly with toString + // The '_experimental' suffix, if present, needs to be removed in order to be parsed. + static TenantMode fromString(std::string mode) { + if (mode.find("_experimental") != std::string::npos) { + mode.replace(mode.find("_experimental"), std::string::npos, ""); + } + if (mode == "disabled") { + return TenantMode::DISABLED; + } else if (mode == "optional") { + return TenantMode::OPTIONAL_TENANT; + } else if (mode == "required") { + return TenantMode::REQUIRED; + } else { + TraceEvent(SevError, "UnknownTenantMode").detail("TenantMode", mode); + ASSERT(false); + throw internal_error(); + } + } + std::string toString() const { switch (mode) { case DISABLED: @@ -1686,10 +1706,20 @@ struct Versionstamp { serializer(ar, beVersion, beBatch); if constexpr (Ar::isDeserializing) { - version = bigEndian64(version); + version = bigEndian64(beVersion); batchNumber = bigEndian16(beBatch); } } }; +template <class Ar> +inline void save(Ar& ar, const Versionstamp& value) { + return const_cast<Versionstamp&>(value).serialize(ar); +} + +template <class Ar> +inline void load(Ar& ar, Versionstamp& value) { + value.serialize(ar); +} + #endif diff --git a/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h index 257aeea723..6f82b3bba1 100644 --- a/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h +++ b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h @@ -104,6 +104,11 @@ Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>> getL // Collect cached cipher keys. 
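+	// Editorial note (hedged): the asserts added below pin the invariant that the two reserved encrypt domain
+	// ids are always paired with their canonical names (FDB_DEFAULT_ENCRYPT_DOMAIN_NAME and
+	// FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME); all other domains are looked up purely by id.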
for (auto& domain : domains) { + if (domain.first == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) { + ASSERT(domain.second == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME); + } else if (domain.first == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) { + ASSERT(domain.second == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME); + } Reference<BlobCipherKey> cachedCipherKey = cipherKeyCache->getLatestCipherKey(domain.first /*domainId*/); if (cachedCipherKey.isValid()) { cipherKeys[domain.first] = cachedCipherKey; @@ -301,7 +306,7 @@ template <class T> Future<TextAndHeaderCipherKeys> getLatestSystemEncryptCipherKeys(const Reference<AsyncVar<T> const>& db, BlobCipherMetrics::UsageType usageType) { return getLatestEncryptCipherKeysForDomain( - db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME, usageType); + db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME, usageType); } ACTOR template <class T> diff --git a/fdbclient/include/fdbclient/KeyBackedTypes.h b/fdbclient/include/fdbclient/KeyBackedTypes.h index 293a90ba07..7446d52484 100644 --- a/fdbclient/include/fdbclient/KeyBackedTypes.h +++ b/fdbclient/include/fdbclient/KeyBackedTypes.h @@ -319,6 +319,11 @@ public: tr->clear(key); } + template <class Transaction> + Future<Void> watch(Transaction tr) { + return tr->watch(key); + } + Key key; }; diff --git a/fdbclient/include/fdbclient/ManagementAPI.actor.h b/fdbclient/include/fdbclient/ManagementAPI.actor.h index bd19da06f6..e220f0b156 100644 --- a/fdbclient/include/fdbclient/ManagementAPI.actor.h +++ b/fdbclient/include/fdbclient/ManagementAPI.actor.h @@ -164,8 +164,8 @@ bool schemaMatch(json_spirit::mValue const& schema, ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID); // Set and get the storage quota per tenant -void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota); -ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName); +void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota); +ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName); #include "flow/unactorcompiler.h" #endif diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 7fce05e810..91a17a8b88 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -502,6 +502,7 @@ Future<Void> decommissionMetacluster(Reference<DB> db) { ManagementClusterMetadata::tenantMetadata().lastTenantId.clear(tr); ManagementClusterMetadata::tenantMetadata().tenantTombstones.clear(tr); ManagementClusterMetadata::tenantMetadata().tombstoneCleanupData.clear(tr); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.clear(tr); wait(managementClusterCheckEmpty(tr)); MetaclusterMetadata::metaclusterRegistration().clear(tr); @@ -797,6 +798,7 @@ struct RemoveClusterImpl { ASSERT(entry.getString(0) == self->ctx.clusterName.get()); ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, entry.getString(1)); ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, entry.getInt(2)); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); } // Erase all of the tenants processed in this transaction from the cluster tenant index @@ -1262,6 +1264,7 @@ struct CreateTenantImpl { self->tenantEntry.tenantState = TenantState::REGISTERING; 
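+		// Editorial note (hedged): each code path below that mutates the tenant map also bumps
+		// lastTenantModification, a versionstamped key; watch-based caches (e.g. TenantEntryCache in
+		// WATCH mode, which watches the analogous TenantMetadata key) use it to detect tenant changes.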
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->tenantEntry); ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantEntry.id, self->tenantName); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); ManagementClusterMetadata::clusterTenantCount.atomicOp( @@ -1317,6 +1320,7 @@ struct CreateTenantImpl { TenantMapEntry updatedEntry = managementEntry.get(); updatedEntry.tenantState = TenantState::READY; ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); } return Void(); @@ -1446,6 +1450,7 @@ struct DeleteTenantImpl { } updatedEntry.tenantState = TenantState::REMOVING; ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); // If this has a rename pair, also mark the other entry for deletion if (self->pairName.present()) { state Optional<TenantMapEntry> pairEntry = wait(tryGetTenantTransaction(tr, self->pairName.get())); @@ -1457,6 +1462,8 @@ struct DeleteTenantImpl { CODE_PROBE(true, "marking pair tenant in removing state"); updatedPairEntry.tenantState = TenantState::REMOVING; ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->pairName.get(), updatedPairEntry); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp( + tr, Versionstamp(), 0); } } @@ -1485,6 +1492,7 @@ struct DeleteTenantImpl { // Erase the tenant entry itself ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, tenantName); ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, tenantEntry.get().id); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); // This is idempotent because this function is only called if the tenant is in the map ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); @@ -1689,6 +1697,7 @@ struct ConfigureTenantImpl { ++self->updatedEntry.configurationSequenceNum; ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->updatedEntry); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); return Void(); } @@ -1724,6 +1733,7 @@ struct ConfigureTenantImpl { tenantEntry.get().tenantState = TenantState::READY; ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, tenantEntry.get()); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); return Void(); } @@ -1770,6 +1780,7 @@ struct RenameTenantImpl { TenantMapEntry tenantEntry) { // Erase the tenant entry itself ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, self->oldName); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); // Remove old tenant from tenant count ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); @@ -1857,6 +1868,7 @@ struct RenameTenantImpl { ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->oldName, updatedOldEntry); 
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); // Add temporary tenant to tenantCount to prevent exceeding capacity during a rename ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); @@ -1919,6 +1931,7 @@ struct RenameTenantImpl { updatedNewEntry.renamePair.reset(); ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry); ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantId, self->newName); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); } // We will remove the old entry from the management cluster diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index b1631aec44..bfa48e8b09 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -237,6 +237,10 @@ public: DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled bool DD_TENANT_AWARENESS_ENABLED; int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed + int TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant is refreshed + // in the TenantCache + int TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL; // How often the storage quota allocated to each tenant is + // refreshed in the TenantCache // TeamRemover to remove redundant teams bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor @@ -343,6 +347,8 @@ public: int ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD; int ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD; bool ROCKSDB_DISABLE_WAL_EXPERIMENTAL; + bool ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE; + int64_t ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT; int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE; int64_t ROCKSDB_BLOCK_SIZE; bool ENABLE_SHARDED_ROCKSDB; diff --git a/fdbclient/include/fdbclient/Tenant.h b/fdbclient/include/fdbclient/Tenant.h index 47d27a0f72..87e1731e90 100644 --- a/fdbclient/include/fdbclient/Tenant.h +++ b/fdbclient/include/fdbclient/Tenant.h @@ -181,6 +181,7 @@ struct TenantMetadataSpecification { KeyBackedObjectProperty<TenantTombstoneCleanupData, decltype(IncludeVersion())> tombstoneCleanupData; KeyBackedSet<Tuple> tenantGroupTenantIndex; KeyBackedObjectMap<TenantGroupName, TenantGroupEntry, decltype(IncludeVersion()), NullCodec> tenantGroupMap; + KeyBackedBinaryValue<Versionstamp> lastTenantModification; TenantMetadataSpecification(KeyRef prefix) : subspace(prefix.withSuffix("tenant/"_sr)), tenantMap(subspace.withSuffix("map/"_sr), IncludeVersion()), @@ -188,7 +189,8 @@ struct TenantMetadataSpecification { tenantCount(subspace.withSuffix("count"_sr)), tenantTombstones(subspace.withSuffix("tombstones/"_sr)), tombstoneCleanupData(subspace.withSuffix("tombstoneCleanup"_sr), IncludeVersion()), tenantGroupTenantIndex(subspace.withSuffix("tenantGroup/tenantIndex/"_sr)), - tenantGroupMap(subspace.withSuffix("tenantGroup/map/"_sr), IncludeVersion()) {} + tenantGroupMap(subspace.withSuffix("tenantGroup/map/"_sr), IncludeVersion()), + lastTenantModification(subspace.withSuffix("lastModification"_sr)) {} }; struct TenantMetadata { @@ -203,6 +205,7 @@ struct TenantMetadata { static inline auto& tombstoneCleanupData() { return instance().tombstoneCleanupData; } static inline auto& 
tenantGroupTenantIndex() { return instance().tenantGroupTenantIndex; } static inline auto& tenantGroupMap() { return instance().tenantGroupMap; } + static inline auto& lastTenantModification() { return instance().lastTenantModification; } static Key tenantMapPrivatePrefix(); }; diff --git a/fdbclient/include/fdbclient/TenantEntryCache.actor.h b/fdbclient/include/fdbclient/TenantEntryCache.actor.h index cd35c5a985..4ff5438d5a 100644 --- a/fdbclient/include/fdbclient/TenantEntryCache.actor.h +++ b/fdbclient/include/fdbclient/TenantEntryCache.actor.h @@ -44,8 +44,14 @@ using TenantNameEntryPair = std::pair<TenantName, TenantMapEntry>; using TenantNameEntryPairVec = std::vector<TenantNameEntryPair>; -enum class TenantEntryCacheRefreshReason { INIT = 1, PERIODIC_TASK = 2, CACHE_MISS = 3, REMOVE_ENTRY = 4 }; -enum class TenantEntryCacheRefreshMode { PERIODIC_TASK = 1, NONE = 2 }; +enum class TenantEntryCacheRefreshReason { + INIT = 1, + PERIODIC_TASK = 2, + CACHE_MISS = 3, + REMOVE_ENTRY = 4, + WATCH_TRIGGER = 5 +}; +enum class TenantEntryCacheRefreshMode { PERIODIC_TASK = 1, WATCH = 2, NONE = 3 }; template <class T> struct TenantEntryCachePayload { @@ -62,12 +68,6 @@ using TenantEntryCachePayloadFunc = std::function<TenantEntryCachePayload<T>(con // 1. Lookup by 'TenantId' // 2. Lookup by 'TenantPrefix' // 3. Lookup by 'TenantName' -// -// TODO: -// ---- -// The cache allows user to construct the 'cached object' by supplying a callback. The cache implements a periodic -// refresh mechanism, polling underlying database for updates (add/remove tenants), in future we might want to implement -// database range-watch to monitor such updates template <class T> class TenantEntryCache : public ReferenceCounted<TenantEntryCache<T>>, NonCopyable { @@ -78,6 +78,10 @@ private: TenantEntryCacheRefreshMode refreshMode; Future<Void> refresher; + Future<Void> watchRefresher; + Future<Void> lastTenantIdRefresher; + Promise<Void> setInitialWatch; + Optional<int64_t> lastTenantId; Map<int64_t, TenantEntryCachePayload<T>> mapByTenantId; Map<TenantName, TenantEntryCachePayload<T>> mapByTenantName; @@ -87,6 +91,7 @@ private: Counter refreshByCacheInit; Counter refreshByCacheMiss; Counter numRefreshes; + Counter refreshByWatchTrigger; ACTOR static Future<TenantNameEntryPairVec> getTenantList(Reference<ReadYourWritesTransaction> tr) { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); @@ -102,16 +107,166 @@ private: return tenantList.results; } + ACTOR static Future<Void> refreshCacheById(int64_t tenantId, + TenantEntryCache<T>* cache, + TenantEntryCacheRefreshReason reason) { + TraceEvent(SevDebug, "TenantEntryCacheIDRefreshStart", cache->id()).detail("Reason", static_cast<int>(reason)); + state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + state Optional<TenantName> name = wait(TenantMetadata::tenantIdIndex().get(tr, tenantId)); + if (name.present()) { + Optional<TenantMapEntry> entry = wait(TenantMetadata::tenantMap().get(tr, name.get())); + if (entry.present()) { + cache->put(std::make_pair(name.get(), entry.get())); + updateCacheRefreshMetrics(cache, reason); + } + } + break; + } catch (Error& e) { + wait(tr->onError(e)); + } + } + TraceEvent(SevDebug, "TenantEntryCacheIDRefreshEnd", cache->id()).detail("Reason", static_cast<int>(reason)); + return Void(); + } + + ACTOR static Future<Void> refreshCacheByName(TenantName name, + 
TenantEntryCache<T>* cache,
+	                                               TenantEntryCacheRefreshReason reason) {
+		TraceEvent(SevDebug, "TenantEntryCacheNameRefreshStart", cache->id())
+		    .detail("Reason", static_cast<int>(reason));
+		state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
+		loop {
+			try {
+				tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
+				tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
+				Optional<TenantMapEntry> entry = wait(TenantMetadata::tenantMap().get(tr, name));
+				if (entry.present()) {
+					cache->put(std::make_pair(name, entry.get()));
+					updateCacheRefreshMetrics(cache, reason);
+				}
+				break;
+			} catch (Error& e) {
+				wait(tr->onError(e));
+			}
+		}
+		TraceEvent(SevDebug, "TenantEntryCacheNameRefreshEnd", cache->id()).detail("Reason", static_cast<int>(reason));
+		return Void();
+	}
+
 	static void updateCacheRefreshMetrics(TenantEntryCache<T>* cache, TenantEntryCacheRefreshReason reason) {
 		if (reason == TenantEntryCacheRefreshReason::INIT) {
 			cache->refreshByCacheInit += 1;
 		} else if (reason == TenantEntryCacheRefreshReason::CACHE_MISS) {
 			cache->refreshByCacheMiss += 1;
+		} else if (reason == TenantEntryCacheRefreshReason::WATCH_TRIGGER) {
+			cache->refreshByWatchTrigger += 1;
 		}
 
 		cache->numRefreshes += 1;
 	}
 
+	ACTOR static Future<Void> refreshCacheUsingWatch(TenantEntryCache<T>* cache, TenantEntryCacheRefreshReason reason) {
+		TraceEvent(SevDebug, "TenantEntryCacheRefreshUsingWatchStart", cache->id())
+		    .detail("Reason", static_cast<int>(reason));
+
+		state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
+		loop {
+			try {
+				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+				tr->setOption(FDBTransactionOptions::LOCK_AWARE);
+				tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+				state Future<Void> tenantModifiedWatch = TenantMetadata::lastTenantModification().watch(tr);
+				wait(tr->commit());
+				TraceEvent(SevDebug, "TenantEntryCacheRefreshWatchSet", cache->id());
+				// setInitialWatch indicates that an initial watch has been set for the lastTenantModification
+				// key. Currently this is only used in simulation to avoid a race condition where a tenant is
+				// created before the initial watch is set. However, it can be enabled by passing
+				// waitForInitalWatch = true to the init() method.
+				if (cache->setInitialWatch.canBeSet()) {
+					cache->setInitialWatch.send(Void());
+				}
+				wait(tenantModifiedWatch);
+				// The watch triggered, so tenant metadata was updated; refresh the cache
+				TraceEvent(SevDebug, "TenantEntryCacheRefreshUsingWatchTriggered", cache->id())
+				    .detail("Reason", static_cast<int>(reason));
+				wait(refreshImpl(cache, reason));
+				tr->reset();
+			} catch (Error& e) {
+				if (e.code() != error_code_actor_cancelled) {
+					TraceEvent("TenantEntryCacheRefreshUsingWatchError", cache->id())
+					    .errorUnsuppressed(e)
+					    .suppressFor(1.0);
+				}
+				wait(tr->onError(e));
+				// If the watch threw an error, refresh the cache in case it was updated
+				wait(refreshImpl(cache, reason));
+			}
+		}
+	}
+
+	static bool tenantsEnabled(TenantEntryCache<T>* cache) {
+		// Avoid using the cache if the tenant mode is disabled. However, since we use clientInfo, it may not
+		// be fully up to date (i.e., it may indicate the tenant mode is disabled when in fact it is required).
+		// Thus, if at least one tenant has been created on the cluster, use the cache to avoid an incorrect
+		// miss.
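+		// Example (editorial, hedged): right after a cluster is reconfigured from tenant_mode=disabled to
+		// required, clientInfo may still report DISABLED; if lastTenantId is already positive, at least one
+		// tenant exists, so we must still consult the cache rather than short-circuit to "not found".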
+
+	static bool tenantsEnabled(TenantEntryCache<T>* cache) {
+		// Avoid using the cache if the tenant mode is disabled. However, since we use clientInfo, it may not always be
+		// fully up to date (i.e. it may indicate that the tenant mode is disabled when in fact it is required). Thus,
+		// if at least one tenant has been created on the cluster, use the cache to avoid an incorrect miss.
+		if (cache->getDatabase()->clientInfo->get().tenantMode == TenantMode::DISABLED) {
+			if (!cache->lastTenantId.present()) {
+				return false;
+			}
+			return cache->lastTenantId.get() > 0;
+		}
+		return true;
+	}
+
+	ACTOR static Future<Void> setLastTenantId(TenantEntryCache<T>* cache) {
+		state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
+		loop {
+			try {
+				tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
+				tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
+				tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+				Optional<int64_t> lastTenantId = wait(TenantMetadata::lastTenantId().get(tr));
+				cache->lastTenantId = lastTenantId;
+				return Void();
+			} catch (Error& e) {
+				wait(tr->onError(e));
+			}
+		}
+	}
+
+	ACTOR static Future<Void> lastTenantIdWatch(TenantEntryCache<T>* cache) {
+		TraceEvent(SevDebug, "TenantEntryCacheLastTenantIdWatchStart", cache->id());
+		// Monitor for any changes to the last tenant id and update it as necessary
+		state Reference<ReadYourWritesTransaction> tr = cache->getDatabase()->createTransaction();
+		loop {
+			try {
+				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+				tr->setOption(FDBTransactionOptions::LOCK_AWARE);
+				tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+				state Future<Void> lastTenantIdWatch = tr->watch(TenantMetadata::lastTenantId().key);
+				wait(tr->commit());
+				wait(lastTenantIdWatch);
+				wait(setLastTenantId(cache));
+				tr->reset();
+			} catch (Error& e) {
+				state Error err(e);
+				if (err.code() != error_code_actor_cancelled) {
+					TraceEvent("TenantEntryCacheLastTenantIdWatchError", cache->id())
+					    .errorUnsuppressed(err)
+					    .suppressFor(1.0);
+					// In case the watch errors out, refresh lastTenantId anyway; it may have changed and we would
+					// otherwise miss the update
+					wait(setLastTenantId(cache));
+				}
+				wait(tr->onError(err));
+			}
+		}
+	}
+
	ACTOR static Future<Void> refreshImpl(TenantEntryCache<T>* cache, TenantEntryCacheRefreshReason reason) {
		TraceEvent(SevDebug, "TenantEntryCacheRefreshStart", cache->id()).detail("Reason", static_cast<int>(reason));
 
@@ -130,9 +285,7 @@ private:
				break;
			} catch (Error& e) {
				if (e.code() != error_code_actor_cancelled) {
-					TraceEvent(SevInfo, "TenantEntryCacheRefreshError", cache->id())
-					    .errorUnsuppressed(e)
-					    .suppressFor(1.0);
+					TraceEvent("TenantEntryCacheRefreshError", cache->id()).errorUnsuppressed(e).suppressFor(1.0);
				}
				wait(tr->onError(e));
			}
@@ -151,12 +304,22 @@ private:
			return ret;
		}
 
-		TraceEvent(SevInfo, "TenantEntryCacheGetByIdRefresh").detail("TenantId", tenantId);
+		if (!tenantsEnabled(cache)) {
+			// If tenants are disabled on the cluster, avoid using the cache
+			return Optional<TenantEntryCachePayload<T>>();
+		}
 
-		// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
-		// TODO: Cache will implement a "KeyRange" watch, monitoring notification when a new entry gets added or any
-		// existing entry gets updated within the KeyRange of interest. Hence, misses would be very rare
-		wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
+		TraceEvent("TenantEntryCacheGetByIdRefresh").detail("TenantId", tenantId);
+
+		if (cache->refreshMode == TenantEntryCacheRefreshMode::WATCH) {
+			// Entry not found. Do a point refresh
+			// TODO: Don't initiate refresh if tenantId < maxTenantId (stored as a system key currently) as we know that
+			// such a tenant does not exist (it has either never existed or has been deleted)
+			wait(refreshCacheById(tenantId, cache, TenantEntryCacheRefreshReason::CACHE_MISS));
+		} else {
+			// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
+			wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
+		}
 
		cache->misses += 1;
		return cache->lookupById(tenantId);
@@ -170,12 +333,20 @@ private:
			return ret;
		}
 
+		if (!tenantsEnabled(cache)) {
+			// If tenants are disabled on the cluster, avoid using the cache
+			return Optional<TenantEntryCachePayload<T>>();
+		}
+
		TraceEvent("TenantEntryCacheGetByNameRefresh").detail("TenantName", name);
 
-		// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
-		// TODO: Cache will implement a "KeyRange" watch, monitoring notification when a new entry gets added or any
-		// existing entry gets updated within the KeyRange of interest. Hence, misses would be very rare
-		wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
+		if (cache->refreshMode == TenantEntryCacheRefreshMode::WATCH) {
+			// Entry not found. Do a point refresh
+			wait(refreshCacheByName(name, cache, TenantEntryCacheRefreshReason::CACHE_MISS));
+		} else {
+			// Entry not found. Refresh cacheEntries by scanning underlying KeyRange.
+			wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS));
+		}
 
		cache->misses += 1;
		return cache->lookupByName(name);
@@ -272,7 +443,18 @@ public:
	    hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics),
	    refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
	    refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
-	    numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
+	    numRefreshes("TenantEntryCacheNumRefreshes", metrics),
+	    refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
+		TraceEvent("TenantEntryCacheCreatedDefaultFunc", uid);
+	}
+
+	TenantEntryCache(Database db, TenantEntryCacheRefreshMode mode)
+	  : uid(deterministicRandom()->randomUniqueID()), db(db), createPayloadFunc(defaultCreatePayload),
+	    refreshMode(mode), metrics("TenantEntryCacheMetrics", uid.toString()), hits("TenantEntryCacheHits", metrics),
+	    misses("TenantEntryCacheMisses", metrics), refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
+	    refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
+	    numRefreshes("TenantEntryCacheNumRefreshes", metrics),
+	    refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
		TraceEvent("TenantEntryCacheCreatedDefaultFunc", uid);
	}
 
@@ -282,7 +464,8 @@ public:
	    hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics),
	    refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
	    refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
-	    numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
+	    numRefreshes("TenantEntryCacheNumRefreshes", metrics),
+	    refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
		TraceEvent("TenantEntryCacheCreated", uid);
	}
 
@@ -291,7 +474,8 @@ public:
	    metrics("TenantEntryCacheMetrics", uid.toString()), hits("TenantEntryCacheHits", metrics),
	    misses("TenantEntryCacheMisses", metrics), refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
	    refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
-	    numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
+	    numRefreshes("TenantEntryCacheNumRefreshes", metrics),
+	    refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
		TraceEvent("TenantEntryCacheCreated", uid);
	}
 
@@ -300,26 +484,36 @@ public:
	    hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics),
	    refreshByCacheInit("TenantEntryCacheRefreshInit", metrics),
	    refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics),
-	    numRefreshes("TenantEntryCacheNumRefreshes", metrics) {
+	    numRefreshes("TenantEntryCacheNumRefreshes", metrics),
+	    refreshByWatchTrigger("TenantEntryCacheRefreshWatchTrigger", metrics) {
		TraceEvent("TenantEntryCacheCreated", uid);
	}
 
-	Future<Void> init() {
+	Future<Void> init(bool waitForInitalWatch = false) {
		TraceEvent("TenantEntryCacheInit", uid);
 
		Future<Void> f = refreshImpl(this, TenantEntryCacheRefreshReason::INIT);
 
		// Launch reaper task to periodically refresh cache by scanning database KeyRange
		TenantEntryCacheRefreshReason reason = TenantEntryCacheRefreshReason::PERIODIC_TASK;
+		Future<Void> initalWatchFuture = Void();
+		lastTenantIdRefresher = lastTenantIdWatch(this);
		if (refreshMode == TenantEntryCacheRefreshMode::PERIODIC_TASK) {
			refresher = recurringAsync([&, reason]() { return refresh(reason); },
			                           CLIENT_KNOBS->TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, /* interval */
			                           true, /* absoluteIntervalDelay */
			                           CLIENT_KNOBS->TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, /* initialDelay */
			                           TaskPriority::Worker);
+		} else if (refreshMode == TenantEntryCacheRefreshMode::WATCH) {
+			if (waitForInitalWatch) {
+				initalWatchFuture = setInitialWatch.getFuture();
+			}
+			watchRefresher = refreshCacheUsingWatch(this, TenantEntryCacheRefreshReason::WATCH_TRIGGER);
		}
 
-		return f;
+		Future<Void> setLastTenant = setLastTenantId(this);
+
+		return f && initalWatchFuture && setLastTenant;
	}
 
	Database getDatabase() const { return db; }
@@ -341,28 +535,33 @@ public:
	}
 
	void put(const TenantNameEntryPair& pair) {
-		TenantEntryCachePayload<T> payload = createPayloadFunc(pair.first, pair.second);
-		auto idItr = mapByTenantId.find(pair.second.id);
-		auto nameItr = mapByTenantName.find(pair.first);
+		const auto& [name, entry] = pair;
+		TenantEntryCachePayload<T> payload = createPayloadFunc(name, entry);
+		auto idItr = mapByTenantId.find(entry.id);
+		auto nameItr = mapByTenantName.find(name);
 
		Optional<TenantName> existingName;
		Optional<int64_t> existingId;
		if (nameItr != mapByTenantName.end()) {
			existingId = nameItr->value.entry.id;
-			mapByTenantId.erase(nameItr->value.entry.id);
		}
		if (idItr != mapByTenantId.end()) {
			existingName = idItr->value.name;
-			mapByTenantName.erase(idItr->value.name);
+		}
+		if (existingId.present()) {
+			mapByTenantId.erase(existingId.get());
+		}
+		if (existingName.present()) {
+			mapByTenantName.erase(existingName.get());
		}
 
-		mapByTenantId[pair.second.id] = payload;
-		mapByTenantName[pair.first] = payload;
+		mapByTenantId[entry.id] = payload;
+		mapByTenantName[name] = payload;
 
		TraceEvent("TenantEntryCachePut")
-		    .detail("TenantName", pair.first)
+		    .detail("TenantName", name)
		    .detail("TenantNameExisting", existingName)
-		    .detail("TenantID", pair.second.id)
+		    .detail("TenantID", entry.id)
		    .detail("TenantIDExisting", existingId)
		    .detail("TenantPrefix", pair.second.prefix);
 
@@ -384,7 +583,8 @@ public:
	Counter::Value numCacheRefreshes() const { return numRefreshes.getValue(); }
	Counter::Value numRefreshByMisses() const { return refreshByCacheMiss.getValue(); }
	Counter::Value numRefreshByInit() const { return refreshByCacheInit.getValue(); }
+	Counter::Value numWatchRefreshes() const { return refreshByWatchTrigger.getValue(); }
 };
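The reworked put() above maintains two indexes that must stay consistent: every entry is reachable both by tenant id and by tenant name, and inserting a new pair must evict any old entry that collides on either key. The diff also reorders the erases so that both lookups complete before either map is mutated. A standalone sketch of that invariant over std::map (all types here are illustrative, not the FDB Map):

// Standalone sketch of the dual-index insert used by put(): a new entry
// evicts any old entry that collides on either key, and both lookups
// finish before either map is mutated. Illustrative types only.
#include <cassert>
#include <cstdint>
#include <map>
#include <optional>
#include <string>

struct Payload { std::string name; int64_t id; };

std::map<int64_t, Payload> byId;
std::map<std::string, Payload> byName;

void put(const std::string& name, int64_t id) {
    std::optional<int64_t> existingId;
    std::optional<std::string> existingName;
    if (auto it = byName.find(name); it != byName.end())
        existingId = it->second.id;     // old id bound to this name
    if (auto it = byId.find(id); it != byId.end())
        existingName = it->second.name; // old name bound to this id
    // Erase only after both lookups, so one erase cannot hide the other collision.
    if (existingId) byId.erase(*existingId);
    if (existingName) byName.erase(*existingName);
    byId[id] = Payload{ name, id };
    byName[name] = Payload{ name, id };
}

int main() {
    put("t1", 1);
    put("t1", 2); // same name, new id: must not leave a stale byId[1]
    assert(byId.count(1) == 0 && byId.at(2).name == "t1");
    put("t2", 2); // same id, new name: must not leave a stale byName["t1"]
    assert(byName.count("t1") == 0 && byName.at("t2").id == 2);
}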
"flow/unactorcompiler.h" -#endif // FDBCLIENT_TENANTENTRYCACHE_ACTOR_H \ No newline at end of file +#endif // FDBCLIENT_TENANTENTRYCACHE_ACTOR_H diff --git a/fdbclient/include/fdbclient/TenantManagement.actor.h b/fdbclient/include/fdbclient/TenantManagement.actor.h index 6e91c8fb90..134c3b7590 100644 --- a/fdbclient/include/fdbclient/TenantManagement.actor.h +++ b/fdbclient/include/fdbclient/TenantManagement.actor.h @@ -178,6 +178,7 @@ Future<std::pair<Optional<TenantMapEntry>, bool>> createTenantTransaction( TenantMetadata::tenantMap().set(tr, name, tenantEntry); TenantMetadata::tenantIdIndex().set(tr, tenantEntry.id, name); + TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0); if (tenantEntry.tenantGroup.present()) { TenantMetadata::tenantGroupTenantIndex().insert(tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), name)); @@ -346,6 +347,7 @@ Future<Void> deleteTenantTransaction(Transaction tr, TenantMetadata::tenantMap().erase(tr, name); TenantMetadata::tenantIdIndex().erase(tr, tenantEntry.get().id); TenantMetadata::tenantCount().atomicOp(tr, -1, MutationRef::AddValue); + TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0); if (tenantEntry.get().tenantGroup.present()) { TenantMetadata::tenantGroupTenantIndex().erase(tr, @@ -420,6 +422,7 @@ Future<Void> configureTenantTransaction(Transaction tr, tr->setOption(FDBTransactionOptions::RAW_ACCESS); TenantMetadata::tenantMap().set(tr, tenantName, updatedTenantEntry); + TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0); // If the tenant group was changed, we need to update the tenant group metadata structures if (originalEntry.tenantGroup != updatedTenantEntry.tenantGroup) { @@ -523,6 +526,7 @@ Future<Void> renameTenantTransaction(Transaction tr, TenantMetadata::tenantMap().erase(tr, oldName); TenantMetadata::tenantMap().set(tr, newName, oldEntry.get()); TenantMetadata::tenantIdIndex().set(tr, oldEntry.get().id, newName); + TenantMetadata::lastTenantModification().setVersionstamp(tr, Versionstamp(), 0); // Update the tenant group index to reflect the new tenant name if (oldEntry.get().tenantGroup.present()) { diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index 2d27dc5155..6dbcf6d6c1 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -202,8 +202,9 @@ description is not currently required but encouraged. description="Deprecated. Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect." defaultFor="23"/> <Option name="transaction_automatic_idempotency" code="506" - description="Set a random idempotency id for all transactions. See the transaction option description for more information." - defaultFor="505"/> + description="Set a random idempotency id for all transactions. See the transaction option description for more information. This feature is in development and not ready for general use." + defaultFor="505" + hidden="true"/> <Option name="transaction_bypass_unreadable" code="700" description="Allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. This sets the ``bypass_unreadable`` option of each transaction created by this database. See the transaction option description for more information." defaultFor="1100"/> @@ -278,9 +279,11 @@ description is not currently required but encouraged. 
description="Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit." /> <Option name="idempotency_id" code="504" paramType="String" paramDescription="Unique ID" - description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes." /> + description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. This feature is in development and not ready for general use." + hidden="true" /> <Option name="automatic_idempotency" code="505" - description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future." /> + description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future. This feature is in development and not ready for general use." + hidden="true" /> <Option name="snapshot_ryw_enable" code="600" description="Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior." 
/> <Option name="snapshot_ryw_disable" code="601" diff --git a/fdbrpc/AsyncFileEncrypted.actor.cpp b/fdbrpc/AsyncFileEncrypted.actor.cpp index ec37f79414..09edd14413 100644 --- a/fdbrpc/AsyncFileEncrypted.actor.cpp +++ b/fdbrpc/AsyncFileEncrypted.actor.cpp @@ -48,15 +48,17 @@ public: ACTOR static Future<Standalone<StringRef>> readBlock(AsyncFileEncrypted* self, uint32_t block) { state Arena arena; state unsigned char* encrypted = new (arena) unsigned char[FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE]; - int bytes = wait( - self->file->read(encrypted, FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE, FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE * block)); + int bytes = wait(uncancellable(holdWhile(arena, + self->file->read(encrypted, + FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE, + FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE * block)))); StreamCipherKey const* cipherKey = StreamCipherKey::getGlobalCipherKey(); DecryptionStreamCipher decryptor(cipherKey, self->getIV(block)); auto decrypted = decryptor.decrypt(encrypted, bytes, arena); return Standalone<StringRef>(decrypted, arena); } - ACTOR static Future<int> read(AsyncFileEncrypted* self, void* data, int length, int64_t offset) { + ACTOR static Future<int> read(Reference<AsyncFileEncrypted> self, void* data, int length, int64_t offset) { state const uint32_t firstBlock = offset / FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE; state const uint32_t lastBlock = (offset + length - 1) / FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE; state uint32_t block; @@ -70,7 +72,7 @@ public: if (cachedBlock.present()) { plaintext = cachedBlock.get(); } else { - wait(store(plaintext, readBlock(self, block))); + wait(store(plaintext, readBlock(self.getPtr(), block))); self->readBuffers.insert(block, plaintext); } auto start = (block == firstBlock) ? plaintext.begin() + (offset % FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE) @@ -96,7 +98,7 @@ public: return bytesRead; } - ACTOR static Future<Void> write(AsyncFileEncrypted* self, void const* data, int length, int64_t offset) { + ACTOR static Future<Void> write(Reference<AsyncFileEncrypted> self, void const* data, int length, int64_t offset) { ASSERT(self->mode == AsyncFileEncrypted::Mode::APPEND_ONLY); // All writes must append to the end of the file: ASSERT_EQ(offset, self->currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE + self->offsetInBlock); @@ -122,7 +124,7 @@ public: return Void(); } - ACTOR static Future<Void> sync(AsyncFileEncrypted* self) { + ACTOR static Future<Void> sync(Reference<AsyncFileEncrypted> self) { ASSERT(self->mode == AsyncFileEncrypted::Mode::APPEND_ONLY); wait(self->writeLastBlockToFile()); wait(self->file->sync()); @@ -135,7 +137,7 @@ public: Arena arena; auto zeroes = new (arena) unsigned char[length]; memset(zeroes, 0, length); - wait(self->write(zeroes, length, offset)); + wait(uncancellable(holdWhile(arena, self->write(zeroes, length, offset)))); return Void(); } }; @@ -159,11 +161,11 @@ void AsyncFileEncrypted::delref() { } Future<int> AsyncFileEncrypted::read(void* data, int length, int64_t offset) { - return AsyncFileEncryptedImpl::read(this, data, length, offset); + return AsyncFileEncryptedImpl::read(Reference<AsyncFileEncrypted>::addRef(this), data, length, offset); } Future<Void> AsyncFileEncrypted::write(void const* data, int length, int64_t offset) { - return AsyncFileEncryptedImpl::write(this, data, length, offset); + return AsyncFileEncryptedImpl::write(Reference<AsyncFileEncrypted>::addRef(this), data, length, offset); } Future<Void> AsyncFileEncrypted::zeroRange(int64_t offset, int64_t length) { @@ -177,7 +179,7 @@ Future<Void> 
AsyncFileEncrypted::truncate(int64_t size) { Future<Void> AsyncFileEncrypted::sync() { ASSERT(mode == Mode::APPEND_ONLY); - return AsyncFileEncryptedImpl::sync(this); + return AsyncFileEncryptedImpl::sync(Reference<AsyncFileEncrypted>::addRef(this)); } Future<Void> AsyncFileEncrypted::flush() { @@ -217,7 +219,11 @@ StreamCipher::IV AsyncFileEncrypted::getIV(uint32_t block) const { } Future<Void> AsyncFileEncrypted::writeLastBlockToFile() { - return file->write(&writeBuffer[0], offsetInBlock, currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE); + // The source buffer for the write is owned by *this so this must be kept alive by reference count until the write + // is finished. + return uncancellable( + holdWhile(Reference<AsyncFileEncrypted>::addRef(this), + file->write(&writeBuffer[0], offsetInBlock, currentBlock * FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE))); } size_t AsyncFileEncrypted::RandomCache::evict() { diff --git a/fdbrpc/include/fdbrpc/AsyncFileChaos.h b/fdbrpc/include/fdbrpc/AsyncFileChaos.h index 165b297bbf..3853b55ae8 100644 --- a/fdbrpc/include/fdbrpc/AsyncFileChaos.h +++ b/fdbrpc/include/fdbrpc/AsyncFileChaos.h @@ -71,8 +71,9 @@ public: // Wait for diskDelay before submitting the I/O // Template types are being provided explicitly because they can't be automatically deduced for some reason. + // Capture file by value in case this is destroyed during the delay return mapAsync<Void, std::function<Future<int>(Void)>, int>( - delay(diskDelay), [=](Void _) -> Future<int> { return file->read(data, length, offset); }); + delay(diskDelay), [=, file = file](Void _) -> Future<int> { return file->read(data, length, offset); }); } Future<Void> write(void const* data, int length, int64_t offset) override { @@ -111,12 +112,14 @@ public: } // Wait for diskDelay before submitting the I/O - return mapAsync<Void, std::function<Future<Void>(Void)>, Void>(delay(diskDelay), [=](Void _) -> Future<Void> { - if (pdata) - return holdWhile(arena, file->write(pdata, length, offset)); + // Capture file by value in case this is destroyed during the delay + return mapAsync<Void, std::function<Future<Void>(Void)>, Void>( + delay(diskDelay), [=, file = file](Void _) -> Future<Void> { + if (pdata) + return holdWhile(arena, file->write(pdata, length, offset)); - return file->write(data, length, offset); - }); + return file->write(data, length, offset); + }); } Future<Void> truncate(int64_t size) override { @@ -125,8 +128,9 @@ public: return file->truncate(size); // Wait for diskDelay before submitting the I/O + // Capture file by value in case this is destroyed during the delay return mapAsync<Void, std::function<Future<Void>(Void)>, Void>( - delay(diskDelay), [=](Void _) -> Future<Void> { return file->truncate(size); }); + delay(diskDelay), [=, file = file](Void _) -> Future<Void> { return file->truncate(size); }); } Future<Void> sync() override { @@ -135,8 +139,9 @@ public: return file->sync(); // Wait for diskDelay before submitting the I/O + // Capture file by value in case this is destroyed during the delay return mapAsync<Void, std::function<Future<Void>(Void)>, Void>( - delay(diskDelay), [=](Void _) -> Future<Void> { return file->sync(); }); + delay(diskDelay), [=, file = file](Void _) -> Future<Void> { return file->sync(); }); } Future<int64_t> size() const override { @@ -145,8 +150,9 @@ public: return file->size(); // Wait for diskDelay before submitting the I/O + // Capture file by value in case this is destroyed during the delay return mapAsync<Void, std::function<Future<int64_t>(Void)>, int64_t>( - 
delay(diskDelay), [=](Void _) -> Future<int64_t> { return file->size(); });
+		    delay(diskDelay), [=, file = file](Void _) -> Future<int64_t> { return file->size(); });
	}
 
	int64_t debugFD() const override { return file->debugFD(); }
diff --git a/fdbrpc/include/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/include/fdbrpc/AsyncFileNonDurable.actor.h
index f4f883d6c1..01fc71adfd 100644
--- a/fdbrpc/include/fdbrpc/AsyncFileNonDurable.actor.h
+++ b/fdbrpc/include/fdbrpc/AsyncFileNonDurable.actor.h
@@ -46,12 +46,17 @@ ACTOR Future<Void> sendErrorOnProcess(ISimulator::ProcessInfo* process,
                                       TaskPriority taskID);
 
 ACTOR template <class T>
-Future<T> sendErrorOnShutdown(Future<T> in) {
-	choose {
-		when(wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()))) {
-			throw io_error().asInjectedFault();
+Future<T> sendErrorOnShutdown(Future<T> in, bool assertOnCancel = false) {
+	try {
+		choose {
+			when(wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()))) {
+				throw io_error().asInjectedFault();
+			}
+			when(T rep = wait(in)) { return rep; }
		}
-		when(T rep = wait(in)) { return rep; }
+	} catch (Error& e) {
+		ASSERT(e.code() != error_code_actor_cancelled || !assertOnCancel);
+		throw;
	}
 }
 
@@ -59,9 +64,12 @@ class AsyncFileDetachable final : public IAsyncFile, public ReferenceCounted<Asy
 private:
	Reference<IAsyncFile> file;
	Future<Void> shutdown;
+	bool assertOnReadWriteCancel;
 
 public:
-	explicit AsyncFileDetachable(Reference<IAsyncFile> file) : file(file) { shutdown = doShutdown(this); }
+	explicit AsyncFileDetachable(Reference<IAsyncFile> file) : file(file), assertOnReadWriteCancel(true) {
+		shutdown = doShutdown(this);
+	}
 
	ACTOR Future<Void> doShutdown(AsyncFileDetachable* self) {
		wait(success(g_simulator->getCurrentProcess()->shutdownSignal.getFuture()));
@@ -84,13 +92,13 @@ public:
	Future<int> read(void* data, int length, int64_t offset) override {
		if (!file.getPtr() || g_simulator->getCurrentProcess()->shutdownSignal.getFuture().isReady())
			return io_error().asInjectedFault();
-		return sendErrorOnShutdown(file->read(data, length, offset));
+		return sendErrorOnShutdown(file->read(data, length, offset), assertOnReadWriteCancel);
	}
 
	Future<Void> write(void const* data, int length, int64_t offset) override {
		if (!file.getPtr() || g_simulator->getCurrentProcess()->shutdownSignal.getFuture().isReady())
			return io_error().asInjectedFault();
-		return sendErrorOnShutdown(file->write(data, length, offset));
+		return sendErrorOnShutdown(file->write(data, length, offset), assertOnReadWriteCancel);
	}
 
	Future<Void> truncate(int64_t size) override {
diff --git a/fdbrpc/include/fdbrpc/AsyncFileReadAhead.actor.h b/fdbrpc/include/fdbrpc/AsyncFileReadAhead.actor.h
index 8b0536c0d5..144cfcf1f3 100644
--- a/fdbrpc/include/fdbrpc/AsyncFileReadAhead.actor.h
+++ b/fdbrpc/include/fdbrpc/AsyncFileReadAhead.actor.h
@@ -52,7 +52,7 @@ public:
		state Reference<CacheBlock> block(new CacheBlock(length));
		try {
-			int len = wait(f->m_f->read(block->data, length, offset));
+			int len = wait(uncancellable(holdWhile(block, f->m_f->read(block->data, length, offset))));
			block->len = len;
		} catch (Error& e) {
			f->m_max_concurrent_reads.release(1);
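The AsyncFileWriteChecker changes that follow apply the same lifetime rule as the holdWhile/uncancellable fixes above: an asynchronous continuation must own a strong reference to the object whose state it touches, because the caller may drop its last reference before the I/O completes. A minimal standalone C++ sketch of that pattern, with shared_ptr standing in for flow's Reference<> and addRef (all names illustrative):

// Standalone sketch: an async continuation keeps its object alive by
// capturing a strong reference instead of a bare `this` that may dangle.
#include <functional>
#include <iostream>
#include <memory>
#include <vector>

std::vector<std::function<void()>> pendingCallbacks; // stand-in for an event loop

struct Checker : std::enable_shared_from_this<Checker> {
    int pagesChecksummed = 0;
    void readAsync() {
        auto self = shared_from_this(); // like Reference<AsyncFileWriteChecker>::addRef(this)
        pendingCallbacks.push_back([self] {
            // Safe even if the caller released its reference before this runs.
            self->pagesChecksummed++;
            std::cout << "pages checksummed: " << self->pagesChecksummed << "\n";
        });
    }
};

int main() {
    auto checker = std::make_shared<Checker>();
    checker->readAsync();
    checker.reset(); // caller's reference is gone; the callback still owns the object
    for (auto& cb : pendingCallbacks)
        cb(); // the "completed read" runs without touching freed memory
}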
diff --git a/fdbrpc/include/fdbrpc/AsyncFileWriteChecker.h b/fdbrpc/include/fdbrpc/AsyncFileWriteChecker.h
index 4427e35796..b7251590f5 100644
--- a/fdbrpc/include/fdbrpc/AsyncFileWriteChecker.h
+++ b/fdbrpc/include/fdbrpc/AsyncFileWriteChecker.h
@@ -32,14 +32,18 @@ public:
	// For read() and write(), the data buffer must remain valid until the future is ready
	Future<int> read(void* data, int length, int64_t offset) override {
-		return map(m_f->read(data, length, offset), [=](int r) {
-			updateChecksumHistory(false, offset, r, (uint8_t*)data);
+		// Lambda must hold a reference to this to keep it alive until after the read
+		auto self = Reference<AsyncFileWriteChecker>::addRef(this);
+		return map(m_f->read(data, length, offset), [self, data, offset](int r) {
+			self->updateChecksumHistory(false, offset, r, (uint8_t*)data);
			return r;
		});
	}
 
	Future<Void> readZeroCopy(void** data, int* length, int64_t offset) override {
-		return map(m_f->readZeroCopy(data, length, offset), [=](Void r) {
-			updateChecksumHistory(false, offset, *length, (uint8_t*)data);
+		// Lambda must hold a reference to this to keep it alive until after the read
+		auto self = Reference<AsyncFileWriteChecker>::addRef(this);
+		return map(m_f->readZeroCopy(data, length, offset), [self, data, length, offset](Void r) {
+			self->updateChecksumHistory(false, offset, *length, (uint8_t*)data);
			return r;
		});
	}
@@ -50,12 +54,14 @@ public:
	}
 
	Future<Void> truncate(int64_t size) override {
-		return map(m_f->truncate(size), [=](Void r) {
+		// Lambda must hold a reference to this to keep it alive until after the truncate
+		auto self = Reference<AsyncFileWriteChecker>::addRef(this);
+		return map(m_f->truncate(size), [self, size](Void r) {
			// Truncate the page checksum history if it is in use
-			if ((size / checksumHistoryPageSize) < checksumHistory.size()) {
-				int oldCapacity = checksumHistory.capacity();
-				checksumHistory.resize(size / checksumHistoryPageSize);
-				checksumHistoryBudget.get() -= (checksumHistory.capacity() - oldCapacity);
+			if ((size / checksumHistoryPageSize) < self->checksumHistory.size()) {
+				int oldCapacity = self->checksumHistory.capacity();
+				self->checksumHistory.resize(size / checksumHistoryPageSize);
+				checksumHistoryBudget.get() -= (self->checksumHistory.capacity() - oldCapacity);
			}
			return r;
		});
diff --git a/fdbrpc/include/fdbrpc/FlowTransport.h b/fdbrpc/include/fdbrpc/FlowTransport.h
index af0a7a8b75..0f0c3a52e5 100644
--- a/fdbrpc/include/fdbrpc/FlowTransport.h
+++ b/fdbrpc/include/fdbrpc/FlowTransport.h
@@ -239,7 +239,7 @@ public:
	// Sets endpoint to be a new local endpoint which delivers messages to the given receiver
	void addEndpoint(Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID);
 
-	void addEndpoints(std::vector<std::pair<struct FlowReceiver*, TaskPriority>> const& streams);
+	void addEndpoints(std::vector<std::pair<class FlowReceiver*, TaskPriority>> const& streams);
 
	// The given local endpoint no longer delivers messages to the given receiver or uses resources
	void removeEndpoint(const Endpoint&, NetworkMessageReceiver*);
diff --git a/fdbrpc/include/fdbrpc/fdbrpc.h b/fdbrpc/include/fdbrpc/fdbrpc.h
index 63cf55c374..b0b4b39d4e 100644
--- a/fdbrpc/include/fdbrpc/fdbrpc.h
+++ b/fdbrpc/include/fdbrpc/fdbrpc.h
@@ -28,9 +28,14 @@
 #include "fdbrpc/FailureMonitor.h"
 #include "fdbrpc/networksender.actor.h"
 
-struct FlowReceiver : public
NetworkMessageReceiver { } } - bool isLocalEndpoint() { return m_isLocalEndpoint; } - bool isRemoteEndpoint() { return endpoint.isValid() && !m_isLocalEndpoint; } +public: + bool isLocalEndpoint() const { return m_isLocalEndpoint; } + bool isRemoteEndpoint() const { return endpoint.isValid() && !m_isLocalEndpoint; } + + void setRemoteEndpoint(Endpoint const& remoteEndpoint, bool stream) { + ASSERT(!m_isLocalEndpoint); + ASSERT(!endpoint.isValid()); + endpoint = remoteEndpoint; + m_stream = stream; + FlowTransport::transport().addPeerReference(endpoint, m_stream); + } // If already a remote endpoint, returns that. Otherwise makes this // a local endpoint and returns that. @@ -80,12 +94,6 @@ struct FlowReceiver : public NetworkMessageReceiver { } const Endpoint& getRawEndpoint() { return endpoint; } - -private: - Optional<PeerCompatibilityPolicy> peerCompatibilityPolicy_; - Endpoint endpoint; - bool m_isLocalEndpoint; - bool m_stream; }; template <class T> @@ -363,8 +371,9 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue<T>, this->sendError(message.getError()); } else { if (message.get().asUnderlyingType().acknowledgeToken.present()) { - acknowledgements = AcknowledgementReceiver( - FlowTransport::transport().loadedEndpoint(message.get().asUnderlyingType().acknowledgeToken.get())); + acknowledgements.setRemoteEndpoint( + FlowTransport::transport().loadedEndpoint(message.get().asUnderlyingType().acknowledgeToken.get()), + false); if (onConnect.isValid() && onConnect.canBeSet()) { onConnect.send(Void()); } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 12e15ce92a..a73674b10e 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -1240,6 +1240,7 @@ public: PromiseTask* task = self->taskQueue.getReadyTask(); self->taskQueue.popReadyTask(); self->execTask(*task); + delete task; self->yielded = false; } } @@ -2261,7 +2262,7 @@ public: } // Implementation - struct PromiseTask final { + struct PromiseTask final : public FastAllocated<PromiseTask> { Promise<Void> promise; ProcessInfo* machine; explicit PromiseTask(ProcessInfo* machine) : machine(machine) {} diff --git a/fdbserver/BlobManager.actor.cpp b/fdbserver/BlobManager.actor.cpp index ad6051b602..a571b52842 100644 --- a/fdbserver/BlobManager.actor.cpp +++ b/fdbserver/BlobManager.actor.cpp @@ -3537,7 +3537,7 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) { } // skip the rest of the algorithm for the first blob manager - if (bmData->epoch == 1) { + if (bmData->epoch == 1 && !isFullRestoreMode()) { bmData->doneRecovering.send(Void()); return Void(); } diff --git a/fdbserver/BlobManifest.actor.cpp b/fdbserver/BlobManifest.actor.cpp index 7e64130234..e85d774a67 100644 --- a/fdbserver/BlobManifest.actor.cpp +++ b/fdbserver/BlobManifest.actor.cpp @@ -26,6 +26,7 @@ #include "fdbclient/BlobGranuleCommon.h" #include "fdbserver/Knobs.h" #include "flow/FastRef.h" +#include "flow/Trace.h" #include "flow/flow.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/BlobConnectionProvider.h" @@ -189,23 +190,6 @@ private: static const int sMaxCount_{ 5 }; // max number of manifest file to keep }; -// Defines granule info that interests full restore -struct BlobGranuleVersion { - // Two constructors required by VectorRef - BlobGranuleVersion() {} - BlobGranuleVersion(Arena& a, const BlobGranuleVersion& copyFrom) - : granuleID(copyFrom.granuleID), keyRange(a, copyFrom.keyRange), version(copyFrom.version), - sizeInBytes(copyFrom.sizeInBytes) {} - - UID granuleID; - KeyRangeRef 
keyRange; - Version version; - int64_t sizeInBytes; -}; - -// Defines a vector for BlobGranuleVersion -typedef Standalone<VectorRef<BlobGranuleVersion>> BlobGranuleVersionVector; - // Defines filename, version, size for each granule file that interests full restore struct GranuleFileVersion { Version version; @@ -226,16 +210,53 @@ public: Value data = wait(readFromFile(self)); Standalone<BlobManifest> manifest = decode(data); wait(writeSystemKeys(self, manifest.rows)); - BlobGranuleVersionVector _ = wait(listGranules(self)); + BlobGranuleRestoreVersionVector _ = wait(listGranules(self)); } catch (Error& e) { dprint("WARNING: unexpected manifest loader error {}\n", e.what()); // skip error handling so far } return Void(); } + // Iterate active granules and return their version/sizes + ACTOR static Future<BlobGranuleRestoreVersionVector> listGranules(Reference<BlobManifestLoader> self) { + state Transaction tr(self->db_); + loop { + state BlobGranuleRestoreVersionVector results; + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + + try { + std::vector<KeyRangeRef> granules; + state int i = 0; + auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED; + state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit)); + for (i = 0; i < blobRanges.size() - 1; i++) { + Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin); + Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin); + state KeyRange granuleRange = KeyRangeRef(startKey, endKey); + try { + Standalone<BlobGranuleRestoreVersion> granule = wait(getGranule(&tr, granuleRange)); + results.push_back_deep(results.arena(), granule); + } catch (Error& e) { + if (e.code() == error_code_restore_missing_data) { + dprint("missing data for key range {} \n", granuleRange.toString()); + TraceEvent("BlobRestoreMissingData").detail("KeyRange", granuleRange.toString()); + } else { + throw; + } + } + } + return results; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + // Print out a summary for blob granules ACTOR static Future<Void> print(Reference<BlobManifestLoader> self) { - state BlobGranuleVersionVector granules = wait(listGranules(self)); + state BlobGranuleRestoreVersionVector granules = wait(listGranules(self)); for (auto granule : granules) { wait(checkGranuleFiles(self, granule)); } @@ -285,41 +306,9 @@ private: } } - // Iterate active granules and return their version/sizes - ACTOR static Future<BlobGranuleVersionVector> listGranules(Reference<BlobManifestLoader> self) { - state Transaction tr(self->db_); - loop { - state BlobGranuleVersionVector results; - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - tr.setOption(FDBTransactionOptions::LOCK_AWARE); - - try { - std::vector<KeyRangeRef> granules; - state int i = 0; - auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED; - state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit)); - for (i = 0; i < blobRanges.size() - 1; i++) { - Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin); - Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin); - state KeyRange granuleRange = KeyRangeRef(startKey, endKey); - try { - Standalone<BlobGranuleVersion> granule = wait(getGranule(&tr, granuleRange)); - results.push_back_deep(results.arena(), granule); - } catch (Error& e) { - 
dprint("missing data for key range {} \n", granuleRange.toString()); - } - } - return results; - } catch (Error& e) { - wait(tr.onError(e)); - } - } - } - // Find the newest granule for a key range. The newest granule has the max version and relevant files - ACTOR static Future<Standalone<BlobGranuleVersion>> getGranule(Transaction* tr, KeyRangeRef range) { - state Standalone<BlobGranuleVersion> granuleVersion; + ACTOR static Future<Standalone<BlobGranuleRestoreVersion>> getGranule(Transaction* tr, KeyRangeRef range) { + state Standalone<BlobGranuleRestoreVersion> granuleVersion; KeyRange historyKeyRange = blobGranuleHistoryKeyRangeFor(range); // reverse lookup so that the first row is the newest version state RangeResult results = @@ -389,7 +378,7 @@ private: } // Read data from granules and print out summary - ACTOR static Future<Void> checkGranuleFiles(Reference<BlobManifestLoader> self, BlobGranuleVersion granule) { + ACTOR static Future<Void> checkGranuleFiles(Reference<BlobManifestLoader> self, BlobGranuleRestoreVersion granule) { state KeyRangeRef range = granule.keyRange; state Version readVersion = granule.version; state Transaction tr(self->db_); @@ -441,3 +430,11 @@ ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProv wait(BlobManifestLoader::print(loader)); return Void(); } + +// API to list blob granules +ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db, + Reference<BlobConnectionProvider> blobConn) { + Reference<BlobManifestLoader> loader = makeReference<BlobManifestLoader>(db, blobConn); + BlobGranuleRestoreVersionVector result = wait(BlobManifestLoader::listGranules(loader)); + return result; +} diff --git a/fdbserver/BlobMigrator.actor.cpp b/fdbserver/BlobMigrator.actor.cpp index 2044b093d4..9be19fa6a4 100644 --- a/fdbserver/BlobMigrator.actor.cpp +++ b/fdbserver/BlobMigrator.actor.cpp @@ -30,54 +30,312 @@ #include "fdbclient/KeyRangeMap.h" #include "fdbclient/SystemData.h" #include "fdbclient/NativeAPI.actor.h" +#include "fdbclient/ManagementAPI.actor.h" #include "fdbserver/ServerDBInfo.actor.h" #include "fdbserver/WaitFailure.h" - +#include "fdbserver/MoveKeys.actor.h" +#include "fdbserver/BlobGranuleServerCommon.actor.h" #include "flow/actorcompiler.h" // has to be last include +#include "flow/network.h" +#include <algorithm> +#include <string> + +#define ENABLE_DEBUG_MG true + +template <typename... T> +static inline void dprint(fmt::format_string<T...> fmt, T&&... args) { + if (ENABLE_DEBUG_MG) + fmt::print(fmt, std::forward<T>(args)...); +} // BlobMigrator manages data migration from blob storage to storage server. It implements a minimal set of // StorageServerInterface APIs which are needed for DataDistributor to start data migration. 
 
 // BlobMigrator manages data migration from blob storage to storage server. It implements a minimal set of
 // StorageServerInterface APIs which are needed for DataDistributor to start data migration.
 class BlobMigrator : public NonCopyable, public ReferenceCounted<BlobMigrator> {
 public:
	BlobMigrator(Reference<AsyncVar<ServerDBInfo> const> dbInfo, BlobMigratorInterface interf)
-	  : blobMigratorInterf(interf), actors(false) {
-		if (!blobConn.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
-			blobConn = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
+	  : interf_(interf), actors_(false) {
+		if (!blobConn_.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
+			blobConn_ = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
		}
-		db = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
+		db_ = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
	}
	~BlobMigrator() {}
 
+	// Start migration
	ACTOR static Future<Void> start(Reference<BlobMigrator> self) {
-		self->actors.add(waitFailureServer(self->blobMigratorInterf.waitFailure.getFuture()));
+		if (!isFullRestoreMode()) {
+			return Void();
+		}
+		wait(delay(10)); // TODO need to wait for a signal for readiness of blob manager
+
+		BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_));
+		self->blobGranules_ = granules;
+
+		wait(prepare(self, normalKeys));
+
+		wait(serverLoop(self));
+		return Void();
+	}
+
+private:
+	// Prepare for data migration for given key range.
+	ACTOR static Future<Void> prepare(Reference<BlobMigrator> self, KeyRangeRef keys) {
+		// Register as a storage server, so that DataDistributor could start data movement afterwards
+		std::pair<Version, Tag> verAndTag = wait(addStorageServer(self->db_, self->interf_.ssi));
+		dprint("Started storage server interface {} {}\n", verAndTag.first, verAndTag.second.toString());
+
+		// Reassign key ranges to the storage server. This restarts DataDistributor so that internal data structures
+		// like ShardTracker and ShardsAffectedByTeamFailure are re-initialized. Ideally this would be done within
+		// DataDistributor itself, making the restart unnecessary.
+		state int oldMode = wait(setDDMode(self->db_, 0));
+		wait(unassignServerKeys(self, keys));
+		wait(assignKeysToServer(self, keys, self->interf_.ssi.id()));
+		wait(success(setDDMode(self->db_, oldMode)));
+		return Void();
+	}
+
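assignKeysToServer() and unassignServerKeys() below rewrite shard ownership through krmSetRange(), which operates on FDB's boundary-encoded range maps: a value is stored at each range's start key and applies until the next boundary key. The following standalone sketch shows those semantics over a std::map; it illustrates the encoding only and is not the real krm implementation:

// Standalone sketch of boundary-map semantics like krmSetRange(): setting
// [begin, end) to v writes v at begin, restores the prior value at end,
// and clears interior boundaries. Illustrative only.
#include <cassert>
#include <map>
#include <string>

using RangeMap = std::map<std::string, std::string>; // boundary key -> value

void setRange(RangeMap& m, const std::string& begin, const std::string& end, const std::string& value) {
    // Value in effect at `end` before this mutation (nearest boundary at or before it).
    std::string atEnd = "";
    auto it = m.upper_bound(end);
    if (it != m.begin())
        atEnd = std::prev(it)->second;
    m.erase(m.lower_bound(begin), m.lower_bound(end)); // drop interior boundaries
    m[begin] = value;
    m[end] = atEnd; // ranges at or after `end` keep their previous value
}

int main() {
    RangeMap serverKeys; // e.g. the range map under serverKeysPrefixFor(id)
    setRange(serverKeys, "a", "z", "true");
    setRange(serverKeys, "m", "q", "false");
    assert(serverKeys.at("a") == "true");  // [a,m) still assigned
    assert(serverKeys.at("m") == "false"); // [m,q) unassigned
    assert(serverKeys.at("q") == "true");  // [q,z) restored
}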
+	// Assign given key range to the specified storage server.
+	ACTOR static Future<Void> assignKeysToServer(Reference<BlobMigrator> self, KeyRangeRef keys, UID serverUID) {
+		state Transaction tr(self->db_);
		loop {
-			choose {
-				when(HaltBlobMigratorRequest req = waitNext(self->blobMigratorInterf.haltBlobMigrator.getFuture())) {
-					req.reply.send(Void());
-					TraceEvent("BlobMigratorHalted", self->blobMigratorInterf.id()).detail("ReqID", req.requesterID);
-					break;
+			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
+			try {
+				state Value value = keyServersValue(std::vector<UID>({ serverUID }), std::vector<UID>(), UID(), UID());
+				wait(krmSetRange(&tr, keyServersPrefix, keys, value));
+				wait(krmSetRange(&tr, serverKeysPrefixFor(serverUID), keys, serverKeysTrue));
+				wait(tr.commit());
+				dprint("Assign {} to server {}\n", normalKeys.toString(), serverUID.toString());
+				return Void();
+			} catch (Error& e) {
+				wait(tr.onError(e));
+			}
+		}
+	}
+
+	// Unassign given key range from its current storage servers
+	ACTOR static Future<Void> unassignServerKeys(Reference<BlobMigrator> self, KeyRangeRef keys) {
+		state Transaction tr(self->db_);
+		loop {
+			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
+			try {
+				state RangeResult serverList = wait(tr.getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY));
+				ASSERT(!serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY);
+				for (auto& server : serverList) {
+					state UID id = decodeServerListValue(server.value).id();
+					RangeResult ranges = wait(krmGetRanges(&tr, serverKeysPrefixFor(id), keys));
+					bool owning = false;
+					for (auto& r : ranges) {
+						if (r.value == serverKeysTrue) {
+							owning = true;
+							break;
+						}
+					}
+					if (owning) {
+						dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString());
+						wait(krmSetRange(&tr, serverKeysPrefixFor(id), keys, serverKeysFalse));
+					}
				}
-				when(wait(self->actors.getResult())) {}
+				wait(tr.commit());
+				return Void();
+			} catch (Error& e) {
+				wait(tr.onError(e));
+			}
+		}
+	}
+
+	// Main server loop
+	ACTOR static Future<Void> serverLoop(Reference<BlobMigrator> self) {
+		self->actors_.add(waitFailureServer(self->interf_.ssi.waitFailure.getFuture()));
+		self->actors_.add(handleRequest(self));
+		self->actors_.add(handleUnsupportedRequest(self));
+		loop {
+			try {
+				choose {
+					when(HaltBlobMigratorRequest req = waitNext(self->interf_.haltBlobMigrator.getFuture())) {
+						req.reply.send(Void());
+						TraceEvent("BlobMigratorHalted", self->interf_.id()).detail("ReqID", req.requesterID);
+						break;
+					}
+					when(wait(self->actors_.getResult())) {}
+				}
+			} catch (Error& e) {
+				dprint("Unexpected serverLoop error {}\n", e.what());
+				throw;
			}
		}
		return Void();
	}
 
+	// Handle StorageServerInterface APIs
+	ACTOR static Future<Void> handleRequest(Reference<BlobMigrator> self) {
+		state StorageServerInterface ssi = self->interf_.ssi;
+		loop {
+			try {
+				choose {
+					when(GetShardStateRequest req = waitNext(ssi.getShardState.getFuture())) {
+						dprint("Handle GetShardStateRequest\n");
+						Version version = maxVersion(self);
+						GetShardStateReply rep(version, version);
+						req.reply.send(rep); // return empty shards
+					}
+					when(WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) {
+						// dprint("Handle WaitMetricsRequest\n");
+						self->actors_.add(processWaitMetricsRequest(self, req));
+					}
+					when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) {
+						dprint("Handle SplitMetrics {}\n", req.keys.toString());
+						SplitMetricsReply rep;
+						for (auto granule : self->blobGranules_) {
+							// TODO: Use granule boundary as split point. A better approach is to split by size
+							if (granule.keyRange.begin > req.keys.begin && granule.keyRange.end < req.keys.end)
+								rep.splits.push_back_deep(rep.splits.arena(), granule.keyRange.begin);
+						}
+						req.reply.send(rep);
+					}
+					when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
+						dprint("Handle GetStorageMetrics\n");
+						StorageMetrics metrics;
+						metrics.bytes = sizeInBytes(self);
+						GetStorageMetricsReply resp;
+						resp.load = metrics;
+						req.reply.send(resp);
+					}
+					when(ReplyPromise<KeyValueStoreType> reply = waitNext(ssi.getKeyValueStoreType.getFuture())) {
+						dprint("Handle KeyValueStoreType\n");
+						reply.send(KeyValueStoreType::MEMORY);
+					}
+				}
+			} catch (Error& e) {
+				dprint("Unexpected blob migrator request error {}\n", e.what());
+				throw;
+			}
+		}
+	}
+
+	// Handle StorageServerInterface APIs that are not supported. Simply log and return an error.
+	ACTOR static Future<Void> handleUnsupportedRequest(Reference<BlobMigrator> self) {
+		state StorageServerInterface ssi = self->interf_.ssi;
+		loop {
+			try {
+				choose {
+					when(SplitRangeRequest req = waitNext(ssi.getRangeSplitPoints.getFuture())) {
+						dprint("Unsupported SplitRangeRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(StorageQueuingMetricsRequest req = waitNext(ssi.getQueuingMetrics.getFuture())) {
+						self->actors_.add(processStorageQueuingMetricsRequest(req));
+					}
+					when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) {
+						dprint("Unsupported ReadHotSubRange\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetKeyValuesStreamRequest req = waitNext(ssi.getKeyValuesStream.getFuture())) {
+						dprint("Unsupported GetKeyValuesStreamRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetKeyRequest req = waitNext(ssi.getKey.getFuture())) {
+						dprint("Unsupported GetKeyRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetKeyValuesRequest req = waitNext(ssi.getKeyValues.getFuture())) {
+						/* dprint("Unsupported GetKeyValuesRequest {} - {} @ {}\n",
+						   req.begin.getKey().printable(),
+						   req.end.getKey().printable(),
+						   req.version); */
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetValueRequest req = waitNext(ssi.getValue.getFuture())) {
+						dprint("Unsupported GetValueRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetCheckpointRequest req = waitNext(ssi.checkpoint.getFuture())) {
+						dprint("Unsupported GetCheckpoint\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(FetchCheckpointRequest req = waitNext(ssi.fetchCheckpoint.getFuture())) {
+						dprint("Unsupported FetchCheckpointRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(UpdateCommitCostRequest req = waitNext(ssi.updateCommitCostRequest.getFuture())) {
+						dprint("Unsupported UpdateCommitCostRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(FetchCheckpointKeyValuesRequest req = waitNext(ssi.fetchCheckpointKeyValues.getFuture())) {
+						dprint("Unsupported FetchCheckpointKeyValuesRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+				}
+			} catch (Error& e) {
+				dprint("Unexpected request handling error {}\n", e.what());
+				throw;
+			}
+		}
+	}
+
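handleUnsupportedRequest() above makes a deliberate protocol choice: every StorageServerInterface request the migrator cannot serve gets an explicit unsupported_operation() error instead of being silently dropped, so callers fail fast rather than hang on a reply that never comes. A minimal standalone sketch of that reject-what-you-don't-implement dispatch (all names illustrative):

// Standalone sketch: unimplemented operations get an explicit error reply.
#include <functional>
#include <iostream>
#include <map>
#include <string>

struct Reply {
    std::function<void(std::string)> send;
    std::function<void(std::string)> sendError;
};

void serve(const std::string& op, const Reply& reply) {
    static const std::map<std::string, bool> supported = {
        { "getShardState", true }, { "waitMetrics", true },
        { "getValue", false },     { "getKeyValues", false },
    };
    auto it = supported.find(op);
    if (it == supported.end() || !it->second)
        reply.sendError("unsupported_operation"); // mirrors req.reply.sendError(...)
    else
        reply.send("ok");
}

int main() {
    Reply r{ [](std::string s) { std::cout << "reply: " << s << "\n"; },
             [](std::string e) { std::cout << "error: " << e << "\n"; } };
    serve("waitMetrics", r); // reply: ok
    serve("getValue", r);    // error: unsupported_operation
}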
+	ACTOR static Future<Void> processWaitMetricsRequest(Reference<BlobMigrator> self, WaitMetricsRequest req) {
+		state WaitMetricsRequest waitMetricsRequest = req;
+		// FIXME: get rid of this delay. It's a temporary solution to avoid starvation in the scheduling of DD
+		// processes
+		wait(delay(1));
+		StorageMetrics metrics;
+		metrics.bytes = sizeInBytes(self, waitMetricsRequest.keys);
+		waitMetricsRequest.reply.send(metrics);
+		return Void();
+	}
+
+	ACTOR static Future<Void> processStorageQueuingMetricsRequest(StorageQueuingMetricsRequest req) {
+		dprint("Unsupported StorageQueuingMetricsRequest\n");
+		// FIXME: get rid of this delay. It's a temporary solution to avoid starvation in the scheduling of DD
+		// processes
+		wait(delay(1));
+		req.reply.sendError(unsupported_operation());
+		return Void();
+	}
+
+	// Return total storage size in bytes for migration
+	static int64_t sizeInBytes(Reference<BlobMigrator> self) { return sizeInBytes(self, normalKeys); }
+
+	// Return storage size in bytes for given key range
+	static int64_t sizeInBytes(Reference<BlobMigrator> self, KeyRangeRef range) {
+		int64_t bytes = 0;
+		for (auto granule : self->blobGranules_) {
+			if (range.intersects(granule.keyRange))
+				bytes += granule.sizeInBytes;
+		}
+		return bytes;
+	}
+
+	// Return max version for all blob granules
+	static Version maxVersion(Reference<BlobMigrator> self) {
+		Version max = 0;
+		for (auto granule : self->blobGranules_) {
+			max = std::max(granule.version, max);
+		}
+		return max;
+	}
+
 private:
-	Database db;
-	Reference<BlobConnectionProvider> blobConn;
-	BlobMigratorInterface blobMigratorInterf;
-	ActorCollection actors;
+	Database db_;
+	Reference<BlobConnectionProvider> blobConn_;
+	BlobGranuleRestoreVersionVector blobGranules_;
+	BlobMigratorInterface interf_;
+	ActorCollection actors_;
 };
 
 // Main entry point
-ACTOR Future<Void> blobMigrator(BlobMigratorInterface ssi, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
-	fmt::print("Start blob migrator {} \n", ssi.id().toString());
+ACTOR Future<Void> blobMigrator(BlobMigratorInterface interf, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
+	fmt::print("Start blob migrator {} \n", interf.id().toString());
	try {
-		Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, ssi);
+		Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, interf);
		wait(BlobMigrator::start(self));
	} catch (Error& e) {
-		fmt::print("unexpected blob migrator error {}\n", e.what());
+		dprint("Unexpected blob migrator error {}\n", e.what());
+		TraceEvent("BlobMigratorError", interf.id()).error(e);
	}
	return Void();
 }
diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp
index ed2dbcf5da..5717472890 100644
--- a/fdbserver/BlobWorker.actor.cpp
+++ b/fdbserver/BlobWorker.actor.cpp
@@ -3960,7 +3960,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
		}
	}
 
-	if (createChangeFeed) {
+	if (createChangeFeed && !isFullRestoreMode()) {
		// create new change feed for new version of granule
		wait(updateChangeFeed(
		    &tr, granuleIDToCFKey(info.granuleID), ChangeFeedStatus::CHANGE_FEED_CREATE, req.keyRange));
diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp
index c962ca891d..e59eddd8cf 100644
--- a/fdbserver/ClusterController.actor.cpp
+++ b/fdbserver/ClusterController.actor.cpp
@@ -2615,8 +2615,9 @@ ACTOR Future<Void> monitorBlobMigrator(ClusterControllerData* self) {
	}
	loop {
		if (self->db.serverInfo->get().blobMigrator.present() && !self->recruitBlobMigrator.get()) {
-			state Future<Void> wfClient = waitFailureClient(self->db.serverInfo->get().blobMigrator.get().waitFailure,
-
SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME); + state Future<Void> wfClient = + waitFailureClient(self->db.serverInfo->get().blobMigrator.get().ssi.waitFailure, + SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME); loop { choose { when(wait(wfClient)) { diff --git a/fdbserver/DDRelocationQueue.actor.cpp b/fdbserver/DDRelocationQueue.actor.cpp index 46bb435145..98fa27689d 100644 --- a/fdbserver/DDRelocationQueue.actor.cpp +++ b/fdbserver/DDRelocationQueue.actor.cpp @@ -687,6 +687,20 @@ struct DDQueue : public IDDRelocationQueue { Reference<EventCacheHolder> movedKeyServersEventHolder; + int moveReusePhysicalShard; + int moveCreateNewPhysicalShard; + enum RetryFindDstReason { + None = 0, + RemoteBestTeamNotReady, + PrimaryNoHealthyTeam, + RemoteNoHealthyTeam, + RemoteTeamIsFull, + RemoteTeamIsNotHealthy, + NoAvailablePhysicalShard, + NumberOfTypes, + }; + std::vector<int> retryFindDstReasonCount; + void startRelocation(int priority, int healthPriority) { // Although PRIORITY_TEAM_REDUNDANT has lower priority than split and merge shard movement, // we must count it into unhealthyRelocations; because team removers relies on unhealthyRelocations to @@ -750,7 +764,9 @@ struct DDQueue : public IDDRelocationQueue { output(output), input(input), getShardMetrics(getShardMetrics), getTopKMetrics(getTopKMetrics), lastInterval(0), suppressIntervals(0), rawProcessingUnhealthy(new AsyncVar<bool>(false)), rawProcessingWiggle(new AsyncVar<bool>(false)), unhealthyRelocations(0), - movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")) {} + movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")), moveReusePhysicalShard(0), + moveCreateNewPhysicalShard(0), retryFindDstReasonCount(static_cast<int>(RetryFindDstReason::NumberOfTypes), 0) { + } DDQueue() = default; void validate() { @@ -1463,6 +1479,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self, loop { destOverloadedCount = 0; stuckCount = 0; + state DDQueue::RetryFindDstReason retryFindDstReason = DDQueue::RetryFindDstReason::None; // state int bestTeamStuckThreshold = 50; loop { state int tciIndex = 0; @@ -1489,10 +1506,13 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self, .detail("TeamCollectionIndex", tciIndex) .detail("RestoreDataMoveForDest", describe(tciIndex == 0 ? rd.dataMove->primaryDest : rd.dataMove->remoteDest)); + retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady; foundTeams = false; break; } if (!bestTeam.first.present() || !bestTeam.first.get()->isHealthy()) { + retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam + : DDQueue::RetryFindDstReason::RemoteNoHealthyTeam; foundTeams = false; break; } @@ -1545,12 +1565,15 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self, // getting the destination team or we could miss failure notifications for the storage // servers in the destination team TraceEvent("BestTeamNotReady"); + retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady; foundTeams = false; break; } // If a DC has no healthy team, we stop checking the other DCs until // the unhealthy DC is healthy again or is excluded. if (!bestTeam.first.present()) { + retryFindDstReason = tciIndex == 0 ? 
DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam + : DDQueue::RetryFindDstReason::RemoteNoHealthyTeam; foundTeams = false; break; } @@ -1574,6 +1597,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self, if (tciIndex == 1 && !forceToUseNewPhysicalShard) { bool minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true); if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) { + retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsFull; foundTeams = false; break; } @@ -1616,6 +1640,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self, if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD && bestTeams.size() > 1 && !forceToUseNewPhysicalShard) { if (!bestTeams[1].first->isHealthy()) { + retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy; foundTeams = false; } } @@ -1676,6 +1701,19 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self, // when !rd.isRestore(), dataMoveId is just decided as physicalShardIDCandidate // thus, update the physicalShardIDCandidate to related data structures ASSERT(physicalShardIDCandidate != UID().first()); + if (self->physicalShardCollection->physicalShardExists(physicalShardIDCandidate)) { + self->moveReusePhysicalShard++; + } else { + self->moveCreateNewPhysicalShard++; + if (retryFindDstReason == DDQueue::RetryFindDstReason::None) { + // When creating a new physical shard, but the reason is none, this can only happen when + // determinePhysicalShardIDGivenPrimaryTeam() finds that there is no available physical + // shard. + self->retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]++; + } else { + self->retryFindDstReasonCount[retryFindDstReason]++; + } + } rd.dataMoveId = newShardId(physicalShardIDCandidate, AssignEmptyRange::False); auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin); inFlightRange.value().dataMoveId = rd.dataMoveId; @@ -2472,6 +2510,30 @@ ACTOR Future<Void> dataDistributionQueue(Reference<IDDTxnProcessor> db, .trackLatest("MovingData"); // This trace event's trackLatest lifetime is controlled by // DataDistributor::movingDataEventHolder. The track latest // key we use here must match the key used in the holder. 
+ + if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) { + TraceEvent("PhysicalShardMoveStats") + .detail("MoveCreateNewPhysicalShard", self.moveCreateNewPhysicalShard) + .detail("MoveReusePhysicalShard", self.moveReusePhysicalShard) + .detail("RemoteBestTeamNotReady", + self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteBestTeamNotReady]) + .detail("PrimaryNoHealthyTeam", + self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam]) + .detail("RemoteNoHealthyTeam", + self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteNoHealthyTeam]) + .detail("RemoteTeamIsFull", + self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsFull]) + .detail("RemoteTeamIsNotHealthy", + self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy]) + .detail( + "NoAvailablePhysicalShard", + self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]); + self.moveCreateNewPhysicalShard = 0; + self.moveReusePhysicalShard = 0; + for (int i = 0; i < self.retryFindDstReasonCount.size(); ++i) { + self.retryFindDstReasonCount[i] = 0; + } + } } when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator when(wait(waitForAll(ddQueueFutures))) {} diff --git a/fdbserver/DDShardTracker.actor.cpp b/fdbserver/DDShardTracker.actor.cpp index 5c17bc1ab5..be7343ba4c 100644 --- a/fdbserver/DDShardTracker.actor.cpp +++ b/fdbserver/DDShardTracker.actor.cpp @@ -2081,6 +2081,10 @@ void PhysicalShardCollection::logPhysicalShardCollection() { } } +bool PhysicalShardCollection::physicalShardExists(uint64_t physicalShardID) { + return physicalShardInstances.find(physicalShardID) != physicalShardInstances.end(); +} + // FIXME: complete this test with non-empty range TEST_CASE("/DataDistributor/Tracker/FetchTopK") { state DataDistributionTracker self; diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index d295df1ee5..2caaf15832 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -286,8 +286,6 @@ public: PromiseStream<RelocateShard> relocationProducer, relocationConsumer; Reference<PhysicalShardCollection> physicalShardCollection; - StorageQuotaInfo storageQuotaInfo; - Promise<Void> initialized; std::unordered_map<AuditType, std::vector<std::shared_ptr<DDAudit>>> audits; @@ -542,27 +540,6 @@ public: } }; -ACTOR Future<Void> storageQuotaTracker(Database cx, StorageQuotaInfo* storageQuotaInfo) { - loop { - state Transaction tr(cx); - loop { - try { - state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY)); - TraceEvent("StorageQuota_ReadCurrentQuotas").detail("Size", currentQuotas.size()); - for (auto const kv : currentQuotas) { - Key const key = kv.key.removePrefix(storageQuotaPrefix); - uint64_t const quota = BinaryReader::fromStringRef<uint64_t>(kv.value, Unversioned()); - storageQuotaInfo->quotaMap[key] = quota; - } - wait(delay(5.0)); - break; - } catch (Error& e) { - wait(tr.onError(e)); - } - } - } -} - // Periodically check and log the physicalShard status; clean up empty physicalShard; ACTOR Future<Void> monitorPhysicalShardStatus(Reference<PhysicalShardCollection> self) { ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA); @@ -683,16 +660,19 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self, self->ddId, &normalDDQueueErrors())); - actors.push_back(reportErrorsExcept(storageQuotaTracker(cx, &self->storageQuotaInfo), - 
"StorageQuotaTracker", - self->ddId, - &normalDDQueueErrors())); - if (ddIsTenantAware) { actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorTenantMap(), "DDTenantCacheMonitor", self->ddId, &normalDDQueueErrors())); + actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageQuota(), + "StorageQuotaTracker", + self->ddId, + &normalDDQueueErrors())); + actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageUsage(), + "StorageUsageTracker", + self->ddId, + &normalDDQueueErrors())); } std::vector<DDTeamCollection*> teamCollectionsPtrs; diff --git a/fdbserver/DiskQueue.actor.cpp b/fdbserver/DiskQueue.actor.cpp index 40342945b6..c7f1ecdd08 100644 --- a/fdbserver/DiskQueue.actor.cpp +++ b/fdbserver/DiskQueue.actor.cpp @@ -429,7 +429,7 @@ public: waitfor.push_back(self->files[1].f->write(pageData.begin(), pageData.size(), self->writingPos)); self->writingPos += pageData.size(); - return waitForAll(waitfor); + return waitForAllReadyThenThrow(waitfor); } // Write the given data (pageData) to the queue files of self, sync data to disk, and delete the memory (pageMem) @@ -655,7 +655,7 @@ public: for (int i = 0; i < 2; i++) if (self->files[i].size > 0) reads.push_back(self->files[i].f->read(self->firstPages[i], sizeof(Page), 0)); - wait(waitForAll(reads)); + wait(waitForAllReadyThenThrow(reads)); // Determine which file comes first if (compare(self->firstPages[1], self->firstPages[0])) { @@ -743,7 +743,10 @@ public: } // Read nPages from pageOffset*sizeof(Page) offset in file self->files[file] - ACTOR static Future<Standalone<StringRef>> read(RawDiskQueue_TwoFiles* self, int file, int pageOffset, int nPages) { + ACTOR static UNCANCELLABLE Future<Standalone<StringRef>> read(RawDiskQueue_TwoFiles* self, + int file, + int pageOffset, + int nPages) { state TrackMe trackMe(self); state const size_t bytesRequested = nPages * sizeof(Page); state Standalone<StringRef> result = makeAlignedString(sizeof(Page), bytesRequested); diff --git a/fdbserver/EncryptKeyProxy.actor.cpp b/fdbserver/EncryptKeyProxy.actor.cpp index 9556c86fdc..095f3fb386 100644 --- a/fdbserver/EncryptKeyProxy.actor.cpp +++ b/fdbserver/EncryptKeyProxy.actor.cpp @@ -388,6 +388,15 @@ ACTOR Future<Void> getCipherKeysByBaseCipherKeyIds(Reference<EncryptKeyProxyData try { KmsConnLookupEKsByKeyIdsReq keysByIdsReq; for (const auto& item : lookupCipherInfoMap) { + // TODO: Currently getEncryptCipherKeys does not pass the domain name, once that is fixed we can remove + // the check on the empty domain name + if (!item.second.domainName.empty()) { + if (item.second.domainId == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) { + ASSERT(item.second.domainName == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME); + } else if (item.second.domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) { + ASSERT(item.second.domainName == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME); + } + } keysByIdsReq.encryptKeyInfos.emplace_back_deep( keysByIdsReq.arena, item.second.domainId, item.second.baseCipherId, item.second.domainName); } @@ -527,6 +536,11 @@ ACTOR Future<Void> getLatestCipherKeys(Reference<EncryptKeyProxyData> ekpProxyDa try { KmsConnLookupEKsByDomainIdsReq keysByDomainIdReq; for (const auto& item : lookupCipherDomains) { + if (item.second.domainId == FDB_DEFAULT_ENCRYPT_DOMAIN_ID) { + ASSERT(item.second.domainName == FDB_DEFAULT_ENCRYPT_DOMAIN_NAME); + } else if (item.second.domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) { + ASSERT(item.second.domainName == FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME); + } keysByDomainIdReq.encryptDomainInfos.emplace_back_deep( 
keysByDomainIdReq.arena, item.second.domainId, item.second.domainName); } diff --git a/fdbserver/KeyValueStoreCompressTestData.actor.cpp b/fdbserver/KeyValueStoreCompressTestData.actor.cpp index a5098baf4e..7aa99b21ba 100644 --- a/fdbserver/KeyValueStoreCompressTestData.actor.cpp +++ b/fdbserver/KeyValueStoreCompressTestData.actor.cpp @@ -53,7 +53,11 @@ struct KeyValueStoreCompressTestData final : IKeyValueStore { void set(KeyValueRef keyValue, const Arena* arena = nullptr) override { store->set(KeyValueRef(keyValue.key, pack(keyValue.value)), arena); } - void clear(KeyRangeRef range, const Arena* arena = nullptr) override { store->clear(range, arena); } + void clear(KeyRangeRef range, + const StorageServerMetrics* storageMetrics = nullptr, + const Arena* arena = nullptr) override { + store->clear(range, storageMetrics, arena); + } Future<Void> commit(bool sequential = false) override { return store->commit(sequential); } Future<Optional<Value>> readValue(KeyRef key, Optional<ReadOptions> options) override { diff --git a/fdbserver/KeyValueStoreMemory.actor.cpp b/fdbserver/KeyValueStoreMemory.actor.cpp index e055bab003..73478aaa41 100644 --- a/fdbserver/KeyValueStoreMemory.actor.cpp +++ b/fdbserver/KeyValueStoreMemory.actor.cpp @@ -130,7 +130,7 @@ public: } } - void clear(KeyRangeRef range, const Arena* arena) override { + void clear(KeyRangeRef range, const StorageServerMetrics* storageMetrics, const Arena* arena) override { // A commit that occurs with no available space returns Never, so we can throw out all modifications if (getAvailableSize() <= 0) return; diff --git a/fdbserver/KeyValueStoreRocksDB.actor.cpp b/fdbserver/KeyValueStoreRocksDB.actor.cpp index 911c074f06..19470b9877 100644 --- a/fdbserver/KeyValueStoreRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreRocksDB.actor.cpp @@ -1846,22 +1846,52 @@ struct RocksDBKeyValueStore : IKeyValueStore { void set(KeyValueRef kv, const Arena*) override { if (writeBatch == nullptr) { writeBatch.reset(new rocksdb::WriteBatch()); + keysSet.clear(); } ASSERT(defaultFdbCF != nullptr); writeBatch->Put(defaultFdbCF, toSlice(kv.key), toSlice(kv.value)); + if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE) { + keysSet.insert(kv.key); + } } - void clear(KeyRangeRef keyRange, const Arena*) override { + void clear(KeyRangeRef keyRange, const StorageServerMetrics* storageMetrics, const Arena*) override { if (writeBatch == nullptr) { writeBatch.reset(new rocksdb::WriteBatch()); + keysSet.clear(); } ASSERT(defaultFdbCF != nullptr); - if (keyRange.singleKeyRange()) { writeBatch->Delete(defaultFdbCF, toSlice(keyRange.begin)); } else { - writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end)); + if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE && storageMetrics != nullptr && + storageMetrics->byteSample.getEstimate(keyRange) < + SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT) { + rocksdb::ReadOptions options = sharedState->getReadOptions(); + auto beginSlice = toSlice(keyRange.begin); + auto endSlice = toSlice(keyRange.end); + options.iterate_lower_bound = &beginSlice; + options.iterate_upper_bound = &endSlice; + auto cursor = std::unique_ptr<rocksdb::Iterator>(db->NewIterator(options, defaultFdbCF)); + cursor->Seek(toSlice(keyRange.begin)); + while (cursor->Valid() && toStringRef(cursor->key()) < keyRange.end) { + writeBatch->Delete(defaultFdbCF, cursor->key()); + cursor->Next(); + } + if (!cursor->status().ok()) { + // if readrange iteration fails, then do a deleteRange. 
+ writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end)); + } else { + auto it = keysSet.lower_bound(keyRange.begin); + while (it != keysSet.end() && *it < keyRange.end) { + writeBatch->Delete(defaultFdbCF, toSlice(*it)); + it++; + } + } + } else { + writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end)); + } } } @@ -1890,6 +1920,7 @@ struct RocksDBKeyValueStore : IKeyValueStore { } auto a = new Writer::CommitAction(); a->batchToCommit = std::move(writeBatch); + keysSet.clear(); auto res = a->done.getFuture(); writeThread->post(a); return res; @@ -2083,6 +2114,7 @@ struct RocksDBKeyValueStore : IKeyValueStore { Promise<Void> closePromise; Future<Void> openFuture; std::unique_ptr<rocksdb::WriteBatch> writeBatch; + std::set<Key> keysSet; Optional<Future<Void>> metrics; FlowLock readSemaphore; int numReadWaiters; diff --git a/fdbserver/KeyValueStoreSQLite.actor.cpp b/fdbserver/KeyValueStoreSQLite.actor.cpp index 634beb190c..1c95c64e8a 100644 --- a/fdbserver/KeyValueStoreSQLite.actor.cpp +++ b/fdbserver/KeyValueStoreSQLite.actor.cpp @@ -1596,7 +1596,9 @@ public: StorageBytes getStorageBytes() const override; void set(KeyValueRef keyValue, const Arena* arena = nullptr) override; - void clear(KeyRangeRef range, const Arena* arena = nullptr) override; + void clear(KeyRangeRef range, + const StorageServerMetrics* storageMetrics = nullptr, + const Arena* arena = nullptr) override; Future<Void> commit(bool sequential = false) override; Future<Optional<Value>> readValue(KeyRef key, Optional<ReadOptions> optionss) override; @@ -2215,7 +2217,7 @@ void KeyValueStoreSQLite::set(KeyValueRef keyValue, const Arena* arena) { ++writesRequested; writeThread->post(new Writer::SetAction(keyValue)); } -void KeyValueStoreSQLite::clear(KeyRangeRef range, const Arena* arena) { +void KeyValueStoreSQLite::clear(KeyRangeRef range, const StorageServerMetrics* storageMetrics, const Arena* arena) { ++writesRequested; writeThread->post(new Writer::ClearAction(range)); } diff --git a/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp b/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp index 64a65dec48..9ac6984f90 100644 --- a/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp @@ -49,6 +49,7 @@ static_assert((ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR == 27) ? ROCKSDB_PATCH >= 3 : "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version"); const std::string rocksDataFolderSuffix = "-data"; +const std::string METADATA_SHARD_ID = "kvs-metadata"; const KeyRef shardMappingPrefix("\xff\xff/ShardMapping/"_sr); // TODO: move constants to a header file. const StringRef ROCKSDBSTORAGE_HISTOGRAM_GROUP = "RocksDBStorage"_sr; @@ -304,13 +305,12 @@ rocksdb::ReadOptions getReadOptions() { } struct ReadIterator { - rocksdb::ColumnFamilyHandle* cf; uint64_t index; // incrementing counter to uniquely identify read iterator. 
bool inUse; std::shared_ptr<rocksdb::Iterator> iter; double creationTime; ReadIterator(rocksdb::ColumnFamilyHandle* cf, uint64_t index, rocksdb::DB* db, rocksdb::ReadOptions& options) - : cf(cf), index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {} + : index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {} }; /* @@ -475,13 +475,26 @@ struct PhysicalShard { } ~PhysicalShard() { - if (!deletePending) - return; + logShardEvent(id, ShardOp::CLOSE); + isInitialized.store(false); + readIterPool.reset(); - // Destroy CF - auto s = db->DropColumnFamily(cf); + // Deleting default column family is not allowed. + if (id == "default") { + return; + } + + if (deletePending) { + auto s = db->DropColumnFamily(cf); + if (!s.ok()) { + logRocksDBError(s, "DestroyShard"); + logShardEvent(id, ShardOp::DESTROY, SevError, s.ToString()); + return; + } + } + auto s = db->DestroyColumnFamilyHandle(cf); if (!s.ok()) { - logRocksDBError(s, "DestroyShard"); + logRocksDBError(s, "DestroyCFHandle"); logShardEvent(id, ShardOp::DESTROY, SevError, s.ToString()); return; } @@ -628,7 +641,7 @@ public: std::vector<rocksdb::ColumnFamilyDescriptor> descriptors; bool foundMetadata = false; for (const auto& name : columnFamilies) { - if (name == "kvs-metadata") { + if (name == METADATA_SHARD_ID) { foundMetadata = true; } descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, cfOptions }); @@ -652,19 +665,19 @@ public: TraceEvent(SevInfo, "ShardedRocksInitLoadPhysicalShards", this->logId) .detail("PhysicalShardCount", handles.size()); + std::shared_ptr<PhysicalShard> metadataShard = nullptr; for (auto handle : handles) { - if (handle->GetName() == "kvs-metadata") { - metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata", handle); - } else { - physicalShards[handle->GetName()] = std::make_shared<PhysicalShard>(db, handle->GetName(), handle); + auto shard = std::make_shared<PhysicalShard>(db, handle->GetName(), handle); + if (shard->id == METADATA_SHARD_ID) { + metadataShard = shard; } + physicalShards[shard->id] = shard; columnFamilyMap[handle->GetID()] = handle; - TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId) - .detail("PhysicalShard", handle->GetName()); + TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId).detail("PhysicalShard", shard->id); } std::set<std::string> unusedShards(columnFamilies.begin(), columnFamilies.end()); - unusedShards.erase("kvs-metadata"); + unusedShards.erase(METADATA_SHARD_ID); unusedShards.erase("default"); KeyRange keyRange = prefixRange(shardMappingPrefix); @@ -746,9 +759,11 @@ public: defaultShard->dataShards[specialKeys.begin.toString()] = std::move(dataShard); physicalShards[defaultShard->id] = defaultShard; - metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata"); + // Create metadata shard. + auto metadataShard = std::make_shared<PhysicalShard>(db, METADATA_SHARD_ID); metadataShard->init(); columnFamilyMap[metadataShard->cf->GetID()] = metadataShard->cf; + physicalShards[METADATA_SHARD_ID] = metadataShard; // Write special key range metadata. 
writeBatch = std::make_unique<rocksdb::WriteBatch>(); @@ -763,7 +778,6 @@ public: TraceEvent(SevInfo, "ShardedRocksInitializeMetaDataShard", this->logId) .detail("MetadataShardCF", metadataShard->cf->GetID()); } - physicalShards["kvs-metadata"] = metadataShard; writeBatch = std::make_unique<rocksdb::WriteBatch>(); dirtyShards = std::make_unique<std::set<PhysicalShard*>>(); @@ -910,6 +924,9 @@ public: std::vector<std::shared_ptr<PhysicalShard>> getPendingDeletionShards(double cleanUpDelay) { std::vector<std::shared_ptr<PhysicalShard>> emptyShards; double currentTime = now(); + + TraceEvent(SevInfo, "ShardedRocksDB", logId) + .detail("PendingDeletionShardQueueSize", pendingDeletionShards.size()); while (!pendingDeletionShards.empty()) { const auto& id = pendingDeletionShards.front(); auto it = physicalShards.find(id); @@ -976,6 +993,10 @@ public: .detail("Info", "RangeToPersist") .detail("BeginKey", range.begin) .detail("EndKey", range.end); + auto it = physicalShards.find(METADATA_SHARD_ID); + ASSERT(it != physicalShards.end()); + auto metadataShard = it->second; + writeBatch->DeleteRange(metadataShard->cf, getShardMappingKey(range.begin, shardMappingPrefix), getShardMappingKey(range.end, shardMappingPrefix)); @@ -1043,24 +1064,30 @@ public: } void closeAllShards() { - for (auto& [_, shard] : physicalShards) { - shard->readIterPool.reset(); - } + columnFamilyMap.clear(); + physicalShards.clear(); // Close DB. auto s = db->Close(); if (!s.ok()) { logRocksDBError(s, "Close"); return; } + TraceEvent("ShardedRocksDB", this->logId).detail("Info", "DBClosed"); } void destroyAllShards() { - closeAllShards(); - std::vector<rocksdb::ColumnFamilyDescriptor> cfs; - for (const auto& [key, _] : physicalShards) { - cfs.push_back(rocksdb::ColumnFamilyDescriptor{ key, getCFOptions() }); + columnFamilyMap.clear(); + for (auto& [_, shard] : physicalShards) { + shard->deletePending = true; } - auto s = rocksdb::DestroyDB(path, getOptions(), cfs); + physicalShards.clear(); + // Close DB. + auto s = db->Close(); + if (!s.ok()) { + logRocksDBError(s, "Close"); + return; + } + s = rocksdb::DestroyDB(path, getOptions()); if (!s.ok()) { logRocksDBError(s, "DestroyDB"); } @@ -1121,7 +1148,6 @@ private: std::unique_ptr<rocksdb::WriteBatch> writeBatch; std::unique_ptr<std::set<PhysicalShard*>> dirtyShards; KeyRangeMap<DataShard*> dataShardMap; - std::shared_ptr<PhysicalShard> metadataShard = nullptr; std::deque<std::string> pendingDeletionShards; }; @@ -2240,6 +2266,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { // TODO: Adapt the simulation framework to not advance time quickly when background reads/writes are // occurring. 
if (g_network->isSimulated()) { + TraceEvent(SevDebug, "ShardedRocksDB").detail("Info", "Use Coro threads in simulation."); writeThread = CoroThreadPool::createThreadPool(); readThreads = CoroThreadPool::createThreadPool(); } else { @@ -2316,7 +2343,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { void set(KeyValueRef kv, const Arena*) override { shardManager.put(kv.key, kv.value); } - void clear(KeyRangeRef range, const Arena*) override { + void clear(KeyRangeRef range, const StorageServerMetrics*, const Arena*) override { if (range.singleKeyRange()) { shardManager.clear(range.begin); } else { diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 964045c153..1786e055b9 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -405,10 +405,6 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader( .detail("Offset", asset.offset) .detail("Length", asset.len); - // Ensure data blocks in the same file are processed in order - wait(processedFileOffset->whenAtLeast(asset.offset)); - ASSERT(processedFileOffset->get() == asset.offset); - state Arena tempArena; state StringRefReader reader(buf, restore_corrupted_data()); try { @@ -430,8 +426,9 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader( const uint8_t* message = reader.consume(msgSize); // Skip mutations out of the version range - if (!asset.isInVersionRange(msgVersion.version)) + if (!asset.isInVersionRange(msgVersion.version)) { continue; + } state VersionedMutationsMap::iterator it; bool inserted; @@ -452,6 +449,7 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader( // Skip mutation whose commitVersion < range kv's version if (logMutationTooOld(pRangeVersions, mutation, msgVersion.version)) { cc->oldLogMutations += 1; + wait(yield()); // avoid potential stack overflows continue; } @@ -459,6 +457,7 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader( if (mutation.param1 >= asset.range.end || (isRangeMutation(mutation) && mutation.param2 < asset.range.begin) || (!isRangeMutation(mutation) && mutation.param1 < asset.range.begin)) { + wait(yield()); // avoid potential stack overflows continue; } @@ -509,7 +508,6 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader( .detail("BlockLen", asset.len); throw; } - processedFileOffset->set(asset.offset + asset.len); return Void(); } @@ -526,8 +524,19 @@ ACTOR static Future<Void> parsePartitionedLogFileOnLoader( state int readFileRetries = 0; loop { try { + // Ensure data blocks in the same file are processed in order + wait(processedFileOffset->whenAtLeast(asset.offset)); + ASSERT(processedFileOffset->get() == asset.offset); + wait(_parsePartitionedLogFileOnLoader( pRangeVersions, processedFileOffset, kvOpsIter, samplesIter, cc, bc, asset, cx)); + processedFileOffset->set(asset.offset + asset.len); + + TraceEvent("FastRestoreLoaderDecodingLogFileDone") + .detail("BatchIndex", asset.batchIndex) + .detail("Filename", asset.filename) + .detail("Offset", asset.offset) + .detail("Length", asset.len); break; } catch (Error& e) { if (e.code() == error_code_restore_bad_read || e.code() == error_code_restore_unsupported_file_version || diff --git a/fdbserver/RocksDBCheckpointUtils.actor.cpp b/fdbserver/RocksDBCheckpointUtils.actor.cpp index 1edd322490..2d57953f2a 100644 --- a/fdbserver/RocksDBCheckpointUtils.actor.cpp +++ b/fdbserver/RocksDBCheckpointUtils.actor.cpp @@ -529,6 +529,7 @@ ACTOR Future<Void> fetchCheckpointFile(Database cx, state int64_t offset = 0; state
Reference<IAsyncFile> asyncFile; loop { + offset = 0; try { asyncFile = Reference<IAsyncFile>(); ++attempt; @@ -559,7 +560,8 @@ ACTOR Future<Void> fetchCheckpointFile(Database cx, offset += rep.data.size(); } } catch (Error& e) { - if (e.code() != error_code_end_of_stream) { + if (e.code() != error_code_end_of_stream || + (g_network->isSimulated() && attempt == 1 && deterministicRandom()->coinflip())) { TraceEvent("FetchCheckpointFileError") .errorUnsuppressed(e) .detail("RemoteFile", remoteFile) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 07bc48132a..783478b7b6 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -107,7 +107,8 @@ bool destructed = false; class TestConfig : public BasicTestConfig { class ConfigBuilder { using value_type = toml::basic_value<toml::discard_comments>; - using base_variant = std::variant<int, float, double, bool, std::string, std::vector<int>, ConfigDBType>; + using base_variant = std:: + variant<int, float, double, bool, std::string, std::vector<int>, std::vector<std::string>, ConfigDBType>; using types = variant_map<variant_concat<base_variant, variant_map<base_variant, Optional>>, std::add_pointer_t>; std::unordered_map<std::string_view, types> confMap; @@ -148,6 +149,17 @@ class TestConfig : public BasicTestConfig { (*this)(&res); *val = std::move(res); } + void operator()(std::vector<std::string>* val) const { + auto arr = value.as_array(); + for (const auto& i : arr) { + val->emplace_back(i.as_string()); + } + } + void operator()(Optional<std::vector<std::string>>* val) const { + std::vector<std::string> res; + (*this)(&res); + *val = std::move(res); + } }; struct trace_visitor { @@ -178,6 +190,26 @@ class TestConfig : public BasicTestConfig { (*this)(&(val->get())); } } + void operator()(std::vector<std::string> const* val) const { + if (val->empty()) { + evt.detail(key.c_str(), "[]"); + return; + } + std::stringstream value; + value << "[" << val->at(0); + for (int i = 1; i < val->size(); ++i) { + value << "," << val->at(i); + } + value << "]"; + evt.detail(key.c_str(), value.str()); + } + void operator()(Optional<std::vector<std::string>> const* val) const { + if (!val->present()) { + evt.detail(key.c_str(), *val); + } else { + (*this)(&(val->get())); + } + } void operator()(ConfigDBType const* val) const { evt.detail(key.c_str(), *val); } void operator()(Optional<ConfigDBType> const* val) const { Optional<std::string> optStr; @@ -312,12 +344,24 @@ class TestConfig : public BasicTestConfig { if (attrib == "blobGranulesEnabled") { blobGranulesEnabled = strcmp(value.c_str(), "true") == 0; } + if (attrib == "allowDefaultTenant") { + allowDefaultTenant = strcmp(value.c_str(), "true") == 0; + } + if (attrib == "allowCreatingTenants") { + allowCreatingTenants = strcmp(value.c_str(), "true") == 0; + } if (attrib == "injectSSTargetedRestart") { injectTargetedSSRestart = strcmp(value.c_str(), "true") == 0; } - - if (attrib == "injectSSDelay") { - injectSSDelay = strcmp(value.c_str(), "true") == 0; + if (attrib == "tenantModes") { + std::stringstream ss(value); + std::string token; + while (std::getline(ss, token, ',')) { + tenantModes.push_back(token); + } + } + if (attrib == "defaultTenant") { + defaultTenant = value; } } @@ -365,11 +409,14 @@ public: bool randomlyRenameZoneId = false; bool allowDefaultTenant = true; - bool allowDisablingTenants = true; bool allowCreatingTenants = true; bool injectTargetedSSRestart = false; - bool tenantModeRequired = false; bool 
injectSSDelay = false; + // By default, tenant mode is set randomly + // If provided, set using TenantMode::fromString + // Ensure no '_experimental` suffix in the mode name + std::vector<std::string> tenantModes; + Optional<std::string> defaultTenant; std::string testClass; // unused -- used in TestHarness float testPriority; // unused -- used in TestHarness @@ -432,12 +479,12 @@ public: .add("extraMachineCountDC", &extraMachineCountDC) .add("blobGranulesEnabled", &blobGranulesEnabled) .add("allowDefaultTenant", &allowDefaultTenant) - .add("allowDisablingTenants", &allowDisablingTenants) .add("allowCreatingTenants", &allowCreatingTenants) - .add("tenantModeRequired", &tenantModeRequired) .add("randomlyRenameZoneId", &randomlyRenameZoneId) .add("injectTargetedSSRestart", &injectTargetedSSRestart) - .add("injectSSDelay", &injectSSDelay); + .add("injectSSDelay", &injectSSDelay) + .add("tenantModes", &tenantModes) + .add("defaultTenant", &defaultTenant); try { auto file = toml::parse(testFile); if (file.contains("configuration") && toml::find(file, "configuration").is_table()) { @@ -1118,18 +1165,18 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor int* pTesterCount, Optional<ClusterConnectionString>* pConnString, Standalone<StringRef>* pStartingConfiguration, - TestConfig testConfig, + TestConfig* testConfig, std::string whitelistBinPaths, ProtocolVersion protocolVersion) { CSimpleIni ini; ini.SetUnicode(); ini.LoadFile(joinPath(baseFolder, "restartInfo.ini").c_str()); - auto configDBType = testConfig.getConfigDBType(); + auto configDBType = testConfig->getConfigDBType(); // Randomly change data center id names to test that localities // can be modified on cluster restart - bool renameZoneIds = testConfig.randomlyRenameZoneId ? deterministicRandom()->random01() < 0.1 : false; + bool renameZoneIds = testConfig->randomlyRenameZoneId ? 
deterministicRandom()->random01() < 0.1 : false; CODE_PROBE(renameZoneIds, "Zone ID names altered in restart test"); // allows multiple ipAddr entries @@ -1146,26 +1193,34 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor int desiredCoordinators = atoi(ini.GetValue("META", "desiredCoordinators")); int testerCount = atoi(ini.GetValue("META", "testerCount")); auto tssModeStr = ini.GetValue("META", "tssMode"); + auto tenantMode = ini.GetValue("META", "tenantMode"); + if (tenantMode != nullptr) { + testConfig->tenantModes.push_back(tenantMode); + } + std::string defaultTenant = ini.GetValue("META", "defaultTenant", ""); + if (!defaultTenant.empty()) { + testConfig->defaultTenant = defaultTenant; + } if (tssModeStr != nullptr) { g_simulator->tssMode = (ISimulator::TSSMode)atoi(tssModeStr); } ClusterConnectionString conn(ini.GetValue("META", "connectionString")); - if (testConfig.extraDatabaseMode == ISimulator::ExtraDatabaseMode::Local) { + if (testConfig->extraDatabaseMode == ISimulator::ExtraDatabaseMode::Local) { g_simulator->extraDatabases.clear(); g_simulator->extraDatabases.push_back(conn.toString()); } - if (!testConfig.disableHostname) { + if (!testConfig->disableHostname) { auto mockDNSStr = ini.GetValue("META", "mockDNS"); if (mockDNSStr != nullptr) { INetworkConnections::net()->parseMockDNSFromString(mockDNSStr); } } auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection(); - if (testConfig.disableRemoteKVS) { + if (testConfig->disableRemoteKVS) { g_knobs.setKnob("remote_kv_store", KnobValueRef::create(bool{ false })); TraceEvent(SevDebug, "DisableRemoteKVS"); } - if (testConfig.disableEncryption) { + if (testConfig->disableEncryption) { g_knobs.setKnob("enable_encryption", KnobValueRef::create(bool{ false })); g_knobs.setKnob("enable_tlog_encryption", KnobValueRef::create(bool{ false })); g_knobs.setKnob("enable_storage_server_encryption", KnobValueRef::create(bool{ false })); @@ -2451,9 +2506,7 @@ ACTOR void setupAndRun(std::string dataFolder, allowList.addTrustedSubnet("0.0.0.0/2"sv); allowList.addTrustedSubnet("abcd::/16"sv); state bool allowDefaultTenant = testConfig.allowDefaultTenant; - state bool allowDisablingTenants = testConfig.allowDisablingTenants; state bool allowCreatingTenants = testConfig.allowCreatingTenants; - state bool tenantModeRequired = testConfig.tenantModeRequired; if (!SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { testConfig.storageEngineExcludeTypes.push_back(5); @@ -2465,12 +2518,6 @@ ACTOR void setupAndRun(std::string dataFolder, if (std::string_view(testFile).find("restarting") != std::string_view::npos) { testConfig.storageEngineExcludeTypes.push_back(4); testConfig.storageEngineExcludeTypes.push_back(5); - - // Disable the default tenant in restarting tests for now - // TODO: persist the chosen default tenant in the restartInfo.ini file for the second test - allowDefaultTenant = false; - allowCreatingTenants = false; - tenantModeRequired = false; } // TODO: Currently backup and restore related simulation tests are failing when run with rocksDB storage engine @@ -2520,31 +2567,28 @@ ACTOR void setupAndRun(std::string dataFolder, state Optional<TenantName> defaultTenant; state Standalone<VectorRef<TenantNameRef>> tenantsToCreate; state TenantMode tenantMode = TenantMode::DISABLED; - if (tenantModeRequired || (allowDefaultTenant && deterministicRandom()->random01() < 0.5)) { - defaultTenant = "SimulatedDefaultTenant"_sr; - tenantsToCreate.push_back_deep(tenantsToCreate.arena(), defaultTenant.get()); - if 
(tenantModeRequired || deterministicRandom()->random01() < 0.9) { - tenantMode = TenantMode::REQUIRED; - } else { + // If this is a restarting test, restartInfo.ini is read in restartSimulatedSystem + // where we update the defaultTenant and tenantMode in the testConfig + // Defer setting tenant mode and default tenant until later + if (!rebooting) { + if (testConfig.tenantModes.size()) { + auto randomPick = deterministicRandom()->randomChoice(testConfig.tenantModes); + tenantMode = TenantMode::fromString(randomPick); + if (tenantMode == TenantMode::REQUIRED && allowDefaultTenant) { + defaultTenant = "SimulatedDefaultTenant"_sr; + } + } else if (allowDefaultTenant && deterministicRandom()->coinflip()) { + defaultTenant = "SimulatedDefaultTenant"_sr; + if (deterministicRandom()->random01() < 0.9) { + tenantMode = TenantMode::REQUIRED; + } else { + tenantMode = TenantMode::OPTIONAL_TENANT; + } + } else if (deterministicRandom()->coinflip()) { tenantMode = TenantMode::OPTIONAL_TENANT; } - } else if (!allowDisablingTenants || deterministicRandom()->random01() < 0.5) { - tenantMode = TenantMode::OPTIONAL_TENANT; } - if (allowCreatingTenants && tenantMode != TenantMode::DISABLED && deterministicRandom()->random01() < 0.5) { - int numTenants = deterministicRandom()->randomInt(1, 6); - for (int i = 0; i < numTenants; ++i) { - tenantsToCreate.push_back_deep(tenantsToCreate.arena(), - TenantNameRef(format("SimulatedExtraTenant%04d", i))); - } - } - - TraceEvent("SimulatedClusterTenantMode") - .detail("UsingTenant", defaultTenant) - .detail("TenantRequired", tenantMode.toString()) - .detail("TotalTenants", tenantsToCreate.size()); - try { // systemActors.push_back( startSystemMonitor(dataFolder) ); if (rebooting) { @@ -2553,7 +2597,7 @@ ACTOR void setupAndRun(std::string dataFolder, &testerCount, &connectionString, &startingConfiguration, - testConfig, + &testConfig, whitelistBinPaths, protocolVersion), 100.0)); @@ -2574,6 +2618,31 @@ ACTOR void setupAndRun(std::string dataFolder, tenantMode); wait(delay(1.0)); // FIXME: WHY!!! 
//wait for machines to boot } + // restartSimulatedSystem can adjust some testConfig params related to tenants + // so set/overwrite those options if necessary here + if (rebooting && testConfig.tenantModes.size()) { + tenantMode = TenantMode::fromString(testConfig.tenantModes[0]); + } + if (testConfig.defaultTenant.present() && tenantMode != TenantMode::DISABLED && allowDefaultTenant) { + // Default tenant set by testConfig or restarting data in restartInfo.ini + defaultTenant = testConfig.defaultTenant.get(); + } + if (!rebooting) { + if (defaultTenant.present() && allowDefaultTenant) { + tenantsToCreate.push_back_deep(tenantsToCreate.arena(), defaultTenant.get()); + } + if (allowCreatingTenants && tenantMode != TenantMode::DISABLED && deterministicRandom()->coinflip()) { + int numTenants = deterministicRandom()->randomInt(1, 6); + for (int i = 0; i < numTenants; ++i) { + tenantsToCreate.push_back_deep(tenantsToCreate.arena(), + TenantNameRef(format("SimulatedExtraTenant%04d", i))); + } + } + } + TraceEvent("SimulatedClusterTenantMode") + .detail("UsingTenant", defaultTenant) + .detail("TenantMode", tenantMode.toString()) + .detail("TotalTenants", tenantsToCreate.size()); std::string clusterFileDir = joinPath(dataFolder, deterministicRandom()->randomUniqueID().toString()); platform::createDirectory(clusterFileDir); writeFile(joinPath(clusterFileDir, "fdb.cluster"), connectionString.get().toString()); diff --git a/fdbserver/TenantCache.actor.cpp b/fdbserver/TenantCache.actor.cpp index b226d016e1..690a0dc871 100644 --- a/fdbserver/TenantCache.actor.cpp +++ b/fdbserver/TenantCache.actor.cpp @@ -18,14 +18,16 @@ * limitations under the License. */ +#include <limits> +#include <string> + #include "fdbclient/SystemData.h" #include "fdbclient/FDBTypes.h" #include "fdbserver/DDTeamCollection.h" #include "fdbserver/TenantCache.h" #include "flow/flow.h" -#include <limits> -#include <string> -#include "flow/actorcompiler.h" +#include "flow/Trace.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
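The two monitors added below poll on independent intervals and meet in a per-tenant Storage entry (declared in TenantCache.h later in this diff) whose quota defaults to the maximum int64_t. A minimal sketch of why that default is the safe one: a tenant whose usage has been sampled but whose quota row has not yet been read can never appear over quota (plain C++, with std::string standing in for TenantName):

#include <cstdint>
#include <iostream>
#include <limits>
#include <string>
#include <unordered_map>

struct Storage {
    int64_t quota = std::numeric_limits<int64_t>::max(); // no quota row read yet
    int64_t usage = 0;
};

int main() {
    std::unordered_map<std::string, Storage> tenantStorageMap;
    tenantStorageMap["tenantA"].usage = 5000000; // usage monitor ran; quota monitor has not
    tenantStorageMap["tenantB"] = { 1000000, 5000000 }; // both monitors ran
    for (const auto& [tenant, s] : tenantStorageMap) {
        if (s.usage > s.quota) {
            std::cout << tenant << " is over quota\n"; // prints only tenantB
        }
    }
    return 0;
}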
class TenantCacheImpl { @@ -116,6 +118,63 @@ public: } } } + + ACTOR static Future<Void> monitorStorageUsage(TenantCache* tenantCache) { + TraceEvent(SevInfo, "StartingTenantCacheStorageUsageMonitor", tenantCache->id()).log(); + + state int refreshInterval = SERVER_KNOBS->TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL; + state double lastTenantListFetchTime = now(); + + loop { + state double fetchStartTime = now(); + state std::vector<TenantName> tenants = tenantCache->getTenantList(); + state int i; + for (i = 0; i < tenants.size(); i++) { + state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenants[i]); + loop { + try { + state int64_t size = wait(tr.getEstimatedRangeSizeBytes(normalKeys)); + tenantCache->tenantStorageMap[tenants[i]].usage = size; + break; + } catch (Error& e) { + TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e); + wait(tr.onError(e)); + } + } + } + + lastTenantListFetchTime = now(); + if (lastTenantListFetchTime - fetchStartTime > (2 * refreshInterval)) { + TraceEvent(SevWarn, "TenantCacheGetStorageUsageRefreshSlow", tenantCache->id()).log(); + } + wait(delay(refreshInterval)); + } + } + + ACTOR static Future<Void> monitorStorageQuota(TenantCache* tenantCache) { + TraceEvent(SevInfo, "StartingTenantCacheStorageQuotaMonitor", tenantCache->id()).log(); + + state Transaction tr(tenantCache->dbcx()); + + loop { + loop { + try { + state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY)); + for (auto const kv : currentQuotas) { + TenantName const tenant = kv.key.removePrefix(storageQuotaPrefix); + int64_t const quota = BinaryReader::fromStringRef<int64_t>(kv.value, Unversioned()); + tenantCache->tenantStorageMap[tenant].quota = quota; + } + tr.reset(); + break; + } catch (Error& e) { + TraceEvent("TenantCacheGetStorageQuotaError", tenantCache->id()).error(e); + wait(tr.onError(e)); + } + } + wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL)); + } + } }; void TenantCache::insert(TenantName& tenantName, TenantMapEntry& tenant) { @@ -170,6 +229,14 @@ int TenantCache::cleanup() { return tenantsRemoved; } +std::vector<TenantName> TenantCache::getTenantList() const { + std::vector<TenantName> tenants; + for (const auto& [prefix, entry] : tenantCache) { + tenants.push_back(entry->name()); + } + return tenants; +} + std::string TenantCache::desc() const { std::string s("@Generation: "); s += std::to_string(generation) + " "; @@ -216,10 +283,28 @@ Optional<Reference<TCTenantInfo>> TenantCache::tenantOwning(KeyRef key) const { return it->value; } +std::vector<TenantName> TenantCache::getTenantsOverQuota() const { + std::vector<TenantName> tenants; + for (const auto& [tenant, storage] : tenantStorageMap) { + if (storage.usage > storage.quota) { + tenants.push_back(tenant); + } + } + return tenants; +} + Future<Void> TenantCache::monitorTenantMap() { return TenantCacheImpl::monitorTenantMap(this); } +Future<Void> TenantCache::monitorStorageUsage() { + return TenantCacheImpl::monitorStorageUsage(this); +} + +Future<Void> TenantCache::monitorStorageQuota() { + return TenantCacheImpl::monitorStorageQuota(this); +} + class TenantCacheUnitTest { public: ACTOR static Future<Void> InsertAndTestPresence() { diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index dbe1973b3e..7126601e3f 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -153,6 +153,15 @@ public: ~PriorityMultiLock() { prioritylock_printf("destruct"); } + void kill() { + 
brokenOnDestruct.sendError(broken_promise()); + fRunner.cancel(); + runners.clear(); + for (auto& w : waiters) { + w.clear(); + } + } + Future<Lock> lock(int priority = 0) { prioritylock_printf("lock begin %s\n", toString().c_str()); @@ -1806,9 +1815,17 @@ ACTOR Future<Void> redwoodMetricsLogger() { } // Holds an index of recently used objects. -// ObjectType must have the methods -// bool evictable() const; // return true if the entry can be evicted -// Future<Void> onEvictable() const; // ready when entry can be evicted +// ObjectType must have these methods +// +// // Returns true iff the entry can be evicted +// bool evictable() const; +// +// // Ready when object is safe to evict from cache +// Future<Void> onEvictable() const; +// +// // Ready when object destruction is safe +// // Should cancel pending async operations that are safe to cancel when cache is being destroyed +// Future<Void> cancel() const; template <class IndexType, class ObjectType> class ObjectCache : NonCopyable { struct Entry; @@ -2031,7 +2048,7 @@ public: } // Clears the cache, saving the entries to second cache, then waits for each item to be evictable and evicts it. - ACTOR static Future<Void> clear_impl(ObjectCache* self) { + ACTOR static Future<Void> clear_impl(ObjectCache* self, bool waitForSafeEviction) { // Claim ownership of all of our cached items, removing them from the evictor's control and quota. for (auto& ie : self->cache) { self->pEvictor->reclaim(ie.second); } @@ -2043,16 +2060,15 @@ public: state typename CacheT::iterator i = self->cache.begin(); while (i != self->cache.end()) { - if (!i->second.item.evictable()) { - wait(i->second.item.onEvictable()); - } + wait(waitForSafeEviction ? i->second.item.onEvictable() : i->second.item.cancel()); ++i; } + self->cache.clear(); return Void(); } - Future<Void> clear() { return clear_impl(this); } + Future<Void> clear(bool waitForSafeEviction = false) { return clear_impl(this, waitForSafeEviction); } // Move the prioritized evictions queued to the front of the eviction order void flushPrioritizedEvictions() { pEvictor->moveIn(prioritizedEvictions); } @@ -2113,6 +2129,13 @@ public: // Entry is evictable when its write and read futures are ready, even if they are // errors, so any buffers they hold are no longer needed by the underlying file actors Future<Void> onEvictable() const { return ready(readFuture) && ready(writeFuture); } + + // Read and write futures are safe to cancel so just cancel them and return + Future<Void> cancel() { + writeFuture.cancel(); + readFuture.cancel(); + return Void(); + } }; typedef ObjectCache<LogicalPageID, PageCacheEntry> PageCacheT; @@ -2660,14 +2683,15 @@ public: Future<LogicalPageID> newExtentPageID(QueueID queueID) override { return newExtentPageID_impl(this, queueID); } - ACTOR static Future<Void> writePhysicalBlock(DWALPager* self, - Reference<ArenaPage> page, - int blockNum, - int blockSize, - PhysicalPageID pageID, - PagerEventReasons reason, - unsigned int level, - bool header) { + // Write one block of a physical page to the page file. Futures returned must be allowed to complete. + ACTOR static UNCANCELLABLE Future<Void> writePhysicalBlock(DWALPager* self, + Reference<ArenaPage> page, + int blockNum, + int blockSize, + PhysicalPageID pageID, + PagerEventReasons reason, + unsigned int level, + bool header) { state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(header ?
ioMaxPriority : ioMinPriority)); ++g_redwoodMetrics.metric.pagerDiskWrite; @@ -2691,7 +2715,11 @@ public: // Note: Not using forwardError here so a write error won't be discovered until commit time. debug_printf("DWALPager(%s) op=writeBlock %s\n", self->filename.c_str(), toString(pageID).c_str()); wait(self->pageFile->write(page->rawData() + (blockNum * blockSize), blockSize, (int64_t)pageID * blockSize)); - debug_printf("DWALPager(%s) op=writeBlockDone %s\n", self->filename.c_str(), toString(pageID).c_str()); + + // This next line could crash on shutdown: this actor cannot be cancelled, so self may already have been + // destroyed by the time the write completes. Enable it with caution when debugging. + // debug_printf("DWALPager(%s) op=writeBlockDone %s\n", self->filename.c_str(), toString(pageID).c_str()); + return Void(); } @@ -2715,6 +2743,7 @@ public: return Void(); } + // All returned futures are added to the operations vector Future<Void> writePhysicalPage(PagerEventReasons reason, unsigned int level, Standalone<VectorRef<PhysicalPageID>> pageIDs, @@ -2938,18 +2967,19 @@ public: } void freeExtent(LogicalPageID pageID) override { freeExtent_impl(this, pageID); } - ACTOR static Future<int> readPhysicalBlock(DWALPager* self, - uint8_t* data, - int blockSize, - int64_t offset, - int priority) { + ACTOR static UNCANCELLABLE Future<int> readPhysicalBlock(DWALPager* self, + Reference<ArenaPage> pageBuffer, + int pageOffset, + int blockSize, + int64_t offset, + int priority) { state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(std::min(priority, ioMaxPriority))); ++g_redwoodMetrics.metric.pagerDiskRead; - int bytes = wait(self->pageFile->read(data, blockSize, offset)); + int bytes = wait(self->pageFile->read(pageBuffer->rawData() + pageOffset, blockSize, offset)); return bytes; } - // Read a physical page from the page file. Note that header pages use a page size of smallestPhysicalBlock + // Read a physical page from the page file. Note that header pages use a page size of smallestPhysicalBlock. // If the user chosen physical page size is larger, then there will be a gap of unused space after the header pages // and before the user-chosen sized pages.
ACTOR static Future<Reference<ArenaPage>> readPhysicalPage(DWALPager* self, @@ -2966,8 +2996,8 @@ public: page->rawData(), header); - int readBytes = wait( - readPhysicalBlock(self, page->rawData(), page->rawSize(), (int64_t)pageID * page->rawSize(), priority)); + int readBytes = + wait(readPhysicalBlock(self, page, 0, page->rawSize(), (int64_t)pageID * page->rawSize(), priority)); debug_printf("DWALPager(%s) op=readPhysicalDiskReadComplete %s ptr=%p bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), @@ -3030,8 +3060,8 @@ public: state int blockSize = self->physicalPageSize; std::vector<Future<int>> reads; for (int i = 0; i < pageIDs.size(); ++i) { - reads.push_back(readPhysicalBlock( - self, page->rawData() + (i * blockSize), blockSize, ((int64_t)pageIDs[i]) * blockSize, priority)); + reads.push_back( + readPhysicalBlock(self, page, i * blockSize, blockSize, ((int64_t)pageIDs[i]) * blockSize, priority)); } // wait for all the parallel read futures wait(waitForAll(reads)); @@ -3268,8 +3298,8 @@ public: currentOffset = i * physicalReadSize; debug_printf("DWALPager(%s) current offset %" PRId64 "\n", self->filename.c_str(), currentOffset); ++g_redwoodMetrics.metric.pagerDiskRead; - reads.push_back( - self->pageFile->read(extent->rawData() + currentOffset, physicalReadSize, startOffset + currentOffset)); + reads.push_back(self->readPhysicalBlock( + self, extent, currentOffset, physicalReadSize, startOffset + currentOffset, ioMaxPriority)); } // Handle the last read separately as it may be smaller than physicalReadSize @@ -3281,8 +3311,8 @@ public: currentOffset, lastReadSize); ++g_redwoodMetrics.metric.pagerDiskRead; - reads.push_back( - self->pageFile->read(extent->rawData() + currentOffset, lastReadSize, startOffset + currentOffset)); + reads.push_back(self->readPhysicalBlock( + self, extent, currentOffset, lastReadSize, startOffset + currentOffset, ioMaxPriority)); } // wait for all the parallel read futures for the given extent @@ -3747,30 +3777,36 @@ public: Value getCommitRecord() const override { return lastCommittedHeader.userCommitRecord; } ACTOR void shutdown(DWALPager* self, bool dispose) { + // Send to the error promise first and then delay(0) to give users a chance to cancel + // any outstanding operations + if (self->errorPromise.canBeSet()) { + debug_printf("DWALPager(%s) shutdown sending error\n", self->filename.c_str()); + self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress + } + wait(delay(0)); + + // The next section explicitly cancels all pending operations held in the pager + debug_printf("DWALPager(%s) shutdown kill ioLock\n", self->filename.c_str()); + self->ioLock.kill(); + debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str()); self->recoverFuture.cancel(); debug_printf("DWALPager(%s) shutdown cancel commit\n", self->filename.c_str()); self->commitFuture.cancel(); debug_printf("DWALPager(%s) shutdown cancel remap\n", self->filename.c_str()); self->remapCleanupFuture.cancel(); + debug_printf("DWALPager(%s) shutdown kill file extension\n", self->filename.c_str()); + self->fileExtension.cancel(); - if (self->errorPromise.canBeSet()) { - debug_printf("DWALPager(%s) shutdown sending error\n", self->filename.c_str()); - self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress + debug_printf("DWALPager(%s) shutdown cancel operations\n", self->filename.c_str()); + for (auto& f : self->operations) { + f.cancel(); } - - // Must wait for pending operations to 
complete, canceling them can cause a crash because the underlying - // operations may be uncancellable and depend on memory from calling scope's page reference - debug_printf("DWALPager(%s) shutdown wait for operations\n", self->filename.c_str()); - - // Pending ops must be all ready, errors are okay - wait(waitForAllReady(self->operations)); self->operations.clear(); debug_printf("DWALPager(%s) shutdown destroy page cache\n", self->filename.c_str()); wait(self->extentCache.clear()); wait(self->pageCache.clear()); - wait(delay(0)); debug_printf("DWALPager(%s) shutdown remappedPagesMap: %s\n", self->filename.c_str(), @@ -3995,7 +4031,11 @@ private: Promise<Void> closedPromise; Promise<Void> errorPromise; Future<Void> commitFuture; + + // The operations vector is used to hold all disk writes made by the Pager, but could also hold + // other operations that need to be waited on before a commit can finish. std::vector<Future<Void>> operations; + Future<Void> recoverFuture; Future<Void> remapCleanupFuture; bool remapCleanupStop; @@ -4767,7 +4807,7 @@ struct BoundaryRefAndPage { // DecodeBoundaryVerifier provides simulation-only verification of DeltaTree boundaries between // reads and writes by using a static structure to track boundaries used during DeltaTree generation // for all writes and updates across cold starts and virtual process restarts. -struct DecodeBoundaryVerifier { +class DecodeBoundaryVerifier { struct DecodeBoundaries { Key lower; Key upper; @@ -4778,11 +4818,13 @@ struct DecodeBoundaryVerifier { typedef std::map<Version, DecodeBoundaries> BoundariesByVersion; std::unordered_map<LogicalPageID, BoundariesByVersion> boundariesByPageID; - std::vector<Key> boundarySamples; int boundarySampleSize = 1000; int boundaryPopulation = 0; Reference<IPageEncryptionKeyProvider> keyProvider; +public: + std::vector<Key> boundarySamples; + // Sample rate of pages to be scanned to verify if all entries in the page meet domain prefix requirement. double domainPrefixScanProbability = 0.01; uint64_t domainPrefixScanCount = 0; @@ -4811,7 +4853,7 @@ struct DecodeBoundaryVerifier { if (boundarySamples.empty()) { return Key(); } - return boundarySamples[deterministicRandom()->randomInt(0, boundarySamples.size())]; + return deterministicRandom()->randomChoice(boundarySamples); } bool update(BTreeNodeLinkRef id, @@ -5377,6 +5419,15 @@ public: Future<Void> init() { return m_init; } virtual ~VersionedBTree() { + // DecodeBoundaryVerifier objects outlive simulated processes. + // Thus, if we did not clear the key providers here, each DecodeBoundaryVerifier object might + // maintain references to untracked peers through its key provider. This would result in + // errors when FlowTransport::removePeerReference is called to remove a peer that is no + // longer tracked by FlowTransport::transport(). + if (m_pBoundaryVerifier != nullptr) { + m_pBoundaryVerifier->setKeyProvider(Reference<IPageEncryptionKeyProvider>()); + } + // This probably shouldn't be called directly (meaning deleting an instance directly) but it should be safe, // it will cancel init and commit and leave the pager alive but with potentially an incomplete set of // uncommitted writes so it should not be committed. 
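A theme running through the DWALPager changes above: the actors that touch the disk are now UNCANCELLABLE and take their buffer as a Reference<ArenaPage>, so the I/O actor itself keeps the page alive, and shutdown can signal the error promise, kill the ioLock, and cancel the remaining operations without risking use-after-free. The same ownership idea in plain C++, with shared_ptr and std::async as rough stand-ins for Reference<ArenaPage> and an uncancellable actor:

#include <future>
#include <iostream>
#include <memory>
#include <vector>

std::future<void> writeBlock(std::shared_ptr<std::vector<char>> page) {
    // The lambda captures the shared_ptr by value, so the buffer outlives the caller.
    return std::async(std::launch::async, [page] {
        std::cout << "wrote " << page->size() << " bytes\n";
    });
}

int main() {
    std::future<void> f = writeBlock(std::make_shared<std::vector<char>>(4096, '\0'));
    // The caller's reference is gone here; the in-flight write still owns the page.
    f.get();
    return 0;
}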
@@ -8187,7 +8238,9 @@ public: Future<Void> getError() const override { return delayed(m_error.getFuture()); }; - void clear(KeyRangeRef range, const Arena* arena = 0) override { + void clear(KeyRangeRef range, + const StorageServerMetrics* storageMetrics = nullptr, + const Arena* arena = 0) override { debug_printf("CLEAR %s\n", printable(range).c_str()); m_tree->clear(range); } diff --git a/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h b/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h index b7510bd0b1..c4a2bc2344 100644 --- a/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h +++ b/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h @@ -140,9 +140,27 @@ private: Future<Void> collection; }; +// Defines the granule info needed by a full restore +struct BlobGranuleRestoreVersion { + // Two constructors required by VectorRef + BlobGranuleRestoreVersion() {} + BlobGranuleRestoreVersion(Arena& a, const BlobGranuleRestoreVersion& copyFrom) + : granuleID(copyFrom.granuleID), keyRange(a, copyFrom.keyRange), version(copyFrom.version), + sizeInBytes(copyFrom.sizeInBytes) {} + + UID granuleID; + KeyRangeRef keyRange; + Version version; + int64_t sizeInBytes; +}; + +// Defines a vector of BlobGranuleRestoreVersion +typedef Standalone<VectorRef<BlobGranuleRestoreVersion>> BlobGranuleRestoreVersionVector; + ACTOR Future<Void> dumpManifest(Database db, Reference<BlobConnectionProvider> blobConn, int64_t epoch, int64_t seqNo); ACTOR Future<Void> loadManifest(Database db, Reference<BlobConnectionProvider> blobConn); ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProvider> blobConn); +ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db, Reference<BlobConnectionProvider> blobConn); inline bool isFullRestoreMode() { return SERVER_KNOBS->BLOB_FULL_RESTORE_MODE; }; diff --git a/fdbserver/include/fdbserver/BlobMigratorInterface.h b/fdbserver/include/fdbserver/BlobMigratorInterface.h index 6e3cbe3c7c..5b9cb6b97a 100644 --- a/fdbserver/include/fdbserver/BlobMigratorInterface.h +++ b/fdbserver/include/fdbserver/BlobMigratorInterface.h @@ -30,23 +30,25 @@ struct BlobMigratorInterface { constexpr static FileIdentifier file_identifier = 869199; RequestStream<struct HaltBlobMigratorRequest> haltBlobMigrator; - RequestStream<ReplyPromise<Void>> waitFailure; LocalityData locality; UID uniqueID; + StorageServerInterface ssi; BlobMigratorInterface() {} - BlobMigratorInterface(const struct LocalityData& l, UID id) : uniqueID(id), locality(l) {} + BlobMigratorInterface(const struct LocalityData& l, UID id) : uniqueID(id), locality(l) { + ssi.locality = l; + ssi.uniqueID = id; + } - void initEndpoints() {} + void initEndpoints() { ssi.initEndpoints(); } UID id() const { return uniqueID; } - NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); } + NetworkAddress address() const { return haltBlobMigrator.getEndpoint().getPrimaryAddress(); } bool operator==(const BlobMigratorInterface& r) const { return id() == r.id(); } bool operator!=(const BlobMigratorInterface& r) const { return !(*this == r); } template <class Archive> void serialize(Archive& ar) { - // StorageServerInterface::serialize(ar); - serializer(ar, waitFailure, haltBlobMigrator, locality, uniqueID); + serializer(ar, locality, uniqueID, haltBlobMigrator); } }; diff --git a/fdbserver/include/fdbserver/DataDistribution.actor.h b/fdbserver/include/fdbserver/DataDistribution.actor.h index e8016101e3..c5b39ee3eb 100644 ---
a/fdbserver/include/fdbserver/DataDistribution.actor.h +++ b/fdbserver/include/fdbserver/DataDistribution.actor.h @@ -322,6 +322,9 @@ public: // Log physicalShard void logPhysicalShardCollection(); + // Checks if a physical shard exists. + bool physicalShardExists(uint64_t physicalShardID); + private: // Track physicalShard metrics by tracking keyRange metrics void updatePhysicalShardMetricsByKeyRange(KeyRange keyRange, @@ -481,10 +484,6 @@ ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize); // Determines the maximum shard size based on the size of the database int64_t getMaxShardSize(double dbSizeEstimate); -struct StorageQuotaInfo { - std::map<Key, uint64_t> quotaMap; -}; - #ifndef __INTEL_COMPILER #pragma endregion #endif diff --git a/fdbserver/include/fdbserver/IKeyValueStore.h b/fdbserver/include/fdbserver/IKeyValueStore.h index b9679d9f92..3069527d7a 100644 --- a/fdbserver/include/fdbserver/IKeyValueStore.h +++ b/fdbserver/include/fdbserver/IKeyValueStore.h @@ -29,6 +29,7 @@ #include "fdbserver/IClosable.h" #include "fdbserver/IPageEncryptionKeyProvider.actor.h" #include "fdbserver/ServerDBInfo.h" +#include "fdbserver/StorageMetrics.h" struct CheckpointRequest { const Version version; // The FDB version at which the checkpoint is created. @@ -52,7 +53,9 @@ public: // persistRangeMapping(). virtual bool shardAware() const { return false; } virtual void set(KeyValueRef keyValue, const Arena* arena = nullptr) = 0; - virtual void clear(KeyRangeRef range, const Arena* arena = nullptr) = 0; + virtual void clear(KeyRangeRef range, + const StorageServerMetrics* storageMetrics = nullptr, + const Arena* arena = nullptr) = 0; virtual Future<Void> canCommit() { return Void(); } virtual Future<Void> commit( bool sequential = false) = 0; // returns when prior sets and clears are (atomically) durable diff --git a/fdbserver/include/fdbserver/RemoteIKeyValueStore.actor.h b/fdbserver/include/fdbserver/RemoteIKeyValueStore.actor.h index e110c480c8..745df0dc9d 100644 --- a/fdbserver/include/fdbserver/RemoteIKeyValueStore.actor.h +++ b/fdbserver/include/fdbserver/RemoteIKeyValueStore.actor.h @@ -390,7 +390,9 @@ struct RemoteIKeyValueStore : public IKeyValueStore { void set(KeyValueRef keyValue, const Arena* arena = nullptr) override { interf.set.send(IKVSSetRequest{ keyValue, ReplyPromise<Void>() }); } - void clear(KeyRangeRef range, const Arena* arena = nullptr) override { + void clear(KeyRangeRef range, + const StorageServerMetrics* storageMetrics = nullptr, + const Arena* arena = nullptr) override { interf.clear.send(IKVSClearRequest{ range, ReplyPromise<Void>() }); } diff --git a/fdbserver/include/fdbserver/TenantCache.h b/fdbserver/include/fdbserver/TenantCache.h index b15bdcbf7a..32c9a0eeb6 100644 --- a/fdbserver/include/fdbserver/TenantCache.h +++ b/fdbserver/include/fdbserver/TenantCache.h @@ -32,6 +32,12 @@ typedef Map<KeyRef, Reference<TCTenantInfo>> TenantMapByPrefix; +struct Storage { + int64_t quota = std::numeric_limits<int64_t>::max(); + int64_t usage = 0; +}; +typedef std::unordered_map<TenantName, Storage> TenantStorageMap; + struct TenantCacheTenantCreated { KeyRange keys; Promise<bool> reply; @@ -50,6 +56,9 @@ private: uint64_t generation; TenantMapByPrefix tenantCache; + // Map from tenant names to storage quota and usage + TenantStorageMap tenantStorageMap; + // mark the start of a new sweep of the tenant cache void startRefresh(); @@ -62,6 +71,9 @@ private: // return count of tenants that were found to be stale and removed from the cache int cleanup(); + // 
diff --git a/fdbserver/include/fdbserver/TenantCache.h b/fdbserver/include/fdbserver/TenantCache.h
index b15bdcbf7a..32c9a0eeb6 100644
--- a/fdbserver/include/fdbserver/TenantCache.h
+++ b/fdbserver/include/fdbserver/TenantCache.h
@@ -32,6 +32,12 @@
 
 typedef Map<KeyRef, Reference<TCTenantInfo>> TenantMapByPrefix;
 
+struct Storage {
+	int64_t quota = std::numeric_limits<int64_t>::max();
+	int64_t usage = 0;
+};
+typedef std::unordered_map<TenantName, Storage> TenantStorageMap;
+
 struct TenantCacheTenantCreated {
 	KeyRange keys;
 	Promise<bool> reply;
@@ -50,6 +56,9 @@ private:
 	uint64_t generation;
 	TenantMapByPrefix tenantCache;
 
+	// Map from tenant names to storage quota and usage
+	TenantStorageMap tenantStorageMap;
+
 	// mark the start of a new sweep of the tenant cache
 	void startRefresh();
 
@@ -62,6 +71,9 @@ private:
 	// return count of tenants that were found to be stale and removed from the cache
 	int cleanup();
 
+	// Return the names of all tenants stored in the cache
+	std::vector<TenantName> getTenantList() const;
+
 	UID id() const { return distributorID; }
 
 	Database dbcx() const { return cx; }
@@ -77,9 +89,16 @@ public:
 
 	Future<Void> monitorTenantMap();
 
+	Future<Void> monitorStorageUsage();
+
+	Future<Void> monitorStorageQuota();
+
 	std::string desc() const;
 
 	bool isTenantKey(KeyRef key) const;
 
 	Optional<Reference<TCTenantInfo>> tenantOwning(KeyRef key) const;
+
+	// Get the list of tenants whose current storage usage exceeds their allocated quota
+	std::vector<TenantName> getTenantsOverQuota() const;
 };
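getTenantsOverQuota() is only declared in this header change; its definition is elsewhere. A plausible implementation over the new TenantStorageMap, shown purely as a sketch using the member names declared above:

    // Sketch: walk the quota/usage map and collect tenants that exceed their quota.
    std::vector<TenantName> TenantCache::getTenantsOverQuota() const {
    	std::vector<TenantName> tenants;
    	for (const auto& [tenant, storage] : tenantStorageMap) {
    		if (storage.usage > storage.quota) {
    			tenants.push_back(tenant);
    		}
    	}
    	return tenants;
    }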
diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp
index 5c4393de87..e62323ac35 100644
--- a/fdbserver/storageserver.actor.cpp
+++ b/fdbserver/storageserver.actor.cpp
@@ -86,6 +86,7 @@
 #include "fdbserver/TransactionTagCounter.h"
 #include "fdbserver/WaitFailure.h"
 #include "fdbserver/WorkerInterface.actor.h"
+#include "fdbserver/BlobGranuleServerCommon.actor.h"
 #include "flow/ActorCollection.h"
 #include "flow/Arena.h"
 #include "flow/Error.h"
@@ -2231,6 +2232,7 @@ ACTOR Future<Void> deleteCheckpointQ(StorageServer* self, Version version, Check
 // Serves FetchCheckpointRequests.
 ACTOR Future<Void> fetchCheckpointQ(StorageServer* self, FetchCheckpointRequest req) {
 	state ICheckpointReader* reader = nullptr;
+	state int64_t totalSize = 0;
 	TraceEvent("ServeFetchCheckpointBegin", self->thisServerID)
 	    .detail("CheckpointID", req.checkpointID)
 	    .detail("Token", req.token);
@@ -2255,12 +2257,14 @@ ACTOR Future<Void> fetchCheckpointQ(StorageServer* self, FetchCheckpointRequest
 			FetchCheckpointReply reply(req.token);
 			reply.data = data;
 			req.reply.send(reply);
+			totalSize += data.size();
 		}
 	} catch (Error& e) {
 		if (e.code() == error_code_end_of_stream) {
 			req.reply.sendError(end_of_stream());
 			TraceEvent("ServeFetchCheckpointEnd", self->thisServerID)
 			    .detail("CheckpointID", req.checkpointID)
+			    .detail("TotalSize", totalSize)
 			    .detail("Token", req.token);
 		} else {
 			TraceEvent(SevWarnAlways, "ServerFetchCheckpointFailure")
@@ -5973,27 +5977,26 @@ ACTOR Future<Void> tryGetRangeFromBlob(PromiseStream<RangeResult> results,
                                        Reference<BlobConnectionProvider> blobConn) {
 	ASSERT(blobConn.isValid());
 	try {
-
 		state Standalone<VectorRef<BlobGranuleChunkRef>> chunks = wait(tryReadBlobGranules(tr, keys, fetchVersion));
-
 		if (chunks.size() == 0) {
 			throw blob_granule_transaction_too_old(); // no data on blob
 		}
-
 		if (!isRangeFullyCovered(keys, chunks)) {
 			throw blob_granule_transaction_too_old();
 		}
-
-		for (const BlobGranuleChunkRef& chunk : chunks) {
-			state KeyRangeRef chunkRange = chunk.keyRange;
-			state RangeResult rows = wait(readBlobGranule(chunk, keys, 0, fetchVersion, blobConn));
+		state int i;
+		for (i = 0; i < chunks.size(); ++i) {
+			state KeyRangeRef chunkRange = chunks[i].keyRange;
+			state RangeResult rows = wait(readBlobGranule(chunks[i], keys, 0, fetchVersion, blobConn));
 			TraceEvent("ReadBlobData")
 			    .detail("Rows", rows.size())
 			    .detail("ChunkRange", chunkRange.toString())
 			    .detail("Keys", keys.toString());
-
 			if (rows.size() == 0) {
-				rows.readThrough = KeyRef(rows.arena(), chunkRange.end);
+				rows.readThrough = KeyRef(rows.arena(), std::min(chunkRange.end, keys.end));
+			}
+			if (i == chunks.size() - 1) {
+				rows.readThrough = KeyRef(rows.arena(), keys.end);
 			}
 			results.send(rows);
 		}
@@ -6007,7 +6010,7 @@ ACTOR Future<Void> tryGetRangeFromBlob(PromiseStream<RangeResult> results,
 		tr->reset();
 		tr->setVersion(fetchVersion);
 		tr->trState->taskID = TaskPriority::FetchKeys;
-		wait(tryGetRange(results, tr, keys)); // fail back to storage server
+		throw;
 	}
 	return Void();
 }
@@ -6795,8 +6798,10 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
 	// We must also ensure we have fetched all change feed metadata BEFORE changing the phase to fetching to ensure
 	// change feed mutations get applied correctly
 	state std::vector<Key> changeFeedsToFetch;
-	std::vector<Key> _cfToFetch = wait(fetchCFMetadata);
-	changeFeedsToFetch = _cfToFetch;
+	if (!isFullRestoreMode()) {
+		std::vector<Key> _cfToFetch = wait(fetchCFMetadata);
+		changeFeedsToFetch = _cfToFetch;
+	}
 	wait(data->durableVersionLock.take());
 	shard->phase = AddingShard::Fetching;
@@ -9471,7 +9476,7 @@ void setAssignedStatus(StorageServer* self, KeyRangeRef keys, bool nowAssigned)
 }
 
 void StorageServerDisk::clearRange(KeyRangeRef keys) {
-	storage->clear(keys);
+	storage->clear(keys, &data->metrics);
 	++(*kvClearRanges);
 }
@@ -9485,7 +9490,7 @@ void StorageServerDisk::writeMutation(MutationRef mutation) {
 		storage->set(KeyValueRef(mutation.param1, mutation.param2));
 		*kvCommitLogicalBytes += mutation.expectedSize();
 	} else if (mutation.type == MutationRef::ClearRange) {
-		storage->clear(KeyRangeRef(mutation.param1, mutation.param2));
+		storage->clear(KeyRangeRef(mutation.param1, mutation.param2), &data->metrics);
 		++(*kvClearRanges);
 	} else
 		ASSERT(false);
@@ -9500,7 +9505,7 @@ void StorageServerDisk::writeMutations(const VectorRef<MutationRef>& mutations,
 			storage->set(KeyValueRef(m.param1, m.param2));
 			*kvCommitLogicalBytes += m.expectedSize();
 		} else if (m.type == MutationRef::ClearRange) {
-			storage->clear(KeyRangeRef(m.param1, m.param2));
+			storage->clear(KeyRangeRef(m.param1, m.param2), &data->metrics);
 			++(*kvClearRanges);
 		}
 	}
@@ -9929,7 +9934,7 @@ ACTOR Future<bool> restoreDurableState(StorageServer* data, IKeyValueStore* stor
 		++data->counters.kvSystemClearRanges;
 		// TODO(alexmiller): Figure out how to selectively enable spammy data distribution events.
 		// DEBUG_KEY_RANGE("clearInvalidVersion", invalidVersion, clearRange);
-		storage->clear(clearRange);
+		storage->clear(clearRange, &data->metrics);
 		++data->counters.kvSystemClearRanges;
 		data->byteSampleApplyClear(clearRange, invalidVersion);
 	}
diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp
index 4f10fe9972..b8b853dfd4 100644
--- a/fdbserver/tester.actor.cpp
+++ b/fdbserver/tester.actor.cpp
@@ -838,21 +838,25 @@ ACTOR Future<Void> testerServerCore(TesterInterface interf,
 ACTOR Future<Void> clearData(Database cx) {
 	state Transaction tr(cx);
 	state UID debugID = debugRandom()->randomUniqueID();
-	TraceEvent("TesterClearingDatabaseStart", debugID).log();
 	tr.debugTransaction(debugID);
+
 	loop {
 		try {
+			TraceEvent("TesterClearingDatabaseStart", debugID).log();
 			// This transaction needs to be self-conflicting, but not conflict consistently with
 			// any other transactions
 			tr.clear(normalKeys);
 			tr.makeSelfConflicting();
-			wait(success(tr.getReadVersion())); // required since we use addReadConflictRange but not get
+			Version rv = wait(tr.getReadVersion()); // required since we use addReadConflictRange but not get
+			TraceEvent("TesterClearingDatabaseRV", debugID).detail("RV", rv);
 			wait(tr.commit());
 			TraceEvent("TesterClearingDatabase", debugID).detail("AtVersion", tr.getCommittedVersion());
 			break;
 		} catch (Error& e) {
 			TraceEvent(SevWarn, "TesterClearingDatabaseError", debugID).error(e);
 			wait(tr.onError(e));
+			debugID = debugRandom()->randomUniqueID();
+			tr.debugTransaction(debugID);
 		}
 	}
@@ -1274,7 +1278,9 @@ std::map<std::string, std::function<void(const std::string&)>> testSpecGlobalKey
 	{ "disableRemoteKVS",
 	  [](const std::string& value) { TraceEvent("TestParserTest").detail("ParsedRemoteKVS", ""); } },
 	{ "disableEncryption",
-	  [](const std::string& value) { TraceEvent("TestParserTest").detail("ParsedRemoteKVS", ""); } }
+	  [](const std::string& value) { TraceEvent("TestParserTest").detail("ParsedEncryption", ""); } },
+	{ "allowDefaultTenant",
+	  [](const std::string& value) { TraceEvent("TestParserTest").detail("ParsedDefaultTenant", ""); } }
 };
 
 std::map<std::string, std::function<void(const std::string& value, TestSpec* spec)>> testSpecTestKeys = {
diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp
index be77fd8eaf..a6019b6cec 100644
--- a/fdbserver/worker.actor.cpp
+++ b/fdbserver/worker.actor.cpp
@@ -2267,7 +2267,25 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
 					CODE_PROBE(true, "Recruited while already a blob migrator.");
 				} else {
 					startRole(Role::BLOB_MIGRATOR, recruited.id(), interf.id());
-					DUMPTOKEN(recruited.waitFailure);
+					DUMPTOKEN(recruited.haltBlobMigrator);
+					DUMPTOKEN(recruited.ssi.getValue);
+					DUMPTOKEN(recruited.ssi.getKey);
+					DUMPTOKEN(recruited.ssi.getKeyValues);
+					DUMPTOKEN(recruited.ssi.getMappedKeyValues);
+					DUMPTOKEN(recruited.ssi.getShardState);
+					DUMPTOKEN(recruited.ssi.waitMetrics);
+					DUMPTOKEN(recruited.ssi.splitMetrics);
+					DUMPTOKEN(recruited.ssi.getReadHotRanges);
+					DUMPTOKEN(recruited.ssi.getRangeSplitPoints);
+					DUMPTOKEN(recruited.ssi.getStorageMetrics);
+					DUMPTOKEN(recruited.ssi.waitFailure);
+					DUMPTOKEN(recruited.ssi.getQueuingMetrics);
+					DUMPTOKEN(recruited.ssi.getKeyValueStoreType);
+					DUMPTOKEN(recruited.ssi.watchValue);
+					DUMPTOKEN(recruited.ssi.getKeyValuesStream);
+					DUMPTOKEN(recruited.ssi.changeFeedStream);
+					DUMPTOKEN(recruited.ssi.changeFeedPop);
+					DUMPTOKEN(recruited.ssi.changeFeedVersionUpdate);
 
 					Future<Void> blobMigratorProcess = blobMigrator(recruited, dbInfo);
 					errorForwarders.add(forwardError(errors,
diff --git a/fdbserver/workloads/SaveAndKill.actor.cpp b/fdbserver/workloads/SaveAndKill.actor.cpp
index 08e478c203..60eeef02bb 100644
--- a/fdbserver/workloads/SaveAndKill.actor.cpp
+++ b/fdbserver/workloads/SaveAndKill.actor.cpp
@@ -50,9 +50,9 @@ struct SaveAndKillWorkload : TestWorkload {
 		g_simulator->disableSwapsToAll();
 		return Void();
 	}
 
-	Future<Void> start(Database const& cx) override { return _start(this); }
+	Future<Void> start(Database const& cx) override { return _start(this, cx); }
 
-	ACTOR Future<Void> _start(SaveAndKillWorkload* self) {
+	ACTOR Future<Void> _start(SaveAndKillWorkload* self, Database cx) {
 		state int i;
 		wait(delay(deterministicRandom()->random01() * self->testDuration));
@@ -68,6 +68,10 @@ struct SaveAndKillWorkload : TestWorkload {
 		ini.SetValue("META", "testerCount", format("%d", g_simulator->testerCount).c_str());
 		ini.SetValue("META", "tssMode", format("%d", g_simulator->tssMode).c_str());
 		ini.SetValue("META", "mockDNS", INetworkConnections::net()->convertMockDNSToString().c_str());
+		ini.SetValue("META", "tenantMode", cx->clientInfo->get().tenantMode.toString().c_str());
+		if (cx->defaultTenant.present()) {
+			ini.SetValue("META", "defaultTenant", cx->defaultTenant.get().toString().c_str());
+		}
 		ini.SetBoolValue("META", "enableEncryption", SERVER_KNOBS->ENABLE_ENCRYPTION);
 		ini.SetBoolValue("META", "enableTLogEncryption", SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION);
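The two new META entries are only useful if the restart half of the test reads them back; that consumer is outside this patch. A hedged sketch of the read path using the same SimpleIni API the workload writes with (the surrounding restart logic is assumed, not shown here):

    // Illustrative: recovering the tenant configuration saved by SaveAndKill.
    // GetValue(section, key, default) is standard CSimpleIni usage.
    CSimpleIni ini;
    ini.SetUnicode();
    ini.LoadFile("simfdb/restartInfo.ini");
    std::string tenantMode = ini.GetValue("META", "tenantMode", "disabled");
    const char* defaultTenant = ini.GetValue("META", "defaultTenant", nullptr);
    if (defaultTenant != nullptr) {
    	// re-establish the default tenant before running the second half of the test
    }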
diff --git a/fdbserver/workloads/StorageQuota.actor.cpp b/fdbserver/workloads/StorageQuota.actor.cpp
index 1623083412..469832550e 100644
--- a/fdbserver/workloads/StorageQuota.actor.cpp
+++ b/fdbserver/workloads/StorageQuota.actor.cpp
@@ -38,17 +38,17 @@ struct StorageQuotaWorkload : TestWorkload {
 		wait(setStorageQuotaHelper(cx, "name2"_sr, 200));
 		wait(setStorageQuotaHelper(cx, "name1"_sr, 300));
 
-		state Optional<uint64_t> quota1 = wait(getStorageQuotaHelper(cx, "name1"_sr));
+		state Optional<int64_t> quota1 = wait(getStorageQuotaHelper(cx, "name1"_sr));
 		ASSERT(quota1.present() && quota1.get() == 300);
-		state Optional<uint64_t> quota2 = wait(getStorageQuotaHelper(cx, "name2"_sr));
+		state Optional<int64_t> quota2 = wait(getStorageQuotaHelper(cx, "name2"_sr));
 		ASSERT(quota2.present() && quota2.get() == 200);
-		state Optional<uint64_t> quota3 = wait(getStorageQuotaHelper(cx, "name3"_sr));
+		state Optional<int64_t> quota3 = wait(getStorageQuotaHelper(cx, "name3"_sr));
 		ASSERT(!quota3.present());
 
 		return Void();
 	}
 
-	ACTOR static Future<Void> setStorageQuotaHelper(Database cx, StringRef tenantName, uint64_t quota) {
+	ACTOR static Future<Void> setStorageQuotaHelper(Database cx, StringRef tenantName, int64_t quota) {
 		state Transaction tr(cx);
 		loop {
 			try {
@@ -61,11 +61,11 @@ struct StorageQuotaWorkload : TestWorkload {
 		}
 	}
 
-	ACTOR static Future<Optional<uint64_t>> getStorageQuotaHelper(Database cx, StringRef tenantName) {
+	ACTOR static Future<Optional<int64_t>> getStorageQuotaHelper(Database cx, StringRef tenantName) {
 		state Transaction tr(cx);
 		loop {
 			try {
-				state Optional<uint64_t> quota = wait(getStorageQuota(&tr, tenantName));
+				state Optional<int64_t> quota = wait(getStorageQuota(&tr, tenantName));
 				wait(tr.commit());
 				return quota;
 			} catch (Error& e) {
diff --git a/fdbserver/workloads/StorageServerCheckpointRestoreTest.actor.cpp b/fdbserver/workloads/StorageServerCheckpointRestoreTest.actor.cpp
index e7ddf6afd4..3c1dd2f973 100644
--- a/fdbserver/workloads/StorageServerCheckpointRestoreTest.actor.cpp
+++ b/fdbserver/workloads/StorageServerCheckpointRestoreTest.actor.cpp
@@ -75,9 +75,11 @@ struct SSCheckpointRestoreWorkload : TestWorkload {
 		state KeyRange testRange = KeyRangeRef(key, endKey);
 		state std::vector<CheckpointMetaData> records;
 
+		TraceEvent("TestCheckpointRestoreBegin");
 		int ignore = wait(setDDMode(cx, 0));
 		state Version version = wait(self->writeAndVerify(self, cx, key, oldValue));
 
+		TraceEvent("TestCreatingCheckpoint").detail("Range", testRange);
 		// Create checkpoint.
 		state Transaction tr(cx);
 		state CheckpointFormat format = deterministicRandom()->coinflip() ? RocksDBColumnFamily : RocksDB;
diff --git a/fdbserver/workloads/TenantEntryCacheWorkload.actor.cpp b/fdbserver/workloads/TenantEntryCacheWorkload.actor.cpp
index 67bd5fd865..d827653573 100644
--- a/fdbserver/workloads/TenantEntryCacheWorkload.actor.cpp
+++ b/fdbserver/workloads/TenantEntryCacheWorkload.actor.cpp
@@ -18,11 +18,15 @@
  * limitations under the License.
  */
 
+#include "fdbclient/CommitProxyInterface.h"
+#include "fdbclient/DatabaseContext.h"
+#include "fdbclient/DatabaseConfiguration.h"
 #include "fdbclient/NativeAPI.actor.h"
 #include "fdbclient/TenantManagement.actor.h"
 #include "fdbclient/Knobs.h"
 #include "fdbclient/TenantEntryCache.actor.h"
+#include "fdbrpc/TenantName.h"
 #include "fdbserver/workloads/workloads.actor.h"
 #include "flow/Error.h"
 
@@ -78,8 +82,9 @@ struct TenantEntryCacheWorkload : TestWorkload {
 		return Void();
 	}
 
-	ACTOR static Future<Void> testTenantNotFound(Database cx) {
-		state Reference<TenantEntryCache<int64_t>> cache = makeReference<TenantEntryCache<int64_t>>(cx, createPayload);
+	ACTOR static Future<Void> testTenantNotFound(Database cx, TenantEntryCacheRefreshMode refreshMode) {
+		state Reference<TenantEntryCache<int64_t>> cache = makeReference<TenantEntryCache<int64_t>>(
+		    cx, deterministicRandom()->randomUniqueID(), createPayload, refreshMode);
 
 		TraceEvent("TenantNotFoundStart");
 		wait(cache->init());
@@ -93,18 +98,16 @@ struct TenantEntryCacheWorkload : TestWorkload {
 		Optional<TenantEntryCachePayload<int64_t>> value1 = wait(cache->getByPrefix(dummy.prefix));
 		ASSERT(!value1.present());
 
-		// Ensure associated counter values gets updated
-		ASSERT_EQ(cache->numRefreshByMisses(), 2);
-
 		TraceEvent("TenantNotFoundEnd");
 		return Void();
 	}
 
-	ACTOR static Future<Void> testCreateTenantsAndLookup(
-	    Database cx,
-	    TenantEntryCacheWorkload* self,
-	    std::vector<std::pair<TenantName, TenantMapEntry>>* tenantList) {
-		state Reference<TenantEntryCache<int64_t>> cache = makeReference<TenantEntryCache<int64_t>>(cx, createPayload);
+	ACTOR static Future<Void> testCreateTenantsAndLookup(Database cx,
+	                                                     TenantEntryCacheWorkload* self,
+	                                                     std::vector<std::pair<TenantName, TenantMapEntry>>* tenantList,
+	                                                     TenantEntryCacheRefreshMode refreshMode) {
+		state Reference<TenantEntryCache<int64_t>> cache = makeReference<TenantEntryCache<int64_t>>(
+		    cx, deterministicRandom()->randomUniqueID(), createPayload, refreshMode);
 		state int nTenants = deterministicRandom()->randomInt(5, self->maxTenants);
 
 		TraceEvent("CreateTenantsAndLookupStart");
@@ -140,8 +143,10 @@ struct TenantEntryCacheWorkload : TestWorkload {
 	ACTOR static Future<Void> testTenantInsert(Database cx,
 	                                           TenantEntryCacheWorkload* self,
-	                                           std::vector<std::pair<TenantName, TenantMapEntry>>* tenantList) {
-		state Reference<TenantEntryCache<int64_t>> cache = makeReference<TenantEntryCache<int64_t>>(cx, createPayload);
+	                                           std::vector<std::pair<TenantName, TenantMapEntry>>* tenantList,
+	                                           TenantEntryCacheRefreshMode refreshMode) {
+		state Reference<TenantEntryCache<int64_t>> cache = makeReference<TenantEntryCache<int64_t>>(
+		    cx, deterministicRandom()->randomUniqueID(), createPayload, refreshMode);
 
 		ASSERT(!tenantList->empty() && tenantList->size() >= 2);
@@ -187,8 +192,10 @@
 	}
 
 	ACTOR static Future<Void> testCacheReload(Database cx,
-	                                          std::vector<std::pair<TenantName, TenantMapEntry>>* tenantList) {
-		state Reference<TenantEntryCache<int64_t>> cache = makeReference<TenantEntryCache<int64_t>>(cx, createPayload);
+	                                          std::vector<std::pair<TenantName, TenantMapEntry>>* tenantList,
+	                                          TenantEntryCacheRefreshMode refreshMode) {
+		state Reference<TenantEntryCache<int64_t>> cache = makeReference<TenantEntryCache<int64_t>>(
+		    cx, deterministicRandom()->randomUniqueID(), createPayload, refreshMode);
 
 		ASSERT(!tenantList->empty());
@@ -243,6 +250,26 @@ struct TenantEntryCacheWorkload : TestWorkload {
 		return Void();
 	}
 
+	ACTOR static Future<Void> testCacheTenantsDisabled(Database cx) {
+		ASSERT(cx->clientInfo->get().tenantMode == TenantMode::DISABLED);
+		state Reference<TenantEntryCache<int64_t>> cache = makeReference<TenantEntryCache<int64_t>>(
+		    cx, deterministicRandom()->randomUniqueID(), createPayload, TenantEntryCacheRefreshMode::NONE);
+
+		TraceEvent("TestCacheTenantDisabledStart");
+
+		wait(cache->init());
+		// Ensure associated counter values get updated
+		ASSERT_EQ(cache->numRefreshByInit(), 1);
+		ASSERT_GE(cache->numCacheRefreshes(), 1);
+
+		Optional<TenantEntryCachePayload<int64_t>> entry = wait(cache->getById(12));
+		ASSERT(!entry.present());
+		ASSERT_EQ(cache->numCacheRefreshes(), 1);
+
+		TraceEvent("TestCacheTenantDisabledEnd");
+		return Void();
+	}
+
 	ACTOR static Future<Void> tenantEntryRemove(Database cx,
 	                                            std::vector<std::pair<TenantName, TenantMapEntry>>* tenantList) {
 		state Reference<TenantEntryCache<int64_t>> cache = makeReference<TenantEntryCache<int64_t>>(
@@ -288,6 +315,39 @@ struct TenantEntryCacheWorkload : TestWorkload {
 		return Void();
 	}
 
+	ACTOR static Future<Void> testCacheWatchRefresh(Database cx) {
+		state Reference<TenantEntryCache<int64_t>> cache = makeReference<TenantEntryCache<int64_t>>(
+		    cx, deterministicRandom()->randomUniqueID(), createPayload, TenantEntryCacheRefreshMode::WATCH);
+		wait(cache->init(true));
+		// Ensure associated counter values get updated
+		ASSERT_EQ(cache->numRefreshByInit(), 1);
+		ASSERT_GE(cache->numCacheRefreshes(), 1);
+
+		// Create tenant and make sure the cache is updated
+		state TenantName name = "TenantEntryCache_WatchRefresh"_sr;
+		state Optional<TenantMapEntry> entry = wait(TenantAPI::createTenant(cx.getReference(), name));
+		ASSERT(entry.present());
+
+		state double startTime = now();
+		state double waitUntil = startTime + 300; // 5 mins max wait
+		loop {
+			if (cache->numWatchRefreshes() >= 1) {
+				break;
+			}
+
+			if (now() > waitUntil) {
+				throw timed_out();
+			}
+
+			TraceEvent("TestCacheRefreshWait").detail("Elapsed", now() - startTime);
+			wait(delay(CLIENT_KNOBS->TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL));
+		}
+		Optional<TenantEntryCachePayload<int64_t>> payload = wait(cache->getByName(name));
+		ASSERT(payload.present());
+		compareTenants(payload, entry.get());
+		return Void();
+	}
+
 	Future<Void> setup(Database const& cx) override {
 		if (clientId == 0 && g_network->isSimulated() && BUGGIFY) {
 			IKnobCollection::getMutableGlobalKnobCollection().setKnob("tenant_entry_cache_list_refresh_interval",
@@ -306,13 +366,40 @@ ACTOR Future<Void> _start(Database cx, TenantEntryCacheWorkload* self) {
 		state std::vector<std::pair<TenantName, TenantMapEntry>> tenantList;
+		state TenantEntryCacheRefreshMode refreshMode;
+		if (deterministicRandom()->coinflip()) {
+			refreshMode = TenantEntryCacheRefreshMode::PERIODIC_TASK;
+		} else {
+			refreshMode = TenantEntryCacheRefreshMode::WATCH;
+		}
+
+		// get the tenant mode from db config
+		state Transaction tr = Transaction(cx);
+		state DatabaseConfiguration configuration;
+		loop {
+			try {
+				tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+				tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+				tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
+				RangeResult results = wait(tr.getRange(configKeys, CLIENT_KNOBS->TOO_MANY));
+				ASSERT(!results.more && results.size() < CLIENT_KNOBS->TOO_MANY);
+				configuration.fromKeyValues((VectorRef<KeyValueRef>)results);
+				break;
+			} catch (Error& e) {
+				wait(tr.onError(e));
+			}
+		}
 
-		wait(testTenantNotFound(cx));
-		wait(testCreateTenantsAndLookup(cx, self, &tenantList));
-		wait(testTenantInsert(cx, self, &tenantList));
-		wait(tenantEntryRemove(cx, &tenantList));
-		wait(testTenantCacheDefaultFunc(cx));
-		wait(testCacheRefresh(cx));
+		if (configuration.tenantMode == TenantMode::DISABLED) {
+			wait(testCacheTenantsDisabled(cx));
+		} else {
+			wait(testTenantNotFound(cx, refreshMode));
+			wait(testCreateTenantsAndLookup(cx, self, &tenantList, refreshMode));
+			wait(testTenantInsert(cx, self, &tenantList, refreshMode));
+			wait(tenantEntryRemove(cx, &tenantList));
+			wait(testTenantCacheDefaultFunc(cx));
+			wait(testCacheRefresh(cx));
+			wait(testCacheWatchRefresh(cx));
+		}
 
 		return Void();
 	}
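A note on the constructor shape exercised throughout this workload: every cache is now built with an explicit UID and refresh mode. A minimal usage sketch, assuming a Database cx and the workload's createPayload are in scope and that this runs inside an ACTOR, as in the tests above:

    // Sketch: building a cache in one of the refresh modes added by this patch.
    // WATCH refreshes via a watch, PERIODIC_TASK on a timer, and NONE (used when
    // tenants are disabled) only populates the cache at init.
    state Reference<TenantEntryCache<int64_t>> cache = makeReference<TenantEntryCache<int64_t>>(
        cx, deterministicRandom()->randomUniqueID(), createPayload, TenantEntryCacheRefreshMode::WATCH);
    wait(cache->init(true)); // init(true) as used by testCacheWatchRefresh() above
    Optional<TenantEntryCachePayload<int64_t>> hit = wait(cache->getByName("someTenant"_sr));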
diff --git a/fdbserver/workloads/TenantManagementWorkload.actor.cpp b/fdbserver/workloads/TenantManagementWorkload.actor.cpp
index 1d6c7786de..a0cdf8144f 100644
--- a/fdbserver/workloads/TenantManagementWorkload.actor.cpp
+++ b/fdbserver/workloads/TenantManagementWorkload.actor.cpp
@@ -23,11 +23,13 @@
 #include "fdbclient/ClientBooleanParams.h"
 #include "fdbclient/ClusterConnectionMemoryRecord.h"
 #include "fdbclient/FDBOptions.g.h"
+#include "fdbclient/FDBTypes.h"
 #include "fdbclient/GenericManagementAPI.actor.h"
 #include "fdbclient/KeyBackedTypes.h"
 #include "fdbclient/MetaclusterManagement.actor.h"
 #include "fdbclient/ReadYourWrites.h"
 #include "fdbclient/RunTransaction.actor.h"
+#include "fdbclient/Tenant.h"
 #include "fdbclient/TenantManagement.actor.h"
 #include "fdbclient/TenantSpecialKeys.actor.h"
 #include "fdbclient/ThreadSafeTransaction.h"
@@ -242,6 +244,56 @@ struct TenantManagementWorkload : TestWorkload {
 		return Void();
 	}
 
+	ACTOR template <class DB>
+	static Future<Versionstamp> getLastTenantModification(Reference<DB> db, OperationType type) {
+		state Reference<typename DB::TransactionT> tr = db->createTransaction();
+		loop {
+			try {
+				tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
+				if (type == OperationType::METACLUSTER) {
+					Versionstamp vs =
+					    wait(MetaclusterAPI::ManagementClusterMetadata::tenantMetadata().lastTenantModification.getD(
+					        tr, Snapshot::False, Versionstamp()));
+					return vs;
+				}
+				Versionstamp vs =
+				    wait(TenantMetadata::lastTenantModification().getD(tr, Snapshot::False, Versionstamp()));
+				return vs;
+			} catch (Error& e) {
+				wait(safeThreadFutureToFuture(tr->onError(e)));
+			}
+		}
+	}
+
+	ACTOR template <class DB>
+	static Future<Version> getLatestReadVersion(Reference<DB> db, OperationType type) {
+		state Reference<typename DB::TransactionT> tr = db->createTransaction();
+		loop {
+			try {
+				Version readVersion = wait(safeThreadFutureToFuture(tr->getReadVersion()));
+				return readVersion;
+			} catch (Error& e) {
+				wait(safeThreadFutureToFuture(tr->onError(e)));
+			}
+		}
+	}
+
+	static Future<Versionstamp> getLastTenantModification(TenantManagementWorkload* self, OperationType type) {
+		if (type == OperationType::METACLUSTER) {
+			return getLastTenantModification(self->mvDb, type);
+		} else {
+			return getLastTenantModification(self->dataDb.getReference(), type);
+		}
+	}
+
+	static Future<Version> getLatestReadVersion(TenantManagementWorkload* self, OperationType type) {
+		if (type == OperationType::METACLUSTER) {
+			return getLatestReadVersion(self->mvDb, type);
+		} else {
+			return getLatestReadVersion(self->dataDb.getReference(), type);
+		}
+	}
+
 	TenantName chooseTenantName(bool allowSystemTenant) {
 		TenantName tenant(format(
 		    "%s%08d", localTenantNamePrefix.toString().c_str(), deterministicRandom()->randomInt(0, maxTenants)));
@@ -373,6 +425,7 @@ struct TenantManagementWorkload : TestWorkload {
 	state int64_t minTenantCount = std::numeric_limits<int64_t>::max();
 	state int64_t finalTenantCount = 0;
 
+	state Version originalReadVersion = wait(self->getLatestReadVersion(self, operationType));
 	loop {
 		try {
 			// First, attempt to create the tenants
@@ -465,6 +518,9 @@ struct TenantManagementWorkload : TestWorkload {
 					ASSERT(entry.get().tenantGroup == tenantItr->second.tenantGroup);
 					ASSERT(entry.get().tenantState == TenantState::READY);
 
+					Versionstamp currentVersionstamp = wait(getLastTenantModification(self, operationType));
+					ASSERT_GT(currentVersionstamp.version, originalReadVersion);
+
 					if (self->useMetacluster) {
 						// In a metacluster, we should also see that the tenant was created on the data cluster
 						Optional<TenantMapEntry> dataEntry =
@@ -709,6 +765,7 @@ struct TenantManagementWorkload : TestWorkload {
 		return Void();
 	}
 
+	state Version originalReadVersion = wait(self->getLatestReadVersion(self, operationType));
 	loop {
 		try {
 			// Attempt to delete the tenant(s)
@@ -801,6 +858,11 @@ struct TenantManagementWorkload : TestWorkload {
 				// Deletion should not succeed if any tenant in the range wasn't empty
 				ASSERT(isEmpty);
 
+				if (tenants.size() > 0) {
+					Versionstamp currentVersionstamp = wait(getLastTenantModification(self, operationType));
+					ASSERT_GT(currentVersionstamp.version, originalReadVersion);
+				}
+
 				// Update our local state to remove the deleted tenants
 				for (auto tenant : tenants) {
 					auto itr = self->createdTenants.find(tenant);
@@ -1221,11 +1283,14 @@ struct TenantManagementWorkload : TestWorkload {
 		}
 	}
 
+	state Version originalReadVersion = wait(self->getLatestReadVersion(self, operationType));
 	loop {
 		try {
 			wait(renameTenantImpl(
 			    tr, operationType, tenantRenames, tenantNotFound, tenantExists, tenantOverlap, self));
 			wait(verifyTenantRenames(self, tenantRenames));
+			Versionstamp currentVersionstamp = wait(getLastTenantModification(self, operationType));
+			ASSERT_GT(currentVersionstamp.version, originalReadVersion);
 			// Check that using the wrong rename API fails depending on whether we are using a metacluster
 			ASSERT(self->useMetacluster == (operationType == OperationType::METACLUSTER));
 			return Void();
@@ -1361,6 +1426,7 @@ struct TenantManagementWorkload : TestWorkload {
 		configuration["invalid_option"_sr] = ""_sr;
 	}
 
+	state Version originalReadVersion = wait(self->getLatestReadVersion(self, operationType));
 	loop {
 		try {
 			wait(configureTenantImpl(tr, tenant, configuration, operationType, specialKeysUseInvalidTuple, self));
@@ -1369,6 +1435,8 @@
 			ASSERT(!hasInvalidOption);
 			ASSERT(!hasSystemTenantGroup);
 			ASSERT(!specialKeysUseInvalidTuple);
+			Versionstamp currentVersionstamp = wait(getLastTenantModification(self, operationType));
+			ASSERT_GT(currentVersionstamp.version, originalReadVersion);
 
 			auto itr = self->createdTenants.find(tenant);
 			if (itr->second.tenantGroup.present()) {
diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt
index 4a4080a729..62ce837478 100644
--- a/flow/CMakeLists.txt
+++ b/flow/CMakeLists.txt
@@ -47,14 +47,6 @@ endif()
 add_flow_target(LINK_TEST NAME flowlinktest SRCS LinkTest.cpp)
 target_link_libraries(flowlinktest PRIVATE flow stacktrace)
 
-#find_package(ZLIB)
-#if(ZLIB_FOUND)
-#  target_compile_definitions(flow PUBLIC ZLIB_LIB_SUPPORTED)
-#  target_link_libraries(flow PUBLIC ZLIB::ZLIB)
-#else()
-#  message(STATUS "ZLIB package not found")
-#endif()
-
 set(IS_ARM_MAC NO)
 if(APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
   set(IS_ARM_MAC YES)
diff --git a/flow/include/flow/IAsyncFile.h b/flow/include/flow/IAsyncFile.h
index 2c2534d210..94785c0a40 100644
--- a/flow/include/flow/IAsyncFile.h
+++ b/flow/include/flow/IAsyncFile.h
@@ -33,6 +33,10 @@
 // AsyncFileCached) maintain references, while others (AsyncFileNonDurable) don't, and the comment
 // is unapplicable to some others as well (AsyncFileKAIO). It's safest to assume that all operations
 // must complete or cancel, but you should probably look at the file implementations you'll be using.
+//
+// Wrapper classes such as AsyncFileWriteChecker and AsyncFileChaos should ideally be safe to destroy
+// with operations still pending, since they are inserted randomly into file opens in simulation and
+// some use cases drop file references while operations are still in flight.
 class IAsyncFile {
 public:
 	virtual ~IAsyncFile();
diff --git a/flow/include/flow/error_definitions.h b/flow/include/flow/error_definitions.h
index 396723c60c..25d77a8bad 100755
--- a/flow/include/flow/error_definitions.h
+++ b/flow/include/flow/error_definitions.h
@@ -206,7 +206,7 @@ ERROR( key_not_tuple, 2041, "The key cannot be parsed as a tuple" );
 ERROR( value_not_tuple, 2042, "The value cannot be parsed as a tuple" );
 ERROR( mapper_not_tuple, 2043, "The mapper cannot be parsed as a tuple" );
 ERROR( invalid_checkpoint_format, 2044, "Invalid checkpoint format" )
-ERROR( invalid_throttle_quota_value, 2045, "Failed to deserialize or initialize throttle quota value" )
+ERROR( invalid_throttle_quota_value, 2045, "Invalid quota value. Note that reserved_throughput cannot exceed total_throughput" )
 
 ERROR( incompatible_protocol_version, 2100, "Incompatible protocol version" )
 ERROR( transaction_too_large, 2101, "Transaction exceeds byte limit" )
diff --git a/flow/include/flow/genericactors.actor.h b/flow/include/flow/genericactors.actor.h
index eb1ecabd40..077886606d 100644
--- a/flow/include/flow/genericactors.actor.h
+++ b/flow/include/flow/genericactors.actor.h
@@ -177,7 +177,8 @@ Future<Void> waitForAllReady(std::vector<Future<T>> results) {
 		if (i == results.size())
 			return Void();
 		try {
-			wait(success(results[i]));
+			T t = wait(results[i]);
+			(void)t;
 		} catch (...) {
 		}
 		i++;
@@ -405,6 +406,20 @@ Future<Void> map(FutureStream<T> input, F func, PromiseStream<std::invoke_result
 	return Void();
 }
 
+// X + Y will wait for X, then wait for and return the result of Y
+ACTOR template <class A, class B>
+Future<B> operatorPlus(Future<A> a, Future<B> b) {
+	A resultA = wait(a);
+	(void)resultA;
+	B resultB = wait(b);
+	return resultB;
+}
+
+template <class A, class B>
+Future<B> operator+(Future<A> a, Future<B> b) {
+	return operatorPlus(a, b);
+}
+
 // Returns if the future returns true, otherwise waits forever.
 ACTOR Future<Void> returnIfTrue(Future<bool> f);
 
@@ -939,21 +954,22 @@ public:
 
 private:
 	template <class U>
-	friend Future<Void> quorum(std::vector<Future<U>> const& results, int n);
+	friend Future<Void> quorum(const Future<U>* pItems, int itemCount, int n);
 	Quorum<T>* head;
 	QuorumCallback() = default;
 	QuorumCallback(Future<T> future, Quorum<T>* head) : head(head) { future.addCallbackAndClear(this); }
 };
 
 template <class T>
-Future<Void> quorum(std::vector<Future<T>> const& results, int n) {
-	ASSERT(n >= 0 && n <= results.size());
+Future<Void> quorum(const Future<T>* pItems, int itemCount, int n) {
+	ASSERT(n >= 0 && n <= itemCount);
 
-	int size = Quorum<T>::sizeFor(results.size());
-	Quorum<T>* q = new (allocateFast(size)) Quorum<T>(n, results.size());
+	int size = Quorum<T>::sizeFor(itemCount);
+	Quorum<T>* q = new (allocateFast(size)) Quorum<T>(n, itemCount);
 
 	QuorumCallback<T>* nextCallback = q->callbacks();
-	for (auto& r : results) {
+	for (int i = 0; i < itemCount; ++i) {
+		auto& r = pItems[i];
 		if (r.isReady()) {
 			new (nextCallback) QuorumCallback<T>();
 			nextCallback->next = 0;
@@ -968,6 +984,11 @@ Future<Void> quorum(std::vector<Future<T>> const& results, int n) {
 	return Future<Void>(q);
 }
 
+template <class T>
+Future<Void> quorum(std::vector<Future<T>> const& results, int n) {
+	return quorum(&results.front(), results.size(), n);
+}
+
 ACTOR template <class T>
 Future<Void> smartQuorum(std::vector<Future<T>> results,
                          int required,
@@ -989,6 +1010,15 @@ Future<Void> waitForAll(std::vector<Future<T>> const& results) {
 	return quorum(results, (int)results.size());
 }
 
+// Wait for all futures in results to be ready and then throw the first (in execution order) error
+// if any of them resulted in an error.
+template <class T>
+Future<Void> waitForAllReadyThenThrow(std::vector<Future<T>> const& results) {
	Future<Void> f = waitForAll(results);
	Future<Void> fReady = waitForAllReady(results);
	return fReady + f;
+}
+
 template <class T>
 Future<Void> waitForAny(std::vector<Future<T>> const& results) {
 	if (results.empty())
@@ -1053,7 +1083,8 @@ Future<Void> success(Future<T> of) {
 ACTOR template <class T>
 Future<Void> ready(Future<T> f) {
 	try {
-		wait(success(f));
+		T t = wait(f);
+		(void)t;
 	} catch (...) {
 	}
 	return Void();
@@ -1178,7 +1209,8 @@ inline Future<Void> operator&&(Future<Void> const& lhs, Future<Void> const& rhs)
 		return lhs;
 	}
 
-	return waitForAll(std::vector<Future<Void>>{ lhs, rhs });
+	Future<Void> x[] = { lhs, rhs };
+	return quorum(x, 2, 2);
 }
 
 // error || unset -> error
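waitForAllReadyThenThrow() composes the primitives above: waitForAllReady() swallows errors until every input has settled, and the operator+ chain then re-waits the waitForAll() future so the first error (in execution order) is rethrown. A small usage sketch inside a hypothetical actor:

    // Sketch: let every write settle before surfacing the first failure, so no
    // future in 'writes' is still running when the error propagates.
    ACTOR Future<Void> commitAllOrThrow(std::vector<Future<Void>> writes) {
    	wait(waitForAllReadyThenThrow(writes));
    	return Void();
    }

The pointer-based quorum() overload serves the same goal as the operator&& change just above: a two-element stack array avoids the std::vector allocation on this hot path.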
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index cba8b4169c..ba35cb9987 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -309,6 +309,9 @@ if(WITH_PYTHON)
   add_fdb_test(
     TEST_FILES restarting/from_7.2.0/DrUpgradeRestart-1.txt
     restarting/from_7.2.0/DrUpgradeRestart-2.txt)
+  add_fdb_test(
+    TEST_FILES restarting/to_7.2.0/CycleTestRestart-1.toml
+    restarting/to_7.2.0/CycleTestRestart-2.toml)
   add_fdb_test(TEST_FILES slow/ApiCorrectness.toml)
@@ -332,6 +335,7 @@ if(WITH_PYTHON)
   add_fdb_test(TEST_FILES slow/DifferentClustersSameRV.toml)
   add_fdb_test(TEST_FILES slow/DiskFailureCycle.toml)
   add_fdb_test(TEST_FILES slow/FastTriggeredWatches.toml)
+  add_fdb_test(TEST_FILES slow/LongRunning.toml LONG_RUNNING)
   add_fdb_test(TEST_FILES slow/LowLatencyWithFailures.toml)
   add_fdb_test(TEST_FILES slow/MetaclusterManagement.toml)
   add_fdb_test(TEST_FILES slow/MoveKeysClean.toml)
diff --git a/tests/TestRunner/local_cluster.py b/tests/TestRunner/local_cluster.py
index 6375a7a01e..5ef9e0bfa0 100644
--- a/tests/TestRunner/local_cluster.py
+++ b/tests/TestRunner/local_cluster.py
@@ -213,8 +213,12 @@ logdir = {logdir}
                 tls_config=self.tls_conf_string(),
                 authz_public_key_config=self.authz_public_key_conf_string(),
                 optional_tls=":tls" if self.tls_config is not None else "",
-                custom_config='\n'.join(["{} = {}".format(key, value) for key, value in self.custom_config.items()]),
-                use_future_protocol_version="use-future-protocol-version = true" if self.use_future_protocol_version else "",
+                custom_config="\n".join(
+                    ["{} = {}".format(key, value) for key, value in self.custom_config.items()]
+                ),
+                use_future_protocol_version="use-future-protocol-version = true"
+                if self.use_future_protocol_version
+                else "",
             )
         )
         # By default, the cluster only has one process
@@ -534,3 +538,29 @@ logdir = {logdir}
         self.save_config()
         self.wait_for_server_update()
         print("Old servers successfully removed from the cluster. Time: {}s".format(time.time() - start_time))
+
+    # Check the cluster log for errors
+    def check_cluster_logs(self, error_limit=100):
+        sev40s = subprocess.getoutput("grep -r 'Severity=\"40\"' {}".format(self.log.as_posix())).rstrip().splitlines()
+
+        err_cnt = 0
+        for line in sev40s:
+            # When running ASAN we expect to see this message. Boost coroutine should be using the
+            # correct asan annotations so that it shouldn't produce any false positives.
+            if line.endswith(
+                "WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false "
+                "positives in some cases!"
+            ):
+                continue
+            if err_cnt < error_limit:
+                print(line)
+            err_cnt += 1
+
+        if err_cnt > 0:
+            print(
+                ">>>>>>>>>>>>>>>>>>>> Found {} severity 40 events - the test fails".format(
+                    err_cnt)
+            )
+        else:
+            print("No errors found in logs")
+        return err_cnt == 0
diff --git a/tests/TestRunner/upgrade_test.py b/tests/TestRunner/upgrade_test.py
index 8f43e0cddf..3948c56533 100755
--- a/tests/TestRunner/upgrade_test.py
+++ b/tests/TestRunner/upgrade_test.py
@@ -145,8 +145,9 @@ class UpgradeTest:
         self.cluster.fdbmonitor_binary = self.downloader.binary_path(version, "fdbmonitor")
         self.cluster.fdbserver_binary = self.downloader.binary_path(version, "fdbserver")
         self.cluster.fdbcli_binary = self.downloader.binary_path(version, "fdbcli")
-        self.cluster.set_env_var("LD_LIBRARY_PATH", "%s:%s" % (
-            self.downloader.lib_dir(version), os.getenv("LD_LIBRARY_PATH")))
+        self.cluster.set_env_var(
+            "LD_LIBRARY_PATH", "%s:%s" % (self.downloader.lib_dir(version), os.getenv("LD_LIBRARY_PATH"))
+        )
         self.cluster.use_legacy_conf_syntax = version_before(version, "7.1.0")
         self.cluster.use_future_protocol_version = version == FUTURE_VERSION
         self.cluster.save_config()
@@ -325,36 +326,6 @@ class UpgradeTest:
             .splitlines()
         )
 
-    # Check the cluster log for errors
-    def check_cluster_logs(self, error_limit=100):
-        sev40s = (
-            subprocess.getoutput("grep -r 'Severity=\"40\"' {}".format(self.cluster.log.as_posix()))
-            .rstrip()
-            .splitlines()
-        )
-
-        err_cnt = 0
-        for line in sev40s:
-            # When running ASAN we expect to see this message. Boost coroutine should be using the
-            # correct asan annotations so that it shouldn't produce any false positives.
-            if line.endswith(
-                "WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false "
-                "positives in some cases!"
-            ):
-                continue
-            if err_cnt < error_limit:
-                print(line)
-            err_cnt += 1
-
-        if err_cnt > 0:
-            print(
-                ">>>>>>>>>>>>>>>>>>>> Found {} severity 40 events - the test fails",
-                err_cnt,
-            )
-        else:
-            print("No errors found in logs")
-        return err_cnt == 0
-
     # Check the server and client logs for warnings and dump them
     def dump_warnings_in_logs(self, limit=100):
         sev30s = (
@@ -454,7 +425,7 @@ if __name__ == "__main__":
     print("data-dir: {}".format(test.data))
    print("cluster-file: {}".format(test.etc.joinpath("fdb.cluster")))
     errcode = test.exec_test(args)
-    if not test.check_cluster_logs():
+    if not test.cluster.check_cluster_logs():
         errcode = 1 if errcode == 0 else errcode
     test.dump_warnings_in_logs()
     if errcode != 0 and not args.disable_log_dump:
diff --git a/tests/fast/EncryptedBackupCorrectness.toml b/tests/fast/EncryptedBackupCorrectness.toml
index fe2748eb3f..597532737e 100644
--- a/tests/fast/EncryptedBackupCorrectness.toml
+++ b/tests/fast/EncryptedBackupCorrectness.toml
@@ -1,10 +1,12 @@
 [configuration]
-allowDefaultTenant = false
-allowDisablingTenants = false
-tenantModeRequired = true
+allowDefaultTenant = true
+tenantModes = ['required']
 
 [[knobs]]
 enable_encryption = true
+enable_tlog_encryption = true
+enable_storage_server_encryption = false
+enable_blob_granule_encryption = true
 
 [[test]]
 testTitle = 'TenantCreation'
diff --git a/tests/fast/FuzzApiCorrectness.toml b/tests/fast/FuzzApiCorrectness.toml
index 36b8e6812f..c62382480d 100644
--- a/tests/fast/FuzzApiCorrectness.toml
+++ b/tests/fast/FuzzApiCorrectness.toml
@@ -1,7 +1,7 @@
 [configuration]
 StderrSeverity = 30
-allowDisablingTenants = false
 allowDefaultTenant = false
+tenantModes = ['optional', 'required']
 
 [[test]]
 testTitle = 'FuzzApiCorrectness'
diff --git a/tests/fast/FuzzApiCorrectnessClean.toml b/tests/fast/FuzzApiCorrectnessClean.toml
index deddfacce3..50ef4e2d0e 100644
--- a/tests/fast/FuzzApiCorrectnessClean.toml
+++ b/tests/fast/FuzzApiCorrectnessClean.toml
@@ -1,7 +1,7 @@
 [configuration]
 StderrSeverity = 30
-allowDisablingTenants = false
 allowDefaultTenant = false
+tenantModes = ['optional', 'required']
 
 [[test]]
 testTitle = 'FuzzApiCorrectness'
diff --git a/tests/fast/GetEstimatedRangeSize.toml b/tests/fast/GetEstimatedRangeSize.toml
index ef1466daca..a64499d8d0 100644
--- a/tests/fast/GetEstimatedRangeSize.toml
+++ b/tests/fast/GetEstimatedRangeSize.toml
@@ -1,6 +1,6 @@
 [configuration]
 allowDefaultTenant = false
-allowDisablingTenants = false
+tenantModes = ['optional', 'required']
 
 [[test]]
 testTitle = 'TenantCreation'
diff --git a/tests/fast/SpecialKeySpaceCorrectness.toml b/tests/fast/SpecialKeySpaceCorrectness.toml
index fcabb8222e..cb70fe16ad 100644
--- a/tests/fast/SpecialKeySpaceCorrectness.toml
+++ b/tests/fast/SpecialKeySpaceCorrectness.toml
@@ -1,5 +1,5 @@
 [configuration]
-allowDisablingTenants = false
+tenantModes = ['optional', 'required']
 
 [[test]]
 testTitle = 'SpecialKeySpaceCorrectnessTest'
diff --git a/tests/fast/TenantCycle.toml b/tests/fast/TenantCycle.toml
index 9566ab7a6e..3805acc919 100644
--- a/tests/fast/TenantCycle.toml
+++ b/tests/fast/TenantCycle.toml
@@ -1,6 +1,6 @@
 [configuration]
 allowDefaultTenant = false
-allowDisablingTenants = false
+tenantModes = ['optional', 'required']
 
 [[test]]
 testTitle = 'TenantCreation'
diff --git a/tests/fast/TenantCycleTokenless.toml b/tests/fast/TenantCycleTokenless.toml
index 6e192c1f7f..ec76713496 100644
--- a/tests/fast/TenantCycleTokenless.toml
+++ b/tests/fast/TenantCycleTokenless.toml
@@ -1,6 +1,6 @@
 [configuration]
 allowDefaultTenant = false
-allowDisablingTenants = false
+tenantModes = ['optional', 'required']
 
 [[knobs]]
 allow_tokenless_tenant_access = true
diff --git a/tests/fast/TenantEntryCache.toml b/tests/fast/TenantEntryCache.toml
index c60423ef89..3128f256aa 100644
--- a/tests/fast/TenantEntryCache.toml
+++ b/tests/fast/TenantEntryCache.toml
@@ -1,6 +1,6 @@
 [configuration]
 allowDefaultTenant = false
-allowDisablingTenants = false
+tenantModes = ['disabled', 'optional']
 
 [[test]]
 testTitle = 'TenantEntryCacheTest'
diff --git a/tests/restarting/to_7.1.0/CycleTestRestart-1.toml b/tests/restarting/to_7.1.0/CycleTestRestart-1.toml
index d74ea9ab5d..276657451d 100644
--- a/tests/restarting/to_7.1.0/CycleTestRestart-1.toml
+++ b/tests/restarting/to_7.1.0/CycleTestRestart-1.toml
@@ -4,6 +4,7 @@ maxTLogVersion = 6
 disableTss = true
 disableHostname = true
 disableEncryption = true
+allowDefaultTenant = false
 
 [[knobs]]
 # This can be removed once the lower bound of this downgrade test is a version that understands the new protocol
diff --git a/tests/restarting/to_7.1.0/CycleTestRestart-2.toml b/tests/restarting/to_7.1.0/CycleTestRestart-2.toml
index 05571e0606..7c54883745 100644
--- a/tests/restarting/to_7.1.0/CycleTestRestart-2.toml
+++ b/tests/restarting/to_7.1.0/CycleTestRestart-2.toml
@@ -1,6 +1,7 @@
 [configuration]
 maxTLogVersion = 6
 disableTss = true
+allowDefaultTenant = false
 
 [[test]]
 testTitle = 'Clogged'
diff --git a/tests/restarting/to_7.2.0/CycleTestRestart-1.toml b/tests/restarting/to_7.2.0/CycleTestRestart-1.toml
new file mode 100644
index 0000000000..d74ea9ab5d
--- /dev/null
+++ b/tests/restarting/to_7.2.0/CycleTestRestart-1.toml
@@ -0,0 +1,49 @@
+[configuration]
+storageEngineExcludeTypes = [3]
+maxTLogVersion = 6
+disableTss = true
+disableHostname = true
+disableEncryption = true
+
+[[knobs]]
+# This can be removed once the lower bound of this downgrade test is a version that understands the new protocol
+shard_encode_location_metadata = false
+
+[[test]]
+testTitle = 'Clogged'
+clearAfterTest = false
+
+    [[test.workload]]
+    testName = 'Cycle'
+    transactionsPerSecond = 500.0
+    nodeCount = 2500
+    testDuration = 10.0
+    expectedRate = 0
+
+    [[test.workload]]
+    testName = 'RandomClogging'
+    testDuration = 10.0
+
+    [[test.workload]]
+    testName = 'Rollback'
+    meanDelay = 10.0
+    testDuration = 10.0
+
+    [[test.workload]]
+    testName = 'Attrition'
+    machinesToKill = 10
+    machinesToLeave = 3
+    reboot = true
+    testDuration = 10.0
+
+    [[test.workload]]
+    testName = 'Attrition'
+    machinesToKill = 10
+    machinesToLeave = 3
+    reboot = true
+    testDuration = 10.0
+
+    [[test.workload]]
+    testName = 'SaveAndKill'
+    restartInfoLocation = 'simfdb/restartInfo.ini'
+    testDuration = 10.0
diff --git a/tests/restarting/to_7.2.0/CycleTestRestart-2.toml b/tests/restarting/to_7.2.0/CycleTestRestart-2.toml
new file mode 100644
index 0000000000..05571e0606
--- /dev/null
+++ b/tests/restarting/to_7.2.0/CycleTestRestart-2.toml
@@ -0,0 +1,36 @@
+[configuration]
+maxTLogVersion = 6
+disableTss = true
+
+[[test]]
+testTitle = 'Clogged'
+runSetup = false
+
+    [[test.workload]]
+    testName = 'Cycle'
+    transactionsPerSecond = 2500.0
+    nodeCount = 2500
+    testDuration = 10.0
+    expectedRate = 0
+
+    [[test.workload]]
+    testName = 'RandomClogging'
+    testDuration = 10.0
+
+    [[test.workload]]
+    testName = 'Rollback'
+    meanDelay = 10.0
+    testDuration = 10.0
+
+    [[test.workload]]
+    testName = 'Attrition'
+    machinesToKill = 10
+    machinesToLeave = 3
+    reboot = true
+    testDuration = 10.0
+
+    [[test.workload]]
+    testName = 'Attrition'
+    machinesToKill = 10
+    machinesToLeave = 3
+    reboot = true
diff --git a/tests/slow/BlobGranuleCorrectness.toml b/tests/slow/BlobGranuleCorrectness.toml
index 00aa86b07d..e89c7bded5 100644
--- a/tests/slow/BlobGranuleCorrectness.toml
+++ b/tests/slow/BlobGranuleCorrectness.toml
@@ -1,7 +1,7 @@
 [configuration]
 blobGranulesEnabled = true
 allowDefaultTenant = false
-allowDisablingTenants = false
+tenantModes = ['optional', 'required']
 injectTargetedSSRestart = true
 injectSSDelay = true
 # FIXME: re-enable rocks at some point
diff --git a/tests/slow/BlobGranuleCorrectnessClean.toml b/tests/slow/BlobGranuleCorrectnessClean.toml
index 832e3d2b86..a42830bb33 100644
--- a/tests/slow/BlobGranuleCorrectnessClean.toml
+++ b/tests/slow/BlobGranuleCorrectnessClean.toml
@@ -1,7 +1,7 @@
 [configuration]
 blobGranulesEnabled = true
 allowDefaultTenant = false
-allowDisablingTenants = false
+tenantModes = ['optional', 'required']
 # FIXME: re-enable rocks at some point
 storageEngineExcludeTypes = [4, 5]
diff --git a/tests/slow/LongRunning.toml b/tests/slow/LongRunning.toml
new file mode 100644
index 0000000000..d4188a5aec
--- /dev/null
+++ b/tests/slow/LongRunning.toml
@@ -0,0 +1,12 @@
+[[test]]
+testTitle = 'CycleTestWithKills'
+
+    [[test.workload]]
+    testName = 'Cycle'
+    transactionsPerSecond = 2500.0
+    testDuration = 10000.0
+    expectedRate = 0
+
+    [[test.workload]]
+    testName = 'Attrition'
+    testDuration = 10000.0
diff --git a/tests/slow/MetaclusterManagement.toml b/tests/slow/MetaclusterManagement.toml
index 5e8db254d7..aa5cea6ed7 100644
--- a/tests/slow/MetaclusterManagement.toml
+++ b/tests/slow/MetaclusterManagement.toml
@@ -1,6 +1,6 @@
 [configuration]
 allowDefaultTenant = false
-allowDisablingTenants = false
+tenantModes = ['optional', 'required']
 allowCreatingTenants = false
 extraDatabaseMode = 'Multiple'
 extraDatabaseCount = 5
diff --git a/tests/slow/SwizzledTenantManagement.toml b/tests/slow/SwizzledTenantManagement.toml
index 8a8731915a..fab8fc2b1c 100644
--- a/tests/slow/SwizzledTenantManagement.toml
+++ b/tests/slow/SwizzledTenantManagement.toml
@@ -1,6 +1,6 @@
 [configuration]
 allowDefaultTenant = false
-allowDisablingTenants = false
+tenantModes = ['optional', 'required']
 
 [[test]]
 testTitle = 'TenantManagementTest'
diff --git a/tests/slow/SwizzledTenantManagementMetacluster.toml b/tests/slow/SwizzledTenantManagementMetacluster.toml
index af3a40d639..c4365ca252 100644
--- a/tests/slow/SwizzledTenantManagementMetacluster.toml
+++ b/tests/slow/SwizzledTenantManagementMetacluster.toml
@@ -1,6 +1,6 @@
 [configuration]
 allowDefaultTenant = false
-allowDisablingTenants = false
+tenantModes = ['optional', 'required']
 allowCreatingTenants = false
 extraDatabaseMode = 'Single'
diff --git a/tests/slow/TenantManagement.toml b/tests/slow/TenantManagement.toml
index 023c826153..0bda2aaf0f 100644
--- a/tests/slow/TenantManagement.toml
+++ b/tests/slow/TenantManagement.toml
@@ -1,6 +1,6 @@
 [configuration]
 allowDefaultTenant = false
-allowDisablingTenants = false
+tenantModes = ['optional', 'required']
 allowCreatingTenants = false
 extraDatabaseMode = 'Single'
diff --git a/tests/slow/TenantManagementConcurrency.toml b/tests/slow/TenantManagementConcurrency.toml
index 3b04d76586..425c67ac68 100644
--- a/tests/slow/TenantManagementConcurrency.toml
+++ b/tests/slow/TenantManagementConcurrency.toml
@@ -1,6 +1,6 @@
 [configuration]
 allowDefaultTenant = false
-allowDisablingTenants = false
+tenantModes = ['optional', 'required']
 allowCreatingTenants = false
 extraDatabaseMode = 'Single'