diff --git a/.flake8 b/.flake8 index 85f960bc4e..4bdbe2a245 100644 --- a/.flake8 +++ b/.flake8 @@ -1,5 +1,5 @@ [flake8] -ignore = E203, E266, E501, W503, F403, F401, E711, C901, W605 -max-line-length = 79 +ignore = E203, E266, E501, W503, F403, F401, E711, C901, E721, W605 +max-line-length = 88 max-complexity = 18 select = B,C,E,F,W,T4,B9 \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 86cdb6f702..3652be013c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,6 +110,12 @@ set(FDB_PACKAGE_NAME "${FDB_MAJOR}.${FDB_MINOR}") configure_file(${CMAKE_SOURCE_DIR}/versions.target.cmake ${CMAKE_CURRENT_BINARY_DIR}/versions.target) file(WRITE ${CMAKE_BINARY_DIR}/version.txt ${FDB_VERSION}) +set(FDB_CURRENT_VERSION ${PROJECT_VERSION}) +set(FDB_FUTURE_VERSION "7.4.0") +set(FDB_PREV_RELEASE_VERSION "7.1.25") +set(FDB_PREV2_RELEASE_VERSION "7.0.0") +set(FDB_PREV3_RELEASE_VERSION "6.3.25") + ################################################################################ # Flow ################################################################################ diff --git a/bindings/bindingtester/tests/api.py b/bindings/bindingtester/tests/api.py index f2afaace1f..60817f73d2 100644 --- a/bindings/bindingtester/tests/api.py +++ b/bindings/bindingtester/tests/api.py @@ -154,6 +154,8 @@ class ApiTest(Test): snapshot_reads = [x + '_SNAPSHOT' for x in reads] database_reads = [x + '_DATABASE' for x in reads] database_mutations = [x + '_DATABASE' for x in mutations] + tenant_reads = [x + '_TENANT' for x in reads] + tenant_mutations = [x + '_TENANT' for x in mutations] mutations += ['VERSIONSTAMP'] versions = ['GET_READ_VERSION', 'SET_READ_VERSION', 'GET_COMMITTED_VERSION'] snapshot_versions = ['GET_READ_VERSION_SNAPSHOT'] @@ -183,6 +185,8 @@ class ApiTest(Test): if not args.no_tenants: op_choices += tenants + op_choices += tenant_reads + op_choices += tenant_mutations idempotent_atomic_ops = ['BIT_AND', 'BIT_OR', 'MAX', 'MIN', 'BYTE_MIN', 'BYTE_MAX'] 
atomic_ops = idempotent_atomic_ops + ['ADD', 'BIT_XOR', 'APPEND_IF_FITS'] diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index 01959eb048..760a94e1a4 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -283,7 +283,8 @@ if(NOT WIN32) foreach(test_file ${API_TEST_FILES}) get_filename_component(file_name "${test_file}" NAME_WE) set(test_name "fdb_c_api_test_${file_name}") - add_test(NAME "${test_name}" + add_scripted_fdb_test(NAME "${test_name}" + TIMEOUT 300 COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py --build-dir ${CMAKE_BINARY_DIR} --api-tester-bin $ @@ -291,99 +292,87 @@ if(NOT WIN32) --test-file ${test_file} --retain-client-lib-copies ) - set_tests_properties("${test_name}" PROPERTIES TIMEOUT 300) endforeach() - add_test(NAME fdb_c_upgrade_to_future_version + add_scripted_fdb_test(NAME fdb_c_upgrade_to_future_version COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py --build-dir ${CMAKE_BINARY_DIR} --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.3.0" "7.4.0" "7.3.0" + --upgrade-path "${FDB_CURRENT_VERSION}" "${FDB_FUTURE_VERSION}" "${FDB_CURRENT_VERSION}" --process-number 3 ) - set_tests_properties("fdb_c_upgrade_to_future_version" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") - add_test(NAME fdb_c_upgrade_to_future_version_blob_granules + add_scripted_fdb_test(NAME fdb_c_upgrade_to_future_version_blob_granules COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py --build-dir ${CMAKE_BINARY_DIR} --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml - --upgrade-path "7.3.0" "7.4.0" "7.3.0" + --upgrade-path "${FDB_CURRENT_VERSION}" "${FDB_FUTURE_VERSION}" "${FDB_CURRENT_VERSION}" --blob-granules-enabled --process-number 3 ) if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT USE_SANITIZER) - add_test(NAME fdb_c_client_config_tests + 
add_scripted_fdb_test(NAME fdb_c_client_config_tests COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/test/fdb_c_client_config_tests.py --build-dir ${CMAKE_BINARY_DIR} --client-config-tester-bin $ ) - add_test(NAME fdb_c_upgrade_single_threaded_630api - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml - --upgrade-path "6.3.23" "7.0.0" "7.1.9" "7.3.0" - --process-number 1 - ) - - add_test(NAME fdb_c_upgrade_single_threaded_700api - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml - --upgrade-path "7.0.0" "7.1.9" "7.3.0" - --process-number 1 - ) - - add_test(NAME fdb_c_upgrade_multi_threaded_630api + add_scripted_fdb_test(NAME fdb_c_upgrade_from_prev3_gradual COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py --build-dir ${CMAKE_BINARY_DIR} --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "6.3.23" "7.0.0" "7.1.9" "7.3.0" "7.1.9" + --upgrade-path "${FDB_PREV3_RELEASE_VERSION}" "${FDB_PREV2_RELEASE_VERSION}" "${FDB_PREV_RELEASE_VERSION}" "${FDB_CURRENT_VERSION}" "${FDB_PREV_RELEASE_VERSION}" --process-number 3 ) - add_test(NAME fdb_c_upgrade_multi_threaded_700api + add_scripted_fdb_test(NAME fdb_c_upgrade_from_prev3_direct COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py --build-dir ${CMAKE_BINARY_DIR} --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.0.0" "7.1.9" "7.3.0" "7.1.9" + --upgrade-path "${FDB_PREV3_RELEASE_VERSION}" "${FDB_CURRENT_VERSION}" "${FDB_PREV_RELEASE_VERSION}" --process-number 3 ) - add_test(NAME fdb_c_upgrade_multi_threaded_710api + add_scripted_fdb_test(NAME fdb_c_upgrade_from_prev2_gradual COMMAND 
${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py --build-dir ${CMAKE_BINARY_DIR} --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.1.9" "7.3.0" "7.1.9" + --upgrade-path "${FDB_PREV2_RELEASE_VERSION}" "${FDB_PREV_RELEASE_VERSION}" "${FDB_CURRENT_VERSION}" "${FDB_PREV_RELEASE_VERSION}" --process-number 3 ) - add_test(NAME fdb_c_cluster_wiggle + add_scripted_fdb_test(NAME fdb_c_upgrade_from_prev2_direct COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py --build-dir ${CMAKE_BINARY_DIR} --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.3.0" "wiggle" + --upgrade-path "${FDB_PREV2_RELEASE_VERSION}" "${FDB_CURRENT_VERSION}" "${FDB_PREV_RELEASE_VERSION}" + --process-number 3 + ) + + add_scripted_fdb_test(NAME fdb_c_upgrade_from_prev + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "${FDB_PREV_RELEASE_VERSION}" "${FDB_CURRENT_VERSION}" "${FDB_PREV_RELEASE_VERSION}" + --process-number 3 + ) + + add_scripted_fdb_test(NAME fdb_c_wiggle_only + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "${FDB_CURRENT_VERSION}" "wiggle" --disable-log-dump --process-number 3 --redundancy double ) - add_test(NAME fdb_c_wiggle_and_upgrade_latest + add_scripted_fdb_test(NAME fdb_c_wiggle_and_upgrade COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py --build-dir ${CMAKE_BINARY_DIR} --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.1.9" "wiggle" "7.3.0" - --disable-log-dump - --process-number 3 - --redundancy double - ) - - 
add_test(NAME fdb_c_wiggle_and_upgrade_63 - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "6.3.24" "wiggle" "7.0.0" + --upgrade-path "${FDB_PREV_RELEASE_VERSION}" "wiggle" "${FDB_CURRENT_VERSION}" --disable-log-dump --process-number 3 --redundancy double @@ -470,7 +459,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-santizer target_link_libraries(fdb_c_shim_lib_tester PRIVATE fdb_c_shim SimpleOpt fdb_cpp Threads::Threads) target_include_directories(fdb_c_shim_lib_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include) - add_test(NAME fdb_c_shim_library_tests + add_scripted_fdb_test(NAME fdb_c_shim_library_tests COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/test/fdb_c_shim_tests.py --build-dir ${CMAKE_BINARY_DIR} --unit-tests-bin $ diff --git a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml b/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml deleted file mode 100644 index daf070b31b..0000000000 --- a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml +++ /dev/null @@ -1,43 +0,0 @@ -[[test]] -title = 'Mixed Workload for Upgrade Tests with a Single FDB Thread' -multiThreaded = false -buggify = true -databasePerTransaction = false -minDatabases = 2 -maxDatabases = 8 -minClientThreads = 2 -maxClientThreads = 8 -minClients = 2 -maxClients = 8 - - [[test.workload]] - name = 'ApiCorrectness' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - runUntilStop = true - readExistingKeysRatio = 0.9 - - [[test.workload]] - name = 'CancelTransaction' - minKeyLength = 1 - maxKeyLength = 64 - minValueLength = 1 - maxValueLength = 1000 - maxKeysPerTransaction = 50 - initialSize = 100 - 
runUntilStop = true - readExistingKeysRatio = 0.9 - - [[test.workload]] - name = 'AtomicOpsCorrectness' - initialSize = 0 - runUntilStop = true - - [[test.workload]] - name = 'WatchAndWait' - initialSize = 0 - runUntilStop = true \ No newline at end of file diff --git a/bindings/c/test/fdb_c_client_config_tests.py b/bindings/c/test/fdb_c_client_config_tests.py index 845493d425..a8a2a2e846 100644 --- a/bindings/c/test/fdb_c_client_config_tests.py +++ b/bindings/c/test/fdb_c_client_config_tests.py @@ -7,16 +7,9 @@ import sys import os import glob import unittest - -sys.path[:0] = [os.path.join(os.path.dirname(__file__), "..", "..", "..", "tests", "TestRunner")] - -# fmt: off -from binary_download import FdbBinaryDownloader, CURRENT_VERSION +from fdb_version import CURRENT_VERSION, PREV_RELEASE_VERSION, PREV2_RELEASE_VERSION +from binary_download import FdbBinaryDownloader from local_cluster import LocalCluster, random_secret_string -# fmt: on - -PREV_RELEASE_VERSION = "7.1.5" -PREV_PREV_RELEASE_VERSION = "7.0.0" args = None downloader = None @@ -180,15 +173,15 @@ class ClientConfigTests(unittest.TestCase): def test_multiple_external_clients(self): # Multiple external clients, normal case test = ClientConfigTest(self) - test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION]) + test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV2_RELEASE_VERSION]) test.disable_local_client = True - test.api_version = api_version_from_str(PREV_PREV_RELEASE_VERSION) + test.api_version = api_version_from_str(PREV2_RELEASE_VERSION) test.exec() def test_no_external_client_support_api_version(self): # Multiple external clients, API version supported by none of them test = ClientConfigTest(self) - test.create_external_lib_dir([PREV_PREV_RELEASE_VERSION, PREV_RELEASE_VERSION]) + test.create_external_lib_dir([PREV2_RELEASE_VERSION, PREV_RELEASE_VERSION]) test.disable_local_client = True test.api_version = 
api_version_from_str(CURRENT_VERSION) test.expected_error = 2204 # API function missing @@ -197,7 +190,7 @@ class ClientConfigTests(unittest.TestCase): def test_no_external_client_support_api_version_ignore(self): # Multiple external clients; API version supported by none of them; Ignore failures test = ClientConfigTest(self) - test.create_external_lib_dir([PREV_PREV_RELEASE_VERSION, PREV_RELEASE_VERSION]) + test.create_external_lib_dir([PREV2_RELEASE_VERSION, PREV_RELEASE_VERSION]) test.disable_local_client = True test.api_version = api_version_from_str(CURRENT_VERSION) test.ignore_external_client_failures = True @@ -207,7 +200,7 @@ class ClientConfigTests(unittest.TestCase): def test_one_external_client_wrong_api_version(self): # Multiple external clients, API version unsupported by one of othem test = ClientConfigTest(self) - test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION]) + test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV2_RELEASE_VERSION]) test.disable_local_client = True test.api_version = api_version_from_str(CURRENT_VERSION) test.expected_error = 2204 # API function missing @@ -216,7 +209,7 @@ class ClientConfigTests(unittest.TestCase): def test_one_external_client_wrong_api_version_ignore(self): # Multiple external clients; API version unsupported by one of them; Ignore failures test = ClientConfigTest(self) - test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION]) + test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV2_RELEASE_VERSION]) test.disable_local_client = True test.api_version = api_version_from_str(CURRENT_VERSION) test.ignore_external_client_failures = True @@ -286,6 +279,6 @@ if __name__ == "__main__": downloader = FdbBinaryDownloader(args.build_dir) downloader.download_old_binaries(PREV_RELEASE_VERSION) - downloader.download_old_binaries(PREV_PREV_RELEASE_VERSION) + 
downloader.download_old_binaries(PREV2_RELEASE_VERSION) unittest.main(verbosity=2) diff --git a/bindings/c/test/fdb_c_shim_tests.py b/bindings/c/test/fdb_c_shim_tests.py index 479e06d5f0..5f19d742c4 100644 --- a/bindings/c/test/fdb_c_shim_tests.py +++ b/bindings/c/test/fdb_c_shim_tests.py @@ -6,15 +6,10 @@ import shutil import subprocess import sys import os - -sys.path[:0] = [os.path.join(os.path.dirname(__file__), '..', '..', '..', 'tests', 'TestRunner')] - -# fmt: off -from binary_download import FdbBinaryDownloader, CURRENT_VERSION +from binary_download import FdbBinaryDownloader from local_cluster import LocalCluster, random_secret_string -# fmt: on +from fdb_version import CURRENT_VERSION, PREV_RELEASE_VERSION -LAST_RELEASE_VERSION = "7.1.5" TESTER_STATS_INTERVAL_SEC = 5 DEFAULT_TEST_FILE = "CApiCorrectnessMultiThr.toml" IMPLIBSO_ERROR_CODE = -6 # SIGABORT @@ -54,13 +49,12 @@ class TestEnv(LocalCluster): self.downloader.binary_path(version, "fdbserver"), self.downloader.binary_path(version, "fdbmonitor"), self.downloader.binary_path(version, "fdbcli"), - 1 + 1, ) self.set_env_var("LD_LIBRARY_PATH", self.downloader.lib_dir(version)) client_lib = self.downloader.lib_path(version) assert client_lib.exists(), "{} does not exist".format(client_lib) - self.client_lib_external = self.tmp_dir.joinpath( - "libfdb_c_external.so") + self.client_lib_external = self.tmp_dir.joinpath("libfdb_c_external.so") shutil.copyfile(client_lib, self.client_lib_external) def __enter__(self): @@ -73,22 +67,16 @@ class TestEnv(LocalCluster): shutil.rmtree(self.tmp_dir) def exec_client_command(self, cmd_args, env_vars=None, expected_ret_code=0): - print("Executing test command: {}".format( - " ".join([str(c) for c in cmd_args]) - )) - tester_proc = subprocess.Popen( - cmd_args, stdout=sys.stdout, stderr=sys.stderr, env=env_vars - ) + print("Executing test command: {}".format(" ".join([str(c) for c in cmd_args]))) + tester_proc = subprocess.Popen(cmd_args, stdout=sys.stdout, 
stderr=sys.stderr, env=env_vars) tester_retcode = tester_proc.wait() assert tester_retcode == expected_ret_code, "Tester completed return code {}, but {} was expected".format( - tester_retcode, expected_ret_code) + tester_retcode, expected_ret_code + ) class FdbCShimTests: - def __init__( - self, - args - ): + def __init__(self, args): self.build_dir = Path(args.build_dir).resolve() assert self.build_dir.exists(), "{} does not exist".format(args.build_dir) assert self.build_dir.is_dir(), "{} is not a directory".format(args.build_dir) @@ -97,15 +85,14 @@ class FdbCShimTests: self.api_tester_bin = Path(args.api_tester_bin).resolve() assert self.api_tester_bin.exists(), "{} does not exist".format(self.api_tests_bin) self.shim_lib_tester_bin = Path(args.shim_lib_tester_bin).resolve() - assert self.shim_lib_tester_bin.exists( - ), "{} does not exist".format(self.shim_lib_tester_bin) + assert self.shim_lib_tester_bin.exists(), "{} does not exist".format(self.shim_lib_tester_bin) self.api_test_dir = Path(args.api_test_dir).resolve() assert self.api_test_dir.exists(), "{} does not exist".format(self.api_test_dir) self.downloader = FdbBinaryDownloader(args.build_dir) # binary downloads are currently available only for x86_64 self.platform = platform.machine() - if (self.platform == "x86_64"): - self.downloader.download_old_binaries(LAST_RELEASE_VERSION) + if self.platform == "x86_64": + self.downloader.download_old_binaries(PREV_RELEASE_VERSION) self.downloader.download_old_binaries("7.0.0") def build_c_api_tester_args(self, test_env, test_file): @@ -127,34 +114,27 @@ class FdbCShimTests: "--tmp-dir", test_env.tmp_dir, "--stats-interval", - str(TESTER_STATS_INTERVAL_SEC * 1000) + str(TESTER_STATS_INTERVAL_SEC * 1000), ] def run_c_api_test(self, version, test_file): - print('-' * 80) + print("-" * 80) print("C API Test - version: {}, workload: {}".format(version, test_file)) - print('-' * 80) + print("-" * 80) with TestEnv(self.build_dir, self.downloader, version) as 
test_env: cmd_args = self.build_c_api_tester_args(test_env, test_file) env_vars = os.environ.copy() - env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = self.downloader.lib_path( - version) + env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = self.downloader.lib_path(version) test_env.exec_client_command(cmd_args, env_vars) def run_c_unit_tests(self, version): - print('-' * 80) + print("-" * 80) print("C Unit Tests - version: {}".format(version)) - print('-' * 80) + print("-" * 80) with TestEnv(self.build_dir, self.downloader, version) as test_env: - cmd_args = [ - self.unit_tests_bin, - test_env.cluster_file, - "fdb", - test_env.client_lib_external - ] + cmd_args = [self.unit_tests_bin, test_env.cluster_file, "fdb", test_env.client_lib_external] env_vars = os.environ.copy() - env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = self.downloader.lib_path( - version) + env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = self.downloader.lib_path(version) test_env.exec_client_command(cmd_args, env_vars) def run_c_shim_lib_tester( @@ -167,9 +147,9 @@ class FdbCShimTests: set_env_path=False, set_ld_lib_path=False, use_external_lib=True, - expected_ret_code=0 + expected_ret_code=0, ): - print('-' * 80) + print("-" * 80) if api_version is None: api_version = api_version_from_str(version) test_flags = [] @@ -183,9 +163,8 @@ class FdbCShimTests: test_flags.append("use_external_lib") else: test_flags.append("use_local_lib") - print("C Shim Tests - version: {}, API version: {}, {}".format(version, - api_version, ", ".join(test_flags))) - print('-' * 80) + print("C Shim Tests - version: {}, API version: {}, {}".format(version, api_version, ", ".join(test_flags))) + print("-" * 80) cmd_args = [ self.shim_lib_tester_bin, "--cluster-file", @@ -196,20 +175,16 @@ class FdbCShimTests: if call_set_path: cmd_args = cmd_args + [ "--local-client-library", - ("dummy" if invalid_lib_path else self.downloader.lib_path(version)) + ("dummy" if invalid_lib_path else self.downloader.lib_path(version)), ] if use_external_lib: 
- cmd_args = cmd_args + [ - "--disable-local-client", - "--external-client-library", - test_env.client_lib_external - ] + cmd_args = cmd_args + ["--disable-local-client", "--external-client-library", test_env.client_lib_external] env_vars = os.environ.copy() - env_vars["LD_LIBRARY_PATH"] = ( - self.downloader.lib_dir(version) if set_ld_lib_path else "") + env_vars["LD_LIBRARY_PATH"] = self.downloader.lib_dir(version) if set_ld_lib_path else "" if set_env_path: env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = ( - "dummy" if invalid_lib_path else self.downloader.lib_path(version)) + "dummy" if invalid_lib_path else self.downloader.lib_path(version) + ) test_env.exec_client_command(cmd_args, env_vars, expected_ret_code) def run_tests(self): @@ -221,50 +196,60 @@ class FdbCShimTests: with TestEnv(self.build_dir, self.downloader, CURRENT_VERSION) as test_env: # Test lookup of the client library over LD_LIBRARY_PATH - self.run_c_shim_lib_tester( - CURRENT_VERSION, test_env, set_ld_lib_path=True) + self.run_c_shim_lib_tester(CURRENT_VERSION, test_env, set_ld_lib_path=True) # Test setting the client library path over an API call - self.run_c_shim_lib_tester( - CURRENT_VERSION, test_env, call_set_path=True) + self.run_c_shim_lib_tester(CURRENT_VERSION, test_env, call_set_path=True) # Test setting the client library path over an environment variable - self.run_c_shim_lib_tester( - CURRENT_VERSION, test_env, set_env_path=True) + self.run_c_shim_lib_tester(CURRENT_VERSION, test_env, set_env_path=True) # Test using the loaded client library as the local client - self.run_c_shim_lib_tester( - CURRENT_VERSION, test_env, call_set_path=True, use_external_lib=False) + self.run_c_shim_lib_tester(CURRENT_VERSION, test_env, call_set_path=True, use_external_lib=False) # Test setting an invalid client library path over an API call self.run_c_shim_lib_tester( - CURRENT_VERSION, test_env, call_set_path=True, invalid_lib_path=True, expected_ret_code=IMPLIBSO_ERROR_CODE) + CURRENT_VERSION, + 
test_env, + call_set_path=True, + invalid_lib_path=True, + expected_ret_code=IMPLIBSO_ERROR_CODE, + ) # Test setting an invalid client library path over an environment variable self.run_c_shim_lib_tester( - CURRENT_VERSION, test_env, set_env_path=True, invalid_lib_path=True, expected_ret_code=IMPLIBSO_ERROR_CODE) + CURRENT_VERSION, + test_env, + set_env_path=True, + invalid_lib_path=True, + expected_ret_code=IMPLIBSO_ERROR_CODE, + ) # Test calling a function that exists in the loaded library, but not for the selected API version - self.run_c_shim_lib_tester( - CURRENT_VERSION, test_env, call_set_path=True, api_version=700) + self.run_c_shim_lib_tester(CURRENT_VERSION, test_env, call_set_path=True, api_version=700) # binary downloads are currently available only for x86_64 if self.platform == "x86_64": # Test the API workload with the release version - self.run_c_api_test(LAST_RELEASE_VERSION, DEFAULT_TEST_FILE) + self.run_c_api_test(PREV_RELEASE_VERSION, DEFAULT_TEST_FILE) - with TestEnv(self.build_dir, self.downloader, LAST_RELEASE_VERSION) as test_env: + with TestEnv(self.build_dir, self.downloader, PREV_RELEASE_VERSION) as test_env: # Test using the loaded client library as the local client - self.run_c_shim_lib_tester( - LAST_RELEASE_VERSION, test_env, call_set_path=True, use_external_lib=False) + self.run_c_shim_lib_tester(PREV_RELEASE_VERSION, test_env, call_set_path=True, use_external_lib=False) # Test the client library of the release version in combination with the dev API version self.run_c_shim_lib_tester( - LAST_RELEASE_VERSION, test_env, call_set_path=True, api_version=api_version_from_str(CURRENT_VERSION), expected_ret_code=1) + PREV_RELEASE_VERSION, + test_env, + call_set_path=True, + api_version=api_version_from_str(CURRENT_VERSION), + expected_ret_code=1, + ) # Test calling a function that does not exist in the loaded library self.run_c_shim_lib_tester( - "7.0.0", test_env, call_set_path=True, api_version=700, expected_ret_code=IMPLIBSO_ERROR_CODE) 
+ "7.0.0", test_env, call_set_path=True, api_version=700, expected_ret_code=IMPLIBSO_ERROR_CODE + ) if __name__ == "__main__": @@ -285,25 +270,17 @@ if __name__ == "__main__": required=True, ) parser.add_argument( - '--unit-tests-bin', - type=str, - help='Path to the fdb_c_shim_unit_tests executable.', - required=True) + "--unit-tests-bin", type=str, help="Path to the fdb_c_shim_unit_tests executable.", required=True + ) parser.add_argument( - '--api-tester-bin', - type=str, - help='Path to the fdb_c_shim_api_tester executable.', - required=True) + "--api-tester-bin", type=str, help="Path to the fdb_c_shim_api_tester executable.", required=True + ) parser.add_argument( - '--shim-lib-tester-bin', - type=str, - help='Path to the fdb_c_shim_lib_tester executable.', - required=True) + "--shim-lib-tester-bin", type=str, help="Path to the fdb_c_shim_lib_tester executable.", required=True + ) parser.add_argument( - '--api-test-dir', - type=str, - help='Path to a directory with api test definitions.', - required=True) + "--api-test-dir", type=str, help="Path to a directory with api test definitions.", required=True + ) args = parser.parse_args() test = FdbCShimTests(args) test.run_tests() diff --git a/bindings/go/src/fdb/database.go b/bindings/go/src/fdb/database.go index d13e3db551..a4329fb99c 100644 --- a/bindings/go/src/fdb/database.go +++ b/bindings/go/src/fdb/database.go @@ -42,6 +42,8 @@ import ( // usually created and committed automatically by the (Database).Transact // method. type Database struct { + // String reference to the cluster file. + clusterFile string *database } @@ -56,6 +58,16 @@ type DatabaseOptions struct { d *database } +// Close will close the Database and clean up all resources. +// You have to ensure that you're not resuing this database. 
+func (d *Database) Close() { + // Remove database object from the cached databases + delete(openDatabases, d.clusterFile) + + // Destroy the database + d.destroy() +} + func (opt DatabaseOptions) setOpt(code int, param []byte) error { return setOpt(func(p *C.uint8_t, pl C.int) C.fdb_error_t { return C.fdb_database_set_option(opt.d.ptr, C.FDBDatabaseOption(code), p, pl) @@ -63,6 +75,10 @@ func (opt DatabaseOptions) setOpt(code int, param []byte) error { } func (d *database) destroy() { + if d.ptr == nil { + return + } + C.fdb_database_destroy(d.ptr) } diff --git a/bindings/go/src/fdb/fdb.go b/bindings/go/src/fdb/fdb.go index e308049be0..518ad545e0 100644 --- a/bindings/go/src/fdb/fdb.go +++ b/bindings/go/src/fdb/fdb.go @@ -39,6 +39,7 @@ import ( // Would put this in futures.go but for the documented issue with // exports and functions in preamble // (https://code.google.com/p/go-wiki/wiki/cgo#Global_functions) +// //export unlockMutex func unlockMutex(p unsafe.Pointer) { m := (*sync.Mutex)(p) @@ -337,7 +338,7 @@ func createDatabase(clusterFile string) (Database, error) { db := &database{outdb} runtime.SetFinalizer(db, (*database).destroy) - return Database{db}, nil + return Database{clusterFile, db}, nil } // Deprecated: Use OpenDatabase instead. 
diff --git a/bindings/go/src/fdb/fdb_test.go b/bindings/go/src/fdb/fdb_test.go index 00b3f41304..a46785730f 100644 --- a/bindings/go/src/fdb/fdb_test.go +++ b/bindings/go/src/fdb/fdb_test.go @@ -48,7 +48,10 @@ func ExampleOpenDefault() { return } - _ = db + // Close the database after usage + defer db.Close() + + // Do work here // Output: } @@ -313,3 +316,30 @@ func ExamplePrintable() { fmt.Println(fdb.Printable([]byte{0, 1, 2, 'a', 'b', 'c', '1', '2', '3', '!', '?', 255})) // Output: \x00\x01\x02abc123!?\xff } + +func TestDatabaseCloseRemovesResources(t *testing.T) { + err := fdb.APIVersion(API_VERSION) + if err != nil { + t.Fatalf("Unable to set API version: %v\n", err) + } + + // OpenDefault opens the database described by the platform-specific default + // cluster file + db, err := fdb.OpenDefault() + if err != nil { + t.Fatalf("Unable to set API version: %v\n", err) + } + + // Close the database after usage + db.Close() + + // Open the same database again, if the database is still in the cache we would return the same object, if not we create a new object with a new pointer + newDB, err := fdb.OpenDefault() + if err != nil { + t.Fatalf("Unable to set API version: %v\n", err) + } + + if db == newDB { + t.Fatalf("Expected a different database object, got: %v and %v\n", db, newDB) + } +} diff --git a/bindings/python/fdb/__init__.py b/bindings/python/fdb/__init__.py index 930ad35396..39c30848c3 100644 --- a/bindings/python/fdb/__init__.py +++ b/bindings/python/fdb/__init__.py @@ -25,14 +25,14 @@ https://apple.github.io/foundationdb/api-python.html""" def open(*args, **kwargs): - raise RuntimeError('You must call api_version() before using any fdb methods') + raise RuntimeError("You must call api_version() before using any fdb methods") init = open def transactional(*args, **kwargs): - raise RuntimeError('You must call api_version() before using fdb.transactional') + raise RuntimeError("You must call api_version() before using fdb.transactional") def 
_add_symbols(module, symbols): @@ -41,29 +41,29 @@ def _add_symbols(module, symbols): def is_api_version_selected(): - return '_version' in globals() + return "_version" in globals() def get_api_version(): if is_api_version_selected(): - return globals()['_version'] + return globals()["_version"] else: - raise RuntimeError('API version is not set') + raise RuntimeError("API version is not set") def api_version(ver): header_version = 720 - if '_version' in globals(): - if globals()['_version'] != ver: - raise RuntimeError('FDB API already loaded at version %d' % _version) + if "_version" in globals(): + if globals()["_version"] != ver: + raise RuntimeError("FDB API already loaded at version %d" % _version) return if ver < 13: - raise RuntimeError('FDB API versions before 13 are not supported') + raise RuntimeError("FDB API versions before 13 are not supported") if ver > header_version: - raise RuntimeError('Latest known FDB API version is %d' % header_version) + raise RuntimeError("Latest known FDB API version is %d" % header_version) import fdb.impl @@ -71,31 +71,37 @@ def api_version(ver): if err == 2203: # api_version_not_supported, but that's not helpful to the user max_supported_ver = fdb.impl._capi.fdb_get_max_api_version() if header_version > max_supported_ver: - raise RuntimeError("This version of the FoundationDB Python binding is not supported by the installed " - "FoundationDB C library. The binding requires a library that supports API version " - "%d, but the installed library supports a maximum version of %d." % (header_version, max_supported_ver)) + raise RuntimeError( + "This version of the FoundationDB Python binding is not supported by the installed " + "FoundationDB C library. The binding requires a library that supports API version " + "%d, but the installed library supports a maximum version of %d." + % (header_version, max_supported_ver) + ) else: - raise RuntimeError("API version %d is not supported by the installed FoundationDB C library." 
% ver) + raise RuntimeError( + "API version %d is not supported by the installed FoundationDB C library." + % ver + ) elif err != 0: - raise RuntimeError('FoundationDB API error') + raise RuntimeError("FoundationDB API error") fdb.impl.init_c_api() list = ( - 'FDBError', - 'predicates', - 'Future', - 'Database', - 'Tenant', - 'Transaction', - 'KeyValue', - 'KeySelector', - 'open', - 'transactional', - 'options', - 'StreamingMode', + "FDBError", + "predicates", + "Future", + "Database", + "Tenant", + "Transaction", + "KeyValue", + "KeySelector", + "open", + "transactional", + "options", + "StreamingMode", ) _add_symbols(fdb.impl, list) @@ -134,14 +140,20 @@ def api_version(ver): if not hasattr(self, "__iterating"): self.__iterating = iter(self) return next(self.__iterating) + setattr(fdb.impl.FDBRange, "next", next) - globals()['_version'] = ver + globals()["_version"] = ver import fdb.directory_impl - directory_symbols = ('directory', 'DirectoryLayer',) + + directory_symbols = ( + "directory", + "DirectoryLayer", + ) _add_symbols(fdb.directory_impl, directory_symbols) import fdb.subspace_impl - subspace_symbols = ('Subspace',) + + subspace_symbols = ("Subspace",) _add_symbols(fdb.subspace_impl, subspace_symbols) diff --git a/bindings/python/fdb/directory_impl.py b/bindings/python/fdb/directory_impl.py index f8c4f8d89d..dac4c703ea 100755 --- a/bindings/python/fdb/directory_impl.py +++ b/bindings/python/fdb/directory_impl.py @@ -35,8 +35,7 @@ class AllocatorTransactionState: self.lock = threading.Lock() -class HighContentionAllocator (object): - +class HighContentionAllocator(object): def __init__(self, subspace): self.counters = subspace[0] self.recent = subspace[1] @@ -45,9 +44,9 @@ class HighContentionAllocator (object): @_impl.transactional def allocate(self, tr): """Returns a byte string that - 1) has never and will never be returned by another call to this - method on the same subspace - 2) is nearly as short as possible given the above + 1) has never and will 
never be returned by another call to this + method on the same subspace + 2) is nearly as short as possible given the above """ # Get transaction-local state @@ -59,16 +58,23 @@ class HighContentionAllocator (object): tr_state = tr.__fdb_directory_layer_hca_state__ while True: - [start] = [self.counters.unpack(k)[0] for k, _ in tr.snapshot.get_range( - self.counters.range().start, self.counters.range().stop, limit=1, reverse=True)] or [0] + [start] = [ + self.counters.unpack(k)[0] + for k, _ in tr.snapshot.get_range( + self.counters.range().start, + self.counters.range().stop, + limit=1, + reverse=True, + ) + ] or [0] window_advanced = False while True: with tr_state.lock: if window_advanced: - del tr[self.counters: self.counters[start]] + del tr[self.counters : self.counters[start]] tr.options.set_next_write_no_write_conflict_range() - del tr[self.recent: self.recent[start]] + del tr[self.recent : self.recent[start]] # Increment the allocation count for the current window tr.add(self.counters[start], struct.pack(" 0 and latest_counter[0] > start: @@ -121,7 +132,7 @@ class HighContentionAllocator (object): class Directory(object): - def __init__(self, directory_layer, path=(), layer=b''): + def __init__(self, directory_layer, path=(), layer=b""): self._directory_layer = directory_layer self._path = path self._layer = layer @@ -129,7 +140,9 @@ class Directory(object): @_impl.transactional def create_or_open(self, tr, path, layer=None): path = self._tuplify_path(path) - return self._directory_layer.create_or_open(tr, self._partition_subpath(path), layer) + return self._directory_layer.create_or_open( + tr, self._partition_subpath(path), layer + ) @_impl.transactional def open(self, tr, path, layer=None): @@ -139,7 +152,9 @@ class Directory(object): @_impl.transactional def create(self, tr, path, layer=None, prefix=None): path = self._tuplify_path(path) - return self._directory_layer.create(tr, self._partition_subpath(path), layer, prefix) + return 
self._directory_layer.create( + tr, self._partition_subpath(path), layer, prefix + ) @_impl.transactional def list(self, tr, path=()): @@ -150,7 +165,9 @@ class Directory(object): def move(self, tr, old_path, new_path): old_path = self._tuplify_path(old_path) new_path = self._tuplify_path(new_path) - return self._directory_layer.move(tr, self._partition_subpath(old_path), self._partition_subpath(new_path)) + return self._directory_layer.move( + tr, self._partition_subpath(old_path), self._partition_subpath(new_path) + ) @_impl.transactional def move_to(self, tr, new_absolute_path): @@ -161,25 +178,33 @@ class Directory(object): if partition_path != directory_layer._path: raise ValueError("Cannot move between partitions.") - return directory_layer.move(tr, self._path[partition_len:], new_absolute_path[partition_len:]) + return directory_layer.move( + tr, self._path[partition_len:], new_absolute_path[partition_len:] + ) @_impl.transactional def remove(self, tr, path=()): path = self._tuplify_path(path) directory_layer = self._get_layer_for_path(path) - return directory_layer.remove(tr, self._partition_subpath(path, directory_layer)) + return directory_layer.remove( + tr, self._partition_subpath(path, directory_layer) + ) @_impl.transactional def remove_if_exists(self, tr, path=()): path = self._tuplify_path(path) directory_layer = self._get_layer_for_path(path) - return directory_layer.remove_if_exists(tr, self._partition_subpath(path, directory_layer)) + return directory_layer.remove_if_exists( + tr, self._partition_subpath(path, directory_layer) + ) @_impl.transactional def exists(self, tr, path=()): path = self._tuplify_path(path) directory_layer = self._get_layer_for_path(path) - return directory_layer.exists(tr, self._partition_subpath(path, directory_layer)) + return directory_layer.exists( + tr, self._partition_subpath(path, directory_layer) + ) def get_layer(self): return self._layer @@ -194,7 +219,7 @@ class Directory(object): def _partition_subpath(self, 
path, directory_layer=None): directory_layer = directory_layer or self._directory_layer - return self._path[len(directory_layer._path):] + path + return self._path[len(directory_layer._path) :] + path # Called by all functions that could operate on this subspace directly (move_to, remove, remove_if_exists, exists) # Subclasses can choose to return a different directory layer to use for the operation if path is in fact () @@ -203,8 +228,12 @@ class Directory(object): class DirectoryLayer(Directory): - - def __init__(self, node_subspace=Subspace(rawPrefix=b'\xfe'), content_subspace=Subspace(), allow_manual_prefixes=False): + def __init__( + self, + node_subspace=Subspace(rawPrefix=b"\xfe"), + content_subspace=Subspace(), + allow_manual_prefixes=False, + ): Directory.__init__(self, self) # If specified, new automatically allocated prefixes will all fall within content_subspace @@ -215,11 +244,11 @@ class DirectoryLayer(Directory): # The root node is the one whose contents are the node subspace self._root_node = self._node_subspace[self._node_subspace.key()] - self._allocator = HighContentionAllocator(self._root_node[b'hca']) + self._allocator = HighContentionAllocator(self._root_node[b"hca"]) @_impl.transactional def create_or_open(self, tr, path, layer=None): - """ Opens the directory with the given path. + """Opens the directory with the given path. If the directory does not exist, it is created (creating parent directories if necessary). 
@@ -229,12 +258,16 @@ class DirectoryLayer(Directory): """ return self._create_or_open_internal(tr, path, layer) - def _create_or_open_internal(self, tr, path, layer=None, prefix=None, allow_create=True, allow_open=True): + def _create_or_open_internal( + self, tr, path, layer=None, prefix=None, allow_create=True, allow_open=True + ): self._check_version(tr, write_access=False) if prefix is not None and not self._allow_manual_prefixes: if len(self._path) == 0: - raise ValueError("Cannot specify a prefix unless manual prefixes are enabled.") + raise ValueError( + "Cannot specify a prefix unless manual prefixes are enabled." + ) else: raise ValueError("Cannot specify a prefix in a partition.") @@ -248,7 +281,9 @@ class DirectoryLayer(Directory): if existing_node.exists(): if existing_node.is_in_partition(): subpath = existing_node.get_partition_subpath() - return existing_node.get_contents(self)._directory_layer._create_or_open_internal( + return existing_node.get_contents( + self + )._directory_layer._create_or_open_internal( tr, subpath, layer, prefix, allow_create, allow_open ) @@ -256,7 +291,9 @@ class DirectoryLayer(Directory): raise ValueError("The directory already exists.") if layer and existing_node.layer() != layer: - raise ValueError("The directory was created with an incompatible layer.") + raise ValueError( + "The directory was created with an incompatible layer." + ) return existing_node.get_contents(self) @@ -269,16 +306,23 @@ class DirectoryLayer(Directory): prefix = self._content_subspace.key() + self._allocator.allocate(tr) if len(list(tr.get_range_startswith(prefix, limit=1))) > 0: - raise Exception("The database has keys stored at the prefix chosen by the automatic prefix allocator: %r." % prefix) + raise Exception( + "The database has keys stored at the prefix chosen by the automatic prefix allocator: %r." 
+ % prefix + ) if not self._is_prefix_free(tr.snapshot, prefix): - raise Exception("The directory layer has manually allocated prefixes that conflict with the automatic prefix allocator.") + raise Exception( + "The directory layer has manually allocated prefixes that conflict with the automatic prefix allocator." + ) elif not self._is_prefix_free(tr, prefix): raise ValueError("The given prefix is already in use.") if len(path) > 1: - parent_node = self._node_with_prefix(self.create_or_open(tr, path[:-1]).key()) + parent_node = self._node_with_prefix( + self.create_or_open(tr, path[:-1]).key() + ) else: parent_node = self._root_node if not parent_node: @@ -288,15 +332,15 @@ class DirectoryLayer(Directory): node = self._node_with_prefix(prefix) tr[parent_node[self.SUBDIRS][path[-1]]] = prefix if not layer: - layer = b'' + layer = b"" - tr[node[b'layer']] = layer + tr[node[b"layer"]] = layer return self._contents_of_node(node, path, layer) @_impl.transactional def open(self, tr, path, layer=None): - """ Opens the directory with the given path. + """Opens the directory with the given path. An error is raised if the directory does not exist, or if a layer is specified and a different layer was specified when the directory was @@ -321,7 +365,7 @@ class DirectoryLayer(Directory): @_impl.transactional def move_to(self, tr, new_absolute_path): - raise Exception('The root directory cannot be moved.') + raise Exception("The root directory cannot be moved.") @_impl.transactional def move(self, tr, old_path, new_path): @@ -339,8 +383,10 @@ class DirectoryLayer(Directory): old_path = _to_unicode_path(old_path) new_path = _to_unicode_path(new_path) - if old_path == new_path[:len(old_path)]: - raise ValueError("The destination directory cannot be a subdirectory of the source directory.") + if old_path == new_path[: len(old_path)]: + raise ValueError( + "The destination directory cannot be a subdirectory of the source directory." 
+ ) old_node = self._find(tr, old_path).prefetch_metadata(tr) new_node = self._find(tr, new_path).prefetch_metadata(tr) @@ -349,18 +395,30 @@ class DirectoryLayer(Directory): raise ValueError("The source directory does not exist.") if old_node.is_in_partition() or new_node.is_in_partition(): - if not old_node.is_in_partition() or not new_node.is_in_partition() or old_node.path != new_node.path: + if ( + not old_node.is_in_partition() + or not new_node.is_in_partition() + or old_node.path != new_node.path + ): raise ValueError("Cannot move between partitions.") - return new_node.get_contents(self).move(tr, old_node.get_partition_subpath(), new_node.get_partition_subpath()) + return new_node.get_contents(self).move( + tr, old_node.get_partition_subpath(), new_node.get_partition_subpath() + ) if new_node.exists(): - raise ValueError("The destination directory already exists. Remove it first.") + raise ValueError( + "The destination directory already exists. Remove it first." + ) parent_node = self._find(tr, new_path[:-1]) if not parent_node.exists(): - raise ValueError("The parent of the destination directory does not exist. Create it first.") - tr[parent_node.subspace[self.SUBDIRS][new_path[-1]]] = self._node_subspace.unpack(old_node.subspace.key())[0] + raise ValueError( + "The parent of the destination directory does not exist. Create it first." 
+ ) + tr[ + parent_node.subspace[self.SUBDIRS][new_path[-1]] + ] = self._node_subspace.unpack(old_node.subspace.key())[0] self._remove_from_parent(tr, old_path) return self._contents_of_node(old_node.subspace, new_path, old_node.layer()) @@ -400,7 +458,9 @@ class DirectoryLayer(Directory): return False if node.is_in_partition(): - return node.get_contents(self)._directory_layer._remove_internal(tr, node.get_partition_subpath(), fail_on_nonexistent) + return node.get_contents(self)._directory_layer._remove_internal( + tr, node.get_partition_subpath(), fail_on_nonexistent + ) self._remove_recursive(tr, node.subspace) self._remove_from_parent(tr, path) @@ -447,7 +507,7 @@ class DirectoryLayer(Directory): VERSION = (1, 0, 0) def _check_version(self, tr, write_access=True): - version = tr[self._root_node[b'version']] + version = tr[self._root_node[b"version"]] if not version.present(): if write_access: @@ -455,16 +515,22 @@ class DirectoryLayer(Directory): return - version = struct.unpack(' self.VERSION[0]: - raise Exception("Cannot load directory with version %d.%d.%d using directory layer %d.%d.%d" % (version + self.VERSION)) + raise Exception( + "Cannot load directory with version %d.%d.%d using directory layer %d.%d.%d" + % (version + self.VERSION) + ) if version[1] > self.VERSION[1] and write_access: - raise Exception("Directory with version %d.%d.%d is read-only when opened using directory layer %d.%d.%d" % (version + self.VERSION)) + raise Exception( + "Directory with version %d.%d.%d is read-only when opened using directory layer %d.%d.%d" + % (version + self.VERSION) + ) def _initialize_directory(self, tr): - tr[self._root_node[b'version']] = struct.pack(' len(self.path)) + return ( + self.exists() + and self.layer(tr) == b"partition" + and (include_empty_subpath or len(self.target_path) > len(self.path)) + ) def get_partition_subpath(self): - return self.target_path[len(self.path):] + return self.target_path[len(self.path) :] def get_contents(self, 
directory_layer, tr=None): - return directory_layer._contents_of_node(self.subspace, self.path, self.layer(tr)) + return directory_layer._contents_of_node( + self.subspace, self.path, self.layer(tr) + ) diff --git a/bindings/python/fdb/impl.py b/bindings/python/fdb/impl.py index aa967ba25d..37f364eb13 100644 --- a/bindings/python/fdb/impl.py +++ b/bindings/python/fdb/impl.py @@ -32,10 +32,17 @@ import sys import threading import traceback +import weakref import fdb from fdb import six from fdb.tuple import pack, unpack +from fdb import fdboptions as _opts +import types +import struct + +import atexit + _network_thread = None _network_thread_reentrant_lock = threading.RLock() @@ -43,8 +50,6 @@ _open_file = open _thread_local_storage = threading.local() -import weakref - class _NetworkOptions(object): def __init__(self, parent): @@ -66,19 +71,16 @@ class _TransactionOptions(object): self._parent = weakref.proxy(tr) -from fdb import fdboptions as _opts -import types -import struct - - def remove_prefix(text, prefix): if text.startswith(prefix): - return text[len(prefix):] + return text[len(prefix) :] return text + def option_wrap(code): def setfunc(self): self._parent._set_option(code, None, 0) + return setfunc @@ -86,6 +88,7 @@ def option_wrap_string(code): def setfunc(self, param=None): param, length = optionalParamToBytes(param) self._parent._set_option(code, param, length) + return setfunc @@ -96,25 +99,29 @@ def option_wrap_bytes(code): elif isinstance(param, bytes): self._parent._set_option(code, param, len(param)) else: - raise TypeError('Value must be of type ' + bytes.__name__) + raise TypeError("Value must be of type " + bytes.__name__) + return setfunc def option_wrap_int(code): def setfunc(self, param): self._parent._set_option(code, struct.pack(">> import fdb ; fdb.api_version(720) # the code above uses @transactional before the API version is set if fdb.get_api_version() >= 630 and inspect.isgeneratorfunction(func): - raise ValueError("Generators can not 
be wrapped with fdb.transactional") + raise ValueError( + "Generators can not be wrapped with fdb.transactional" + ) if isinstance(args[index], TransactionRead): return func(*args, **kwargs) @@ -281,7 +302,9 @@ def transactional(*tr_args, **tr_kwargs): try: ret = func(*largs, **kwargs) if fdb.get_api_version() >= 630 and inspect.isgenerator(ret): - raise ValueError("Generators can not be wrapped with fdb.transactional") + raise ValueError( + "Generators can not be wrapped with fdb.transactional" + ) tr.commit().wait() committed = True except FDBError as e: @@ -292,12 +315,13 @@ def transactional(*tr_args, **tr_kwargs): # elapsed = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / float(10**6) # if elapsed >= 1: # td = now - start - # print ('fdb WARNING: long transaction (%gs elapsed in transactional function \'%s\' (%d retries, %s))' - # % (elapsed, func.__name__, retries, committed and 'committed' or 'not yet committed')) + # print ("fdb WARNING: long transaction (%gs elapsed in transactional function \"%s\" (%d retries, %s))" + # % (elapsed, func.__name__, retries, committed and "committed" or "not yet committed")) # last = now # retries += 1 return ret + return wrapper if not tr_args: @@ -308,7 +332,7 @@ def transactional(*tr_args, **tr_kwargs): # Being called as a decorator return decorate(tr_args[0]) else: - raise Exception('Invalid use of transactional decorator.') + raise Exception("Invalid use of transactional decorator.") class FDBError(Exception): @@ -330,10 +354,10 @@ class FDBError(Exception): return self._description def __str__(self): - return '%s (%d)' % (self.description, self.code) + return "%s (%d)" % (self.description, self.code) def __repr__(self): - return 'FDBError(%d)' % self.code + return "FDBError(%d)" % self.code class _FDBBase(object): @@ -358,7 +382,9 @@ class FDBRange(object): self._reverse = reverse self._mode = streaming_mode - self._future = self._tr._get_range(begin, end, limit, streaming_mode, 1, reverse) + 
self._future = self._tr._get_range( + begin, end, limit, streaming_mode, 1, reverse + ) def to_list(self): if self._mode == StreamingMode.iterator: @@ -406,7 +432,9 @@ class FDBRange(object): esel = KeySelector.first_greater_or_equal(kvs[-1].key) else: bsel = KeySelector.first_greater_than(kvs[-1].key) - future = self._tr._get_range(bsel, esel, limit, mode, iteration, self._reverse) + future = self._tr._get_range( + bsel, esel, limit, mode, iteration, self._reverse + ) yield result @@ -418,7 +446,7 @@ class TransactionRead(_FDBBase): self._snapshot = snapshot def __del__(self): - # print('Destroying transactionread 0x%x' % self.tpointer) + # print("Destroying transactionread 0x%x" % self.tpointer) self.capi.fdb_transaction_destroy(self.tpointer) def get_read_version(self): @@ -427,13 +455,23 @@ class TransactionRead(_FDBBase): def get(self, key): key = keyToBytes(key) - return Value(self.capi.fdb_transaction_get(self.tpointer, key, len(key), self._snapshot)) + return Value( + self.capi.fdb_transaction_get(self.tpointer, key, len(key), self._snapshot) + ) def get_key(self, key_selector): key = keyToBytes(key_selector.key) - return Key(self.capi.fdb_transaction_get_key( - self.tpointer, key, len(key), key_selector.or_equal, key_selector.offset, self._snapshot)) + return Key( + self.capi.fdb_transaction_get_key( + self.tpointer, + key, + len(key), + key_selector.or_equal, + key_selector.offset, + self._snapshot, + ) + ) def _get_range(self, begin, end, limit, streaming_mode, iteration, reverse): beginKey = keyToBytes(begin.key) @@ -441,20 +479,36 @@ class TransactionRead(_FDBBase): return FutureKeyValueArray( self.capi.fdb_transaction_get_range( - self.tpointer, beginKey, len(beginKey), begin.or_equal, begin.offset, - endKey, len(endKey), end.or_equal, end.offset, - limit, 0, streaming_mode, iteration, self._snapshot, reverse)) + self.tpointer, + beginKey, + len(beginKey), + begin.or_equal, + begin.offset, + endKey, + len(endKey), + end.or_equal, + end.offset, + 
limit, + 0, + streaming_mode, + iteration, + self._snapshot, + reverse, + ) + ) def _to_selector(self, key_or_selector): if not isinstance(key_or_selector, KeySelector): key_or_selector = KeySelector.first_greater_or_equal(key_or_selector) return key_or_selector - def get_range(self, begin, end, limit=0, reverse=False, streaming_mode=StreamingMode.iterator): + def get_range( + self, begin, end, limit=0, reverse=False, streaming_mode=StreamingMode.iterator + ): if begin is None: - begin = b'' + begin = b"" if end is None: - end = b'\xff' + end = b"\xff" begin = self._to_selector(begin) end = self._to_selector(end) return FDBRange(self, begin, end, limit, reverse, streaming_mode) @@ -467,36 +521,39 @@ class TransactionRead(_FDBBase): if isinstance(key, slice): return self.get_range(key.start, key.stop, reverse=(key.step == -1)) return self.get(key) - + def get_estimated_range_size_bytes(self, begin_key, end_key): if begin_key is None or end_key is None: if fdb.get_api_version() >= 700: - raise Exception('Invalid begin key or end key') + raise Exception("Invalid begin key or end key") else: if begin_key is None: - begin_key = b'' + begin_key = b"" if end_key is None: - end_key = b'\xff' - return FutureInt64(self.capi.fdb_transaction_get_estimated_range_size_bytes( - self.tpointer, - begin_key, len(begin_key), - end_key, len(end_key) - )) - + end_key = b"\xff" + return FutureInt64( + self.capi.fdb_transaction_get_estimated_range_size_bytes( + self.tpointer, begin_key, len(begin_key), end_key, len(end_key) + ) + ) + def get_range_split_points(self, begin_key, end_key, chunk_size): - if begin_key is None or end_key is None or chunk_size <=0: - raise Exception('Invalid begin key, end key or chunk size') - return FutureKeyArray(self.capi.fdb_transaction_get_range_split_points( - self.tpointer, - begin_key, len(begin_key), - end_key, len(end_key), - chunk_size - )) + if begin_key is None or end_key is None or chunk_size <= 0: + raise Exception("Invalid begin key, end key or 
chunk size") + return FutureKeyArray( + self.capi.fdb_transaction_get_range_split_points( + self.tpointer, + begin_key, + len(begin_key), + end_key, + len(end_key), + chunk_size, + ) + ) + class Transaction(TransactionRead): - """A modifiable snapshot of a Database. - - """ + """A modifiable snapshot of a Database.""" def __init__(self, tpointer, db): super(Transaction, self).__init__(tpointer, db, False) @@ -518,7 +575,9 @@ class Transaction(TransactionRead): paramLength = len(paramBytes) keyBytes = keyToBytes(key) keyLength = len(keyBytes) - self.capi.fdb_transaction_atomic_op(self.tpointer, keyBytes, keyLength, paramBytes, paramLength, opcode) + self.capi.fdb_transaction_atomic_op( + self.tpointer, keyBytes, keyLength, paramBytes, paramLength, opcode + ) def set(self, key, value): key = keyToBytes(key) @@ -535,9 +594,9 @@ class Transaction(TransactionRead): def clear_range(self, begin, end): if begin is None: - begin = b'' + begin = b"" if end is None: - end = b'\xff' + end = b"\xff" if isinstance(begin, KeySelector): begin = self.get_key(begin) if isinstance(end, KeySelector): @@ -546,8 +605,9 @@ class Transaction(TransactionRead): begin = keyToBytes(begin) end = keyToBytes(end) - self.capi.fdb_transaction_clear_range(self.tpointer, begin, len(begin), - end, len(end)) + self.capi.fdb_transaction_clear_range( + self.tpointer, begin, len(begin), end, len(end) + ) def clear_range_startswith(self, prefix): prefix = keyToBytes(prefix) @@ -560,32 +620,40 @@ class Transaction(TransactionRead): def add_read_conflict_range(self, begin, end): begin = keyToBytes(begin) end = keyToBytes(end) - self.capi.fdb_transaction_add_conflict_range(self.tpointer, begin, len(begin), end, len(end), ConflictRangeType.read) + self.capi.fdb_transaction_add_conflict_range( + self.tpointer, begin, len(begin), end, len(end), ConflictRangeType.read + ) def add_read_conflict_key(self, key): key = keyToBytes(key) - self.add_read_conflict_range(key, key + b'\x00') + 
self.add_read_conflict_range(key, key + b"\x00") def add_write_conflict_range(self, begin, end): begin = keyToBytes(begin) end = keyToBytes(end) - self.capi.fdb_transaction_add_conflict_range(self.tpointer, begin, len(begin), end, len(end), ConflictRangeType.write) + self.capi.fdb_transaction_add_conflict_range( + self.tpointer, begin, len(begin), end, len(end), ConflictRangeType.write + ) def add_write_conflict_key(self, key): key = keyToBytes(key) - self.add_write_conflict_range(key, key + b'\x00') + self.add_write_conflict_range(key, key + b"\x00") def commit(self): return FutureVoid(self.capi.fdb_transaction_commit(self.tpointer)) def get_committed_version(self): version = ctypes.c_int64() - self.capi.fdb_transaction_get_committed_version(self.tpointer, ctypes.byref(version)) + self.capi.fdb_transaction_get_committed_version( + self.tpointer, ctypes.byref(version) + ) return version.value def get_approximate_size(self): """Get the approximate commit size of the transaction.""" - return FutureInt64(self.capi.fdb_transaction_get_approximate_size(self.tpointer)) + return FutureInt64( + self.capi.fdb_transaction_get_approximate_size(self.tpointer) + ) def get_versionstamp(self): return Key(self.capi.fdb_transaction_get_versionstamp(self.tpointer)) @@ -620,12 +688,12 @@ class Future(_FDBBase): _state = None # < Hack for trollius def __init__(self, fpointer): - # print('Creating future 0x%x' % fpointer) + # print("Creating future 0x%x" % fpointer) self.fpointer = fpointer def __del__(self): if self.fpointer: - # print('Destroying future 0x%x' % self.fpointer) + # print("Destroying future 0x%x" % self.fpointer) self.capi.fdb_future_destroy(self.fpointer) self.fpointer = None @@ -647,9 +715,9 @@ class Future(_FDBBase): if not self.is_ready(): # Blocking in the native client from the main thread prevents Python from handling signals. # To avoid that behavior, we implement the blocking in Python using semaphores and on_ready. 
- # Using a Semaphore is faster than an Event, and we create only one per thread to avoid the + # Using a Semaphore is faster than an Event, and we create only one per thread to avoid the # cost of creating one every time. - semaphore = getattr(_thread_local_storage, 'future_block_semaphore', None) + semaphore = getattr(_thread_local_storage, "future_block_semaphore", None) if semaphore is None: semaphore = multiprocessing.Semaphore(0) _thread_local_storage.future_block_semaphore = semaphore @@ -658,10 +726,12 @@ class Future(_FDBBase): try: semaphore.acquire() - except: + except Exception: # If this semaphore didn't actually get released, then we need to replace our thread-local # copy so that later callers still function correctly - _thread_local_storage.future_block_semaphore = multiprocessing.Semaphore(0) + _thread_local_storage.future_block_semaphore = ( + multiprocessing.Semaphore(0) + ) raise def on_ready(self, callback): @@ -670,12 +740,15 @@ class Future(_FDBBase): del cbfunc[:] try: callback(self) - except: + except Exception: try: - sys.stderr.write("Discarding uncaught exception from user FDB callback:\n") + sys.stderr.write( + "Discarding uncaught exception from user FDB callback:\n" + ) traceback.print_exception(*sys.exc_info(), file=sys.stderr) - except: + except Exception: pass + cbfunc = [_CBFUNC(cb_and_delref)] del cb_and_delref _pin_callback(cbfunc[0]) @@ -690,19 +763,22 @@ class Future(_FDBBase): d = {} ev = futures[0].Event() for i, f in enumerate(futures): + def cb(ignore, i=i): - if d.setdefault('i', i) == i: + if d.setdefault("i", i) == i: ev.set() + f.on_ready(cb) ev.wait() - return d['i'] + return d["i"] # asyncio future protocol def cancelled(self): if not self.done(): return False e = self.exception() - return getattr(e, 'code', 0) == 1101 + return getattr(e, "code", 0) == 1101 + done = is_ready def result(self): @@ -740,6 +816,7 @@ class FutureInt64(Future): self.capi.fdb_future_get_int64(self.fpointer, ctypes.byref(value)) return 
value.value + class FutureUInt64(Future): def wait(self): self.block_until_ready() @@ -747,28 +824,43 @@ class FutureUInt64(Future): self.capi.fdb_future_get_uint64(self.fpointer, ctypes.byref(value)) return value.value + class FutureKeyValueArray(Future): def wait(self): self.block_until_ready() kvs = ctypes.pointer(KeyValueStruct()) count = ctypes.c_int() more = ctypes.c_int() - self.capi.fdb_future_get_keyvalue_array(self.fpointer, ctypes.byref(kvs), ctypes.byref(count), ctypes.byref(more)) - return ([KeyValue(ctypes.string_at(x.key, x.key_length), ctypes.string_at(x.value, x.value_length)) - for x in kvs[0:count.value]], count.value, more.value) + self.capi.fdb_future_get_keyvalue_array( + self.fpointer, ctypes.byref(kvs), ctypes.byref(count), ctypes.byref(more) + ) + return ( + [ + KeyValue( + ctypes.string_at(x.key, x.key_length), + ctypes.string_at(x.value, x.value_length), + ) + for x in kvs[0 : count.value] + ], + count.value, + more.value, + ) # Logically, we should self._release_memory() after extracting the # KVs but before returning, but then we would have to store # the KVs on the python side and in most cases we are about to # destroy the future anyway + class FutureKeyArray(Future): def wait(self): self.block_until_ready() ks = ctypes.pointer(KeyStruct()) count = ctypes.c_int() - self.capi.fdb_future_get_key_array(self.fpointer, ctypes.byref(ks), ctypes.byref(count)) - return [ctypes.string_at(x.key, x.key_length) for x in ks[0:count.value]] + self.capi.fdb_future_get_key_array( + self.fpointer, ctypes.byref(ks), ctypes.byref(count) + ) + return [ctypes.string_at(x.key, x.key_length) for x in ks[0 : count.value]] class FutureStringArray(Future): @@ -776,8 +868,10 @@ class FutureStringArray(Future): self.block_until_ready() strings = ctypes.pointer(ctypes.c_char_p()) count = ctypes.c_int() - self.capi.fdb_future_get_string_array(self.fpointer, ctypes.byref(strings), ctypes.byref(count)) - return list(strings[0:count.value]) + 
self.capi.fdb_future_get_string_array( + self.fpointer, ctypes.byref(strings), ctypes.byref(count) + ) + return list(strings[0 : count.value]) class replaceable_property(object): @@ -807,9 +901,11 @@ class LazyFuture(Future): self._getter() self._release_memory() - except: + except Exception: e = sys.exc_info() - if not (isinstance(e[1], FDBError) and e[1].code == 1102): # future_released + if not ( + isinstance(e[1], FDBError) and e[1].code == 1102 + ): # future_released raise return self.value @@ -827,6 +923,7 @@ class FutureString(LazyFuture): def getclass(self): return bytes + __class__ = property(getclass) def as_foundationdb_key(self): @@ -898,11 +995,17 @@ class FutureString(LazyFuture): def makewrapper(func): def tmpfunc(self, *args): return func(self.value, *args) + return tmpfunc for i in dir(bytes): - if not i.startswith('_') or i in ('__getitem__', '__getslice__', '__hash__', '__len__'): + if not i.startswith("_") or i in ( + "__getitem__", + "__getslice__", + "__hash__", + "__len__", + ): setattr(FutureString, i, makewrapper(getattr(bytes, i))) @@ -911,8 +1014,12 @@ class Value(FutureString): present = ctypes.c_int() value = ctypes.pointer(ctypes.c_byte()) value_length = ctypes.c_int() - self.capi.fdb_future_get_value(self.fpointer, ctypes.byref(present), - ctypes.byref(value), ctypes.byref(value_length)) + self.capi.fdb_future_get_value( + self.fpointer, + ctypes.byref(present), + ctypes.byref(value), + ctypes.byref(value_length), + ) if present.value: self.value = ctypes.string_at(value, value_length.value) else: @@ -926,7 +1033,9 @@ class Key(FutureString): def _getter(self): key = ctypes.pointer(ctypes.c_byte()) key_length = ctypes.c_int() - self.capi.fdb_future_get_key(self.fpointer, ctypes.byref(key), ctypes.byref(key_length)) + self.capi.fdb_future_get_key( + self.fpointer, ctypes.byref(key), ctypes.byref(key_length) + ) self.value = ctypes.string_at(key, key_length.value) @@ -943,13 +1052,16 @@ class FormerFuture(_FDBBase): def on_ready(self, 
callback): try: callback(self) - except: + except Exception: try: - sys.stderr.write("Discarding uncaught exception from user FDB callback:\n") + sys.stderr.write( + "Discarding uncaught exception from user FDB callback:\n" + ) traceback.print_exception(*sys.exc_info(), file=sys.stderr) - except: + except Exception: pass + class _TransactionCreator(_FDBBase): def get(self, key): return _TransactionCreator.__creator_getitem(self, key) @@ -962,11 +1074,17 @@ class _TransactionCreator(_FDBBase): def get_key(self, key_selector): return _TransactionCreator.__creator_get_key(self, key_selector) - def get_range(self, begin, end, limit=0, reverse=False, streaming_mode=StreamingMode.want_all): - return _TransactionCreator.__creator_get_range(self, begin, end, limit, reverse, streaming_mode) + def get_range( + self, begin, end, limit=0, reverse=False, streaming_mode=StreamingMode.want_all + ): + return _TransactionCreator.__creator_get_range( + self, begin, end, limit, reverse, streaming_mode + ) def get_range_startswith(self, prefix, *args, **kwargs): - return _TransactionCreator.__creator_get_range_startswith(self, prefix, *args, **kwargs) + return _TransactionCreator.__creator_get_range_startswith( + self, prefix, *args, **kwargs + ) def set(self, key, value): _TransactionCreator.__creator_setitem(self, key, value) @@ -1087,13 +1205,27 @@ class _TransactionCreator(_FDBBase): @transactional @coroutine def __creator_get_range(tr, begin, end, limit, reverse, streaming_mode): - raise Return((yield From(tr.get_range(begin, end, limit, reverse, streaming_mode).to_list()))) + raise Return( + ( + yield From( + tr.get_range( + begin, end, limit, reverse, streaming_mode + ).to_list() + ) + ) + ) @staticmethod @transactional @coroutine def __creator_get_range_startswith(tr, prefix, *args, **kwargs): - raise Return((yield From(tr.get_range_startswith(prefix, *args, **kwargs).to_list()))) + raise Return( + ( + yield From( + tr.get_range_startswith(prefix, *args, **kwargs).to_list() + 
) + ) + ) @staticmethod @transactional @@ -1150,15 +1282,23 @@ class _TransactionCreator(_FDBBase): tr._atomic_operation(opcode, key, param) raise Return() yield None + return TransactionCreator + def process_tenant_name(name): if isinstance(name, tuple): return pack(name) elif isinstance(name, bytes): return name else: - raise TypeError('Tenant name must be of type ' + bytes.__name__ + ' or of type ' + tuple.__name__) + raise TypeError( + "Tenant name must be of type " + + bytes.__name__ + + " or of type " + + tuple.__name__ + ) + class Database(_TransactionCreator): def __init__(self, dpointer): @@ -1166,7 +1306,7 @@ class Database(_TransactionCreator): self.options = _DatabaseOptions(self) def __del__(self): - # print('Destroying database 0x%x' % self.dpointer) + # print("Destroying database 0x%x" % self.dpointer) self.capi.fdb_database_destroy(self.dpointer) def _set_option(self, option, param, length): @@ -1175,7 +1315,9 @@ class Database(_TransactionCreator): def open_tenant(self, name): tname = process_tenant_name(name) pointer = ctypes.c_void_p() - self.capi.fdb_database_open_tenant(self.dpointer, tname, len(tname), ctypes.byref(pointer)) + self.capi.fdb_database_open_tenant( + self.dpointer, tname, len(tname), ctypes.byref(pointer) + ) return Tenant(pointer.value) def create_transaction(self): @@ -1206,17 +1348,20 @@ class Cluster(_FDBBase): self.options = None def open_database(self, name): - if name != b'DB': - raise FDBError(2013) # invalid_database_name + if name != b"DB": + raise FDBError(2013) # invalid_database_name return create_database(self.cluster_file) def create_database(cluster_file=None): pointer = ctypes.c_void_p() - _FDBBase.capi.fdb_create_database(optionalParamToBytes(cluster_file)[0], ctypes.byref(pointer)) + _FDBBase.capi.fdb_create_database( + optionalParamToBytes(cluster_file)[0], ctypes.byref(pointer) + ) return Database(pointer) + def create_cluster(cluster_file=None): return Cluster(cluster_file) @@ -1250,7 +1395,7 @@ class 
KeySelector(object): return cls(key, False, 1) def __repr__(self): - return 'KeySelector(%r, %r, %r)' % (self.key, self.or_equal, self.offset) + return "KeySelector(%r, %r, %r)" % (self.key, self.or_equal, self.offset) class KVIter(object): @@ -1275,15 +1420,17 @@ class KVIter(object): class KeyValueStruct(ctypes.Structure): - _fields_ = [('key', ctypes.POINTER(ctypes.c_byte)), - ('key_length', ctypes.c_int), - ('value', ctypes.POINTER(ctypes.c_byte)), - ('value_length', ctypes.c_int)] + _fields_ = [ + ("key", ctypes.POINTER(ctypes.c_byte)), + ("key_length", ctypes.c_int), + ("value", ctypes.POINTER(ctypes.c_byte)), + ("value_length", ctypes.c_int), + ] _pack_ = 4 + class KeyStruct(ctypes.Structure): - _fields_ = [('key', ctypes.POINTER(ctypes.c_byte)), - ('key_length', ctypes.c_int)] + _fields_ = [("key", ctypes.POINTER(ctypes.c_byte)), ("key_length", ctypes.c_int)] _pack_ = 4 @@ -1293,7 +1440,7 @@ class KeyValue(object): self.value = value def __repr__(self): - return '%s: %s' % (repr(self.key), repr(self.value)) + return "%s: %s" % (repr(self.key), repr(self.value)) def __iter__(self): return KVIter(self) @@ -1305,26 +1452,29 @@ def check_error_code(code, func, arguments): return None -if sys.maxsize <= 2**32: +if sys.maxsize <= 2 ** 32: raise Exception("FoundationDB API requires a 64-bit python interpreter!") -if platform.system() == 'Windows': - capi_name = 'fdb_c.dll' -elif platform.system() == 'Linux': - capi_name = 'libfdb_c.so' -elif platform.system() == 'FreeBSD': - capi_name = 'libfdb_c.so' -elif platform.system() == 'Darwin': - capi_name = 'libfdb_c.dylib' -elif sys.platform == 'win32': - capi_name = 'fdb_c.dll' -elif sys.platform.startswith('cygwin'): - capi_name = 'fdb_c.dll' -elif sys.platform.startswith('linux'): - capi_name = 'libfdb_c.so' -elif sys.platform == 'darwin': - capi_name = 'libfdb_c.dylib' +if platform.system() == "Windows": + capi_name = "fdb_c.dll" +elif platform.system() == "Linux": + capi_name = "libfdb_c.so" +elif platform.system() 
== "FreeBSD": + capi_name = "libfdb_c.so" +elif platform.system() == "Darwin": + capi_name = "libfdb_c.dylib" +elif sys.platform == "win32": + capi_name = "fdb_c.dll" +elif sys.platform.startswith("cygwin"): + capi_name = "fdb_c.dll" +elif sys.platform.startswith("linux"): + capi_name = "libfdb_c.so" +elif sys.platform == "darwin": + capi_name = "libfdb_c.dylib" else: - raise Exception("Platform (%s) %s is not supported by the FoundationDB API!" % (sys.platform, platform.system())) + raise Exception( + "Platform (%s) %s is not supported by the FoundationDB API!" + % (sys.platform, platform.system()) + ) this_dir = os.path.dirname(__file__) @@ -1335,19 +1485,19 @@ this_dir = os.path.dirname(__file__) # Failing that, we try to load the C API library without qualification, and # the library should be on the platform's dynamic library search path def read_pth_file(): - pth_file = os.path.join(this_dir, capi_name + '.pth') + pth_file = os.path.join(this_dir, capi_name + ".pth") if not os.path.exists(pth_file): return None pth = _open_file(pth_file, "rt").read().strip() - if pth[0] != '/': + if pth[0] != "/": pth = os.path.join(this_dir, pth) return pth for pth in [ lambda: os.path.join(this_dir, capi_name), - # lambda: os.path.join(this_dir, '../../lib', capi_name), # For compatibility with existing unix installation process... should be removed - read_pth_file + # lambda: os.path.join(this_dir, "../../lib", capi_name), # For compatibility with existing unix installation process... 
should be removed + read_pth_file, ]: p = pth() if p and os.path.exists(p): @@ -1356,42 +1506,42 @@ for pth in [ else: try: _capi = ctypes.CDLL(capi_name) - except: + except Exception: # The system python on OS X can't find the library installed to /usr/local/lib if SIP is enabled # find_library does find the location in /usr/local/lib, so if the above fails fallback to using it lib_path = ctypes.util.find_library("fdb_c") if lib_path is not None: try: _capi = ctypes.CDLL(lib_path) - except: + except Exception: raise Exception("Unable to locate the FoundationDB API shared library!") else: raise Exception("Unable to locate the FoundationDB API shared library!") def keyToBytes(k): - if hasattr(k, 'as_foundationdb_key'): + if hasattr(k, "as_foundationdb_key"): k = k.as_foundationdb_key() if not isinstance(k, bytes): - raise TypeError('Key must be of type ' + bytes.__name__) + raise TypeError("Key must be of type " + bytes.__name__) return k def valueToBytes(v): - if hasattr(v, 'as_foundationdb_value'): + if hasattr(v, "as_foundationdb_value"): v = v.as_foundationdb_value() if not isinstance(v, bytes): - raise TypeError('Value must be of type ' + bytes.__name__) + raise TypeError("Value must be of type " + bytes.__name__) return v def paramToBytes(v): if isinstance(v, FutureString): v = v.value - if not isinstance(v, bytes) and hasattr(v, 'encode'): - v = v.encode('utf8') + if not isinstance(v, bytes) and hasattr(v, "encode"): + v = v.encode("utf8") if not isinstance(v, bytes): - raise TypeError('Parameter must be a string') + raise TypeError("Parameter must be a string") return v @@ -1406,6 +1556,7 @@ def optionalParamToBytes(v): _FDBBase.capi = _capi _CBFUNC = ctypes.CFUNCTYPE(None, ctypes.c_void_p) + def init_c_api(): _capi.fdb_select_api_version_impl.argtypes = [ctypes.c_int, ctypes.c_int] _capi.fdb_select_api_version_impl.restype = ctypes.c_int @@ -1420,7 +1571,11 @@ def init_c_api(): _capi.fdb_setup_network.restype = ctypes.c_int _capi.fdb_setup_network.errcheck 
= check_error_code - _capi.fdb_network_set_option.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] + _capi.fdb_network_set_option.argtypes = [ + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_int, + ] _capi.fdb_network_set_option.restype = ctypes.c_int _capi.fdb_network_set_option.errcheck = check_error_code @@ -1448,7 +1603,11 @@ def init_c_api(): _capi.fdb_future_is_ready.argtypes = [ctypes.c_void_p] _capi.fdb_future_is_ready.restype = ctypes.c_int - _capi.fdb_future_set_callback.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] + _capi.fdb_future_set_callback.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ] _capi.fdb_future_set_callback.restype = int _capi.fdb_future_set_callback.errcheck = check_error_code @@ -1456,61 +1615,104 @@ def init_c_api(): _capi.fdb_future_get_error.restype = int _capi.fdb_future_get_error.errcheck = check_error_code - _capi.fdb_future_get_int64.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int64)] + _capi.fdb_future_get_int64.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.c_int64), + ] _capi.fdb_future_get_int64.restype = ctypes.c_int _capi.fdb_future_get_int64.errcheck = check_error_code - _capi.fdb_future_get_uint64.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_uint64)] + _capi.fdb_future_get_uint64.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.c_uint64), + ] _capi.fdb_future_get_uint64.restype = ctypes.c_uint _capi.fdb_future_get_uint64.errcheck = check_error_code - _capi.fdb_future_get_key.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.POINTER(ctypes.c_byte)), - ctypes.POINTER(ctypes.c_int)] + _capi.fdb_future_get_key.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.POINTER(ctypes.c_byte)), + ctypes.POINTER(ctypes.c_int), + ] _capi.fdb_future_get_key.restype = ctypes.c_int _capi.fdb_future_get_key.errcheck = check_error_code - _capi.fdb_future_get_value.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), - 
ctypes.POINTER(ctypes.POINTER(ctypes.c_byte)), ctypes.POINTER(ctypes.c_int)] + _capi.fdb_future_get_value.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.c_int), + ctypes.POINTER(ctypes.POINTER(ctypes.c_byte)), + ctypes.POINTER(ctypes.c_int), + ] _capi.fdb_future_get_value.restype = ctypes.c_int _capi.fdb_future_get_value.errcheck = check_error_code - _capi.fdb_future_get_keyvalue_array.argtypes = [ctypes.c_void_p, ctypes.POINTER( - ctypes.POINTER(KeyValueStruct)), ctypes.POINTER(ctypes.c_int), ctypes.POINTER(ctypes.c_int)] + _capi.fdb_future_get_keyvalue_array.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.POINTER(KeyValueStruct)), + ctypes.POINTER(ctypes.c_int), + ctypes.POINTER(ctypes.c_int), + ] _capi.fdb_future_get_keyvalue_array.restype = int _capi.fdb_future_get_keyvalue_array.errcheck = check_error_code - _capi.fdb_future_get_key_array.argtypes = [ctypes.c_void_p, ctypes.POINTER( - ctypes.POINTER(KeyStruct)), ctypes.POINTER(ctypes.c_int)] + _capi.fdb_future_get_key_array.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.POINTER(KeyStruct)), + ctypes.POINTER(ctypes.c_int), + ] _capi.fdb_future_get_key_array.restype = int _capi.fdb_future_get_key_array.errcheck = check_error_code - _capi.fdb_future_get_string_array.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.POINTER(ctypes.c_char_p)), ctypes.POINTER(ctypes.c_int)] + _capi.fdb_future_get_string_array.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.POINTER(ctypes.c_char_p)), + ctypes.POINTER(ctypes.c_int), + ] _capi.fdb_future_get_string_array.restype = int _capi.fdb_future_get_string_array.errcheck = check_error_code - _capi.fdb_create_database.argtypes = [ctypes.c_char_p, ctypes.POINTER(ctypes.c_void_p)] + _capi.fdb_create_database.argtypes = [ + ctypes.c_char_p, + ctypes.POINTER(ctypes.c_void_p), + ] _capi.fdb_create_database.restype = ctypes.c_int _capi.fdb_create_database.errcheck = check_error_code _capi.fdb_database_destroy.argtypes = [ctypes.c_void_p] 
_capi.fdb_database_destroy.restype = None - _capi.fdb_database_open_tenant.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p)] + _capi.fdb_database_open_tenant.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.POINTER(ctypes.c_void_p), + ] _capi.fdb_database_open_tenant.restype = ctypes.c_int _capi.fdb_database_open_tenant.errcheck = check_error_code - _capi.fdb_database_create_transaction.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p)] + _capi.fdb_database_create_transaction.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.c_void_p), + ] _capi.fdb_database_create_transaction.restype = ctypes.c_int _capi.fdb_database_create_transaction.errcheck = check_error_code - _capi.fdb_database_set_option.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] + _capi.fdb_database_set_option.argtypes = [ + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_int, + ] _capi.fdb_database_set_option.restype = ctypes.c_int _capi.fdb_database_set_option.errcheck = check_error_code _capi.fdb_tenant_destroy.argtypes = [ctypes.c_void_p] _capi.fdb_tenant_destroy.restype = None - _capi.fdb_tenant_create_transaction.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p)] + _capi.fdb_tenant_create_transaction.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.c_void_p), + ] _capi.fdb_tenant_create_transaction.restype = ctypes.c_int _capi.fdb_tenant_create_transaction.errcheck = check_error_code @@ -1526,53 +1728,138 @@ def init_c_api(): _capi.fdb_transaction_get_read_version.argtypes = [ctypes.c_void_p] _capi.fdb_transaction_get_read_version.restype = ctypes.c_void_p - _capi.fdb_transaction_get.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int] + _capi.fdb_transaction_get.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ] _capi.fdb_transaction_get.restype = ctypes.c_void_p - 
_capi.fdb_transaction_get_key.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int] + _capi.fdb_transaction_get_key.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ] _capi.fdb_transaction_get_key.restype = ctypes.c_void_p - _capi.fdb_transaction_get_range.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, - ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, - ctypes.c_int, ctypes.c_int] + _capi.fdb_transaction_get_range.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ] _capi.fdb_transaction_get_range.restype = ctypes.c_void_p - _capi.fdb_transaction_get_estimated_range_size_bytes.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] + _capi.fdb_transaction_get_estimated_range_size_bytes.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_int, + ] _capi.fdb_transaction_get_estimated_range_size_bytes.restype = ctypes.c_void_p - _capi.fdb_transaction_get_range_split_points.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_int] + _capi.fdb_transaction_get_range_split_points.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ] _capi.fdb_transaction_get_range_split_points.restype = ctypes.c_void_p - _capi.fdb_transaction_add_conflict_range.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_int] + _capi.fdb_transaction_add_conflict_range.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + 
ctypes.c_int, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ] _capi.fdb_transaction_add_conflict_range.restype = ctypes.c_int _capi.fdb_transaction_add_conflict_range.errcheck = check_error_code - _capi.fdb_transaction_get_addresses_for_key.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] + _capi.fdb_transaction_get_addresses_for_key.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ] _capi.fdb_transaction_get_addresses_for_key.restype = ctypes.c_void_p - _capi.fdb_transaction_set_option.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] + _capi.fdb_transaction_set_option.argtypes = [ + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_int, + ] _capi.fdb_transaction_set_option.restype = ctypes.c_int _capi.fdb_transaction_set_option.errcheck = check_error_code - _capi.fdb_transaction_atomic_op.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_int] + _capi.fdb_transaction_atomic_op.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ] _capi.fdb_transaction_atomic_op.restype = None - _capi.fdb_transaction_set.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] + _capi.fdb_transaction_set.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_int, + ] _capi.fdb_transaction_set.restype = None - _capi.fdb_transaction_clear.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] + _capi.fdb_transaction_clear.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ] _capi.fdb_transaction_clear.restype = None - _capi.fdb_transaction_clear_range.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] + _capi.fdb_transaction_clear_range.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_int, + ] 
_capi.fdb_transaction_clear_range.restype = None - _capi.fdb_transaction_watch.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] + _capi.fdb_transaction_watch.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ] _capi.fdb_transaction_watch.restype = ctypes.c_void_p _capi.fdb_transaction_commit.argtypes = [ctypes.c_void_p] _capi.fdb_transaction_commit.restype = ctypes.c_void_p - _capi.fdb_transaction_get_committed_version.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int64)] + _capi.fdb_transaction_get_committed_version.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.c_int64), + ] _capi.fdb_transaction_get_committed_version.restype = ctypes.c_int _capi.fdb_transaction_get_committed_version.errcheck = check_error_code @@ -1588,12 +1875,16 @@ def init_c_api(): _capi.fdb_transaction_reset.argtypes = [ctypes.c_void_p] _capi.fdb_transaction_reset.restype = None -if hasattr(ctypes.pythonapi, 'Py_IncRef'): + +if hasattr(ctypes.pythonapi, "Py_IncRef"): + def _pin_callback(cb): ctypes.pythonapi.Py_IncRef(ctypes.py_object(cb)) def _unpin_callback(cb): ctypes.pythonapi.Py_DecRef(ctypes.py_object(cb)) + + else: _active_callbacks = set() _pin_callback = _active_callbacks.add @@ -1606,7 +1897,7 @@ def init(event_model=None): Consider using open() as a higher-level interface. 
Keyword arguments: - event_model -- the event model to support (default None, also 'gevent') + event_model -- the event model to support (default None, also "gevent") """ with _network_thread_reentrant_lock: @@ -1618,13 +1909,16 @@ def init(event_model=None): raise FDBError(2000) try: + class NetworkThread(threading.Thread): def run(self): try: _capi.fdb_run_network() except FDBError as e: - sys.stderr.write('Unhandled error in FoundationDB network thread: %s\n' % e) - # print('Network stopped') + sys.stderr.write( + "Unhandled error in FoundationDB network thread: %s\n" % e + ) + # print("Network stopped") _network_thread = NetworkThread() _network_thread.daemon = True @@ -1632,20 +1926,24 @@ def init(event_model=None): _network_thread.name = "fdb-network-thread" if event_model is not None: - if event_model == 'gevent': + if event_model == "gevent": import gevent - if gevent.__version__[0] != '0': + if gevent.__version__[0] != "0": + def nullf(): pass class ThreadEvent(object): - has_async_ = hasattr(gevent.get_hub().loop, 'async_') + has_async_ = hasattr(gevent.get_hub().loop, "async_") + def __init__(self): if ThreadEvent.has_async_: self.gevent_async = gevent.get_hub().loop.async_() else: - self.gevent_async = getattr(gevent.get_hub().loop, 'async')() + self.gevent_async = getattr( + gevent.get_hub().loop, "async" + )() self.gevent_async.start(nullf) @@ -1654,10 +1952,13 @@ def init(event_model=None): def wait(self): gevent.get_hub().wait(self.gevent_async) + else: # gevent 0.x doesn't have async, so use a pipe. This doesn't work on Windows. - if platform.system() == 'Windows': - raise Exception("The 'gevent' event_model requires gevent 1.0 on Windows.") + if platform.system() == "Windows": + raise Exception( + "The 'gevent' event_model requires gevent 1.0 on Windows." 
+ ) import gevent.socket @@ -1666,7 +1967,7 @@ def init(event_model=None): self.pair = os.pipe() def set(self): - os.write(self.pair[1], '!') + os.write(self.pair[1], "!") def wait(self): gevent.socket.wait_read(self.pair[0]) @@ -1682,11 +1983,12 @@ def init(event_model=None): def is_ready_cb(future): e.set() + self.on_ready(is_ready_cb) e.wait() Future.block_until_ready = _gevent_block_until_ready - elif event_model == 'debug': + elif event_model == "debug": import time class DebugEvent(object): @@ -1698,14 +2000,16 @@ def init(event_model=None): def wait(self): while not self.ev.isSet(): - self.ev.wait(.001) + self.ev.wait(0.001) + Future.Event = DebugEvent def _debug_block_until_ready(self): while not self.is_ready(): - time.sleep(.001) + time.sleep(0.001) + Future.block_until_ready = _debug_block_until_ready - elif event_model == 'asyncio': + elif event_model == "asyncio": global asyncio try: import asyncio @@ -1713,26 +2017,34 @@ def init(event_model=None): import trollius as asyncio if isinstance(asyncio.futures._FUTURE_CLASSES, type): - asyncio.futures._FUTURE_CLASSES = (asyncio.futures._FUTURE_CLASSES,) + asyncio.futures._FUTURE_CLASSES = ( + asyncio.futures._FUTURE_CLASSES, + ) asyncio.futures._FUTURE_CLASSES += (Future,) def _do_not_block(self): if not self.is_ready(): raise Exception("Future not ready") + Future.block_until_ready = _do_not_block - Future.call_soon_threadsafe = asyncio.get_event_loop().call_soon_threadsafe + Future.call_soon_threadsafe = ( + asyncio.get_event_loop().call_soon_threadsafe + ) Future._loop = asyncio.get_event_loop() def iterate(self): """Usage: - fa = tr.get_range(...).iterate() - for k,v in (yield From(fa)): - print(k,v) - yield From(fa)""" + fa = tr.get_range(...).iterate() + for k,v in (yield From(fa)): + print(k,v) + yield From(fa)""" + def it(): yield asyncio.From(self._future) raise asyncio.Return(self) + return it() + FDBRange.iterate = iterate AT = _TransactionCreator.declare_asynchronous_transactions() for name in 
dir(AT): @@ -1753,6 +2065,7 @@ def init(event_model=None): out.append(kv) yield asyncio.From(self._future) raise asyncio.Return(out) + FDBRange.to_list = to_list else: # Hard coded error @@ -1765,7 +2078,7 @@ def init(event_model=None): # been setup, so if we get here without exception we know # it has been. _network_thread.start() - except: + except Exception: # We assigned _network_thread but didn't succeed in init, # so clear it out so the next caller has a chance _network_thread = None @@ -1780,6 +2093,7 @@ open_databases = {} cacheLock = threading.Lock() + def open(cluster_file=None, event_model=None): """Opens the given database (or the default database of the cluster indicated by the fdb.cluster file in a platform-specific location, if no cluster_file @@ -1794,11 +2108,11 @@ def open(cluster_file=None, event_model=None): open_databases[cluster_file] = create_database(cluster_file) return open_databases[(cluster_file)] - -def open_v609(cluster_file=None, database_name=b'DB', event_model=None): - if database_name != b'DB': - raise FDBError(2013) # invalid_database_name + +def open_v609(cluster_file=None, database_name=b"DB", event_model=None): + if database_name != b"DB": + raise FDBError(2013) # invalid_database_name return open(cluster_file, event_model) @@ -1807,9 +2121,6 @@ def open_v13(cluster_id_path, database_name, local_address=None, event_model=Non return open_v609(cluster_id_path, database_name, event_model) -import atexit - - @atexit.register def _stop_on_exit(): if _network_thread: @@ -1818,8 +2129,8 @@ def _stop_on_exit(): def strinc(key): - key = key.rstrip(b'\xff') + key = key.rstrip(b"\xff") if len(key) == 0: - raise ValueError('Key must contain at least one byte not equal to 0xFF.') + raise ValueError("Key must contain at least one byte not equal to 0xFF.") return key[:-1] + six.int2byte(ord(key[-1:]) + 1) diff --git a/bindings/python/fdb/locality.py b/bindings/python/fdb/locality.py index d6f6e15201..f46d9f9276 100644 --- 
a/bindings/python/fdb/locality.py +++ b/bindings/python/fdb/locality.py @@ -40,13 +40,15 @@ def _get_boundary_keys(db_or_tr, begin, end): lastbegin = begin tr.options.set_read_system_keys() tr.options.set_lock_aware() - kvs = tr.snapshot.get_range(b'\xff' + b'/keyServers/' + begin, b'\xff' + b'/keyServers/' + end) + kvs = tr.snapshot.get_range( + b"\xff" + b"/keyServers/" + begin, b"\xff" + b"/keyServers/" + end + ) if first_time: first_time = False yield None # trick to get the above get_range to be asynchronously dispatched before get_boundary_keys() returns. for kv in kvs: yield kv.key[13:] - begin = kv.key[13:] + b'\x00' + begin = kv.key[13:] + b"\x00" begin = end except _impl.FDBError as e: # if we get a transaction_too_old and *something* has happened, then we are no longer transactional @@ -71,4 +73,8 @@ def get_boundary_keys(db_or_tr, begin, end): @_impl.transactional def get_addresses_for_key(tr, key): keyBytes = _impl.keyToBytes(key) - return _impl.FutureStringArray(tr.capi.fdb_transaction_get_addresses_for_key(tr.tpointer, keyBytes, len(keyBytes))) + return _impl.FutureStringArray( + tr.capi.fdb_transaction_get_addresses_for_key( + tr.tpointer, keyBytes, len(keyBytes) + ) + ) diff --git a/bindings/python/fdb/subspace_impl.py b/bindings/python/fdb/subspace_impl.py index 9139ca8cc9..3a713f25dd 100644 --- a/bindings/python/fdb/subspace_impl.py +++ b/bindings/python/fdb/subspace_impl.py @@ -23,13 +23,12 @@ import fdb.tuple -class Subspace (object): - - def __init__(self, prefixTuple=tuple(), rawPrefix=b''): +class Subspace(object): + def __init__(self, prefixTuple=tuple(), rawPrefix=b""): self.rawPrefix = fdb.tuple.pack(prefixTuple, prefix=rawPrefix) def __repr__(self): - return 'Subspace(rawPrefix=' + repr(self.rawPrefix) + ')' + return "Subspace(rawPrefix=" + repr(self.rawPrefix) + ")" def __getitem__(self, name): return Subspace((name,), self.rawPrefix) @@ -45,7 +44,7 @@ class Subspace (object): def unpack(self, key): if not self.contains(key): - raise 
ValueError('Cannot unpack key that is not in subspace.') + raise ValueError("Cannot unpack key that is not in subspace.") return fdb.tuple.unpack(key, prefix_len=len(self.rawPrefix)) diff --git a/bindings/python/fdb/tenant_management.py b/bindings/python/fdb/tenant_management.py index ebe36594a5..42fb5c9c79 100644 --- a/bindings/python/fdb/tenant_management.py +++ b/bindings/python/fdb/tenant_management.py @@ -25,9 +25,10 @@ https://apple.github.io/foundationdb/api-python.html""" from fdb import impl as _impl -_tenant_map_prefix = b'\xff\xff/management/tenant/map/' +_tenant_map_prefix = b"\xff\xff/management/tenant/map/" -# If the existence_check_marker is an empty list, then check whether the tenant exists. + +# If the existence_check_marker is an empty list, then check whether the tenant exists. # After the check, append an item to the existence_check_marker list so that subsequent # calls to this function will not perform the existence check. # @@ -37,11 +38,12 @@ def _check_tenant_existence(tr, key, existence_check_marker, force_maybe_commite existing_tenant = tr[key].wait() existence_check_marker.append(None) if force_maybe_commited: - raise _impl.FDBError(1021) # maybe_committed + raise _impl.FDBError(1021) # maybe_committed return existing_tenant != None return None + # Attempt to create a tenant in the cluster. If existence_check_marker is an empty # list, then this function will check if the tenant already exists and fail if it does. # Once the existence check is completed, it will not be done again if this function @@ -51,15 +53,23 @@ def _check_tenant_existence(tr, key, existence_check_marker, force_maybe_commite # # If the existence_check_marker is a non-empty list, then the existence check is skipped. 
@_impl.transactional -def _create_tenant_impl(tr, tenant_name, existence_check_marker, force_existence_check_maybe_committed=False): +def _create_tenant_impl( + tr, tenant_name, existence_check_marker, force_existence_check_maybe_committed=False +): tr.options.set_special_key_space_enable_writes() - key = b'%s%s' % (_tenant_map_prefix, tenant_name) + key = b"%s%s" % (_tenant_map_prefix, tenant_name) + + if ( + _check_tenant_existence( + tr, key, existence_check_marker, force_existence_check_maybe_committed + ) + is True + ): + raise _impl.FDBError(2132) # tenant_already_exists + + tr[key] = b"" - if _check_tenant_existence(tr, key, existence_check_marker, force_existence_check_maybe_committed) is True: - raise _impl.FDBError(2132) # tenant_already_exists - tr[key] = b'' - # Attempt to delete a tenant from the cluster. If existence_check_marker is an empty # list, then this function will check if the tenant already exists and fail if it does # not. Once the existence check is completed, it will not be done again if this function @@ -69,15 +79,23 @@ def _create_tenant_impl(tr, tenant_name, existence_check_marker, force_existence # # If the existence_check_marker is a non-empty list, then the existence check is skipped. 
@_impl.transactional -def _delete_tenant_impl(tr, tenant_name, existence_check_marker, force_existence_check_maybe_committed=False): +def _delete_tenant_impl( + tr, tenant_name, existence_check_marker, force_existence_check_maybe_committed=False +): tr.options.set_special_key_space_enable_writes() - key = b'%s%s' % (_tenant_map_prefix, tenant_name) + key = b"%s%s" % (_tenant_map_prefix, tenant_name) - if _check_tenant_existence(tr, key, existence_check_marker, force_existence_check_maybe_committed) is False: - raise _impl.FDBError(2131) # tenant_not_found + if ( + _check_tenant_existence( + tr, key, existence_check_marker, force_existence_check_maybe_committed + ) + is False + ): + raise _impl.FDBError(2131) # tenant_not_found del tr[key] + class FDBTenantList(object): """Iterates over the results of list_tenants query. Returns KeyValue objects. @@ -96,6 +114,7 @@ class FDBTenantList(object): tenant_name = _impl.remove_prefix(next_item.key, _tenant_map_prefix) yield _impl.KeyValue(tenant_name, next_item.value) + # Lists the tenants created in the cluster, specified by the begin and end range. # Also limited in number of results by the limit parameter. 
# Returns an iterable object that yields KeyValue objects @@ -104,29 +123,36 @@ class FDBTenantList(object): @_impl.transactional def _list_tenants_impl(tr, begin, end, limit): tr.options.set_raw_access() - begin_key = b'%s%s' % (_tenant_map_prefix, begin) - end_key = b'%s%s' % (_tenant_map_prefix, end) + begin_key = b"%s%s" % (_tenant_map_prefix, begin) + end_key = b"%s%s" % (_tenant_map_prefix, end) rangeresult = tr.get_range(begin_key, end_key, limit) return FDBTenantList(rangeresult) + def create_tenant(db_or_tr, tenant_name): tenant_name = _impl.process_tenant_name(tenant_name) # Only perform the existence check when run using a database # Callers using a transaction are expected to check existence themselves if required - existence_check_marker = [] if not isinstance(db_or_tr, _impl.TransactionRead) else [None] + existence_check_marker = ( + [] if not isinstance(db_or_tr, _impl.TransactionRead) else [None] + ) _create_tenant_impl(db_or_tr, tenant_name, existence_check_marker) + def delete_tenant(db_or_tr, tenant_name): tenant_name = _impl.process_tenant_name(tenant_name) # Only perform the existence check when run using a database # Callers using a transaction are expected to check existence themselves if required - existence_check_marker = [] if not isinstance(db_or_tr, _impl.TransactionRead) else [None] + existence_check_marker = ( + [] if not isinstance(db_or_tr, _impl.TransactionRead) else [None] + ) _delete_tenant_impl(db_or_tr, tenant_name, existence_check_marker) + def list_tenants(db_or_tr, begin, end, limit): begin = _impl.process_tenant_name(begin) end = _impl.process_tenant_name(end) diff --git a/bindings/python/fdb/tuple.py b/bindings/python/fdb/tuple.py index 379e576cdc..d96319b73e 100644 --- a/bindings/python/fdb/tuple.py +++ b/bindings/python/fdb/tuple.py @@ -39,8 +39,8 @@ BYTES_CODE = 0x01 STRING_CODE = 0x02 NESTED_CODE = 0x05 INT_ZERO_CODE = 0x14 -POS_INT_END = 0x1d -NEG_INT_START = 0x0b +POS_INT_END = 0x1D +NEG_INT_START = 0x0B FLOAT_CODE = 
0x20 DOUBLE_CODE = 0x21 FALSE_CODE = 0x26 @@ -54,10 +54,10 @@ VERSIONSTAMP_CODE = 0x33 def _find_terminator(v, pos): # Finds the start of the next terminator [\x00]![\xff] or the end of v while True: - pos = v.find(b'\x00', pos) + pos = v.find(b"\x00", pos) if pos < 0: return len(v) - if pos + 1 == len(v) or v[pos + 1:pos + 2] != b'\xff': + if pos + 1 == len(v) or v[pos + 1 : pos + 2] != b"\xff": return pos pos += 2 @@ -66,9 +66,9 @@ def _find_terminator(v, pos): # If decoding and sign bit is 0 (negative), flip all of the bits. Otherwise, just flip sign. def _float_adjust(v, encode): if encode and six.indexbytes(v, 0) & 0x80 != 0x00: - return b''.join(map(lambda x: six.int2byte(x ^ 0xff), six.iterbytes(v))) + return b"".join(map(lambda x: six.int2byte(x ^ 0xFF), six.iterbytes(v))) elif not encode and six.indexbytes(v, 0) & 0x80 != 0x80: - return b''.join(map(lambda x: six.int2byte(x ^ 0xff), six.iterbytes(v))) + return b"".join(map(lambda x: six.int2byte(x ^ 0xFF), six.iterbytes(v))) else: return six.int2byte(six.indexbytes(v, 0) ^ 0x80) + v[1:] @@ -84,7 +84,9 @@ class SingleFloat(object): elif isinstance(value, six.integer_types): self.value = ctypes.c_float(value).value else: - raise ValueError("Incompatible type for single-precision float: " + repr(value)) + raise ValueError( + "Incompatible type for single-precision float: " + repr(value) + ) # Comparisons def __eq__(self, other): @@ -119,24 +121,42 @@ class Versionstamp(object): LENGTH = 12 _TR_VERSION_LEN = 10 _MAX_USER_VERSION = (1 << 16) - 1 - _UNSET_TR_VERSION = 10 * six.int2byte(0xff) - _STRUCT_FORMAT_STRING = '>' + str(_TR_VERSION_LEN) + 'sH' + _UNSET_TR_VERSION = 10 * six.int2byte(0xFF) + _STRUCT_FORMAT_STRING = ">" + str(_TR_VERSION_LEN) + "sH" @classmethod def validate_tr_version(cls, tr_version): if tr_version is None: return if not isinstance(tr_version, bytes): - raise TypeError("Global version has illegal type " + str(type(tr_version)) + " (requires bytes)") + raise TypeError( + "Global version 
has illegal type " + + str(type(tr_version)) + + " (requires bytes)" + ) elif len(tr_version) != cls._TR_VERSION_LEN: - raise ValueError("Global version has incorrect length " + str(len(tr_version)) + " (requires " + str(cls._TR_VERSION_LEN) + ")") + raise ValueError( + "Global version has incorrect length " + + str(len(tr_version)) + + " (requires " + + str(cls._TR_VERSION_LEN) + + ")" + ) @classmethod def validate_user_version(cls, user_version): if not isinstance(user_version, six.integer_types): - raise TypeError("Local version has illegal type " + str(type(user_version)) + " (requires integer type)") + raise TypeError( + "Local version has illegal type " + + str(type(user_version)) + + " (requires integer type)" + ) elif user_version < 0 or user_version > cls._MAX_USER_VERSION: - raise ValueError("Local version has value " + str(user_version) + " which is out of range") + raise ValueError( + "Local version has value " + + str(user_version) + + " which is out of range" + ) def __init__(self, tr_version=None, user_version=0): Versionstamp.validate_tr_version(tr_version) @@ -153,30 +173,50 @@ class Versionstamp(object): if not isinstance(v, bytes): raise TypeError("Cannot parse versionstamp from non-byte string") elif len(v) - start < cls.LENGTH: - raise ValueError("Versionstamp byte string is too short (only " + str(len(v) - start) + " bytes to read from") + raise ValueError( + "Versionstamp byte string is too short (only " + + str(len(v) - start) + + " bytes to read from" + ) else: - tr_version = v[start:start + cls._TR_VERSION_LEN] + tr_version = v[start : start + cls._TR_VERSION_LEN] if tr_version == cls._UNSET_TR_VERSION: tr_version = None - user_version = six.indexbytes(v, start + cls._TR_VERSION_LEN) * (1 << 8) + six.indexbytes(v, start + cls._TR_VERSION_LEN + 1) + user_version = six.indexbytes(v, start + cls._TR_VERSION_LEN) * ( + 1 << 8 + ) + six.indexbytes(v, start + cls._TR_VERSION_LEN + 1) return Versionstamp(tr_version, user_version) def 
is_complete(self): return self.tr_version is not None def __repr__(self): - return "fdb.tuple.Versionstamp(" + repr(self.tr_version) + ", " + repr(self.user_version) + ")" + return ( + "fdb.tuple.Versionstamp(" + + repr(self.tr_version) + + ", " + + repr(self.user_version) + + ")" + ) def __str__(self): - return "Versionstamp(" + repr(self.tr_version) + ", " + str(self.user_version) + ")" + return ( + "Versionstamp(" + + repr(self.tr_version) + + ", " + + str(self.user_version) + + ")" + ) def to_bytes(self): tr_version = self.tr_version if isinstance(tr_version, fdb.impl.Value): tr_version = tr_version.value - return struct.pack(self._STRUCT_FORMAT_STRING, - tr_version if self.is_complete() else self._UNSET_TR_VERSION, - self.user_version) + return struct.pack( + self._STRUCT_FORMAT_STRING, + tr_version if self.is_complete() else self._UNSET_TR_VERSION, + self.user_version, + ) def completed(self, new_tr_version): if self.is_complete(): @@ -187,7 +227,10 @@ class Versionstamp(object): # Comparisons def __eq__(self, other): if isinstance(other, Versionstamp): - return self.tr_version == other.tr_version and self.user_version == other.user_version + return ( + self.tr_version == other.tr_version + and self.user_version == other.user_version + ) else: return False @@ -224,18 +267,22 @@ def _decode(v, pos): return None, pos + 1 elif code == BYTES_CODE: end = _find_terminator(v, pos + 1) - return v[pos + 1:end].replace(b"\x00\xFF", b"\x00"), end + 1 + return v[pos + 1 : end].replace(b"\x00\xFF", b"\x00"), end + 1 elif code == STRING_CODE: end = _find_terminator(v, pos + 1) - return v[pos + 1:end].replace(b"\x00\xFF", b"\x00").decode("utf-8"), end + 1 + return v[pos + 1 : end].replace(b"\x00\xFF", b"\x00").decode("utf-8"), end + 1 elif code >= INT_ZERO_CODE and code < POS_INT_END: n = code - 20 end = pos + 1 + n - return struct.unpack(">Q", b'\x00' * (8 - n) + v[pos + 1:end])[0], end + return struct.unpack(">Q", b"\x00" * (8 - n) + v[pos + 1 : end])[0], end elif code > 
NEG_INT_START and code < INT_ZERO_CODE: n = 20 - code end = pos + 1 + n - return struct.unpack(">Q", b'\x00' * (8 - n) + v[pos + 1:end])[0] - _size_limits[n], end + return ( + struct.unpack(">Q", b"\x00" * (8 - n) + v[pos + 1 : end])[0] + - _size_limits[n], + end, + ) elif code == POS_INT_END: # 0x1d; Positive 9-255 byte integer length = six.indexbytes(v, pos + 1) val = 0 @@ -244,25 +291,37 @@ def _decode(v, pos): val += six.indexbytes(v, pos + 2 + i) return val, pos + 2 + length elif code == NEG_INT_START: # 0x0b; Negative 9-255 byte integer - length = six.indexbytes(v, pos + 1) ^ 0xff + length = six.indexbytes(v, pos + 1) ^ 0xFF val = 0 for i in _range(length): val = val << 8 val += six.indexbytes(v, pos + 2 + i) return val - (1 << (length * 8)) + 1, pos + 2 + length elif code == FLOAT_CODE: - return SingleFloat(struct.unpack(">f", _float_adjust(v[pos + 1:pos + 5], False))[0]), pos + 5 + return ( + SingleFloat( + struct.unpack(">f", _float_adjust(v[pos + 1 : pos + 5], False))[0] + ), + pos + 5, + ) elif code == DOUBLE_CODE: - return struct.unpack(">d", _float_adjust(v[pos + 1:pos + 9], False))[0], pos + 9 + return ( + struct.unpack(">d", _float_adjust(v[pos + 1 : pos + 9], False))[0], + pos + 9, + ) elif code == UUID_CODE: - return uuid.UUID(bytes=v[pos + 1:pos + 17]), pos + 17 + return uuid.UUID(bytes=v[pos + 1 : pos + 17]), pos + 17 elif code == FALSE_CODE: if fdb.is_api_version_selected() and fdb.get_api_version() < 500: - raise ValueError("Invalid API version " + str(fdb._version) + " for boolean types") + raise ValueError( + "Invalid API version " + str(fdb._version) + " for boolean types" + ) return False, pos + 1 elif code == TRUE_CODE: if fdb.is_api_version_selected() and fdb.get_api_version() < 500: - raise ValueError("Invalid API version " + str(fdb._version) + " for boolean types") + raise ValueError( + "Invalid API version " + str(fdb._version) + " for boolean types" + ) return True, pos + 1 elif code == VERSIONSTAMP_CODE: return 
Versionstamp.from_bytes(v, pos + 1), pos + 1 + Versionstamp.LENGTH @@ -271,7 +330,7 @@ def _decode(v, pos): end_pos = pos + 1 while end_pos < len(v): if six.indexbytes(v, end_pos) == 0x00: - if end_pos + 1 < len(v) and six.indexbytes(v, end_pos + 1) == 0xff: + if end_pos + 1 < len(v) and six.indexbytes(v, end_pos + 1) == 0xFF: ret.append(None) end_pos += 2 else: @@ -299,11 +358,15 @@ def _reduce_children(child_values): if sys.version_info < (2, 7): + def _bit_length(x): - s = bin(x) # binary representation: bin(-37) --> '-0b100101' - s = s.lstrip('-0b') # remove leading zeros and minus sign + s = bin(x) # binary representation: bin(-37) --> '-0b100101' + s = s.lstrip("-0b") # remove leading zeros and minus sign return len(s) + + else: + def _bit_length(x): return x.bit_length() @@ -314,23 +377,33 @@ def _encode(value, nested=False): # sorting need to work too! if value == None: # ==, not is, because some fdb.impl.Value are equal to None if nested: - return b''.join([six.int2byte(NULL_CODE), six.int2byte(0xff)]), -1 + return b"".join([six.int2byte(NULL_CODE), six.int2byte(0xFF)]), -1 else: - return b''.join([six.int2byte(NULL_CODE)]), -1 + return b"".join([six.int2byte(NULL_CODE)]), -1 elif isinstance(value, bytes): # also gets non-None fdb.impl.Value - return six.int2byte(BYTES_CODE) + value.replace(b'\x00', b'\x00\xFF') + b'\x00', -1 + return ( + six.int2byte(BYTES_CODE) + value.replace(b"\x00", b"\x00\xFF") + b"\x00", + -1, + ) elif isinstance(value, six.text_type): - return six.int2byte(STRING_CODE) + value.encode('utf-8').replace(b'\x00', b'\x00\xFF') + b'\x00', -1 - elif isinstance(value, six.integer_types) and (not isinstance(value, bool) or (hasattr(fdb, '_version') and fdb._version < 500)): + return ( + six.int2byte(STRING_CODE) + + value.encode("utf-8").replace(b"\x00", b"\x00\xFF") + + b"\x00", + -1, + ) + elif isinstance(value, six.integer_types) and ( + not isinstance(value, bool) or (hasattr(fdb, "_version") and fdb._version < 500) + ): if value == 0: 
- return b''.join([six.int2byte(INT_ZERO_CODE)]), -1 + return b"".join([six.int2byte(INT_ZERO_CODE)]), -1 elif value > 0: if value >= _size_limits[-1]: length = (_bit_length(value) + 7) // 8 data = [six.int2byte(POS_INT_END), six.int2byte(length)] for i in _range(length - 1, -1, -1): - data.append(six.int2byte((value >> (8 * i)) & 0xff)) - return b''.join(data), -1 + data.append(six.int2byte((value >> (8 * i)) & 0xFF)) + return b"".join(data), -1 n = bisect_left(_size_limits, value) return six.int2byte(INT_ZERO_CODE + n) + struct.pack(">Q", value)[-n:], -1 @@ -338,34 +411,53 @@ def _encode(value, nested=False): if -value >= _size_limits[-1]: length = (_bit_length(value) + 7) // 8 value += (1 << (length * 8)) - 1 - data = [six.int2byte(NEG_INT_START), six.int2byte(length ^ 0xff)] + data = [six.int2byte(NEG_INT_START), six.int2byte(length ^ 0xFF)] for i in _range(length - 1, -1, -1): - data.append(six.int2byte((value >> (8 * i)) & 0xff)) - return b''.join(data), -1 + data.append(six.int2byte((value >> (8 * i)) & 0xFF)) + return b"".join(data), -1 n = bisect_left(_size_limits, -value) maxv = _size_limits[n] - return six.int2byte(INT_ZERO_CODE - n) + struct.pack(">Q", maxv + value)[-n:], -1 + return ( + six.int2byte(INT_ZERO_CODE - n) + struct.pack(">Q", maxv + value)[-n:], + -1, + ) elif isinstance(value, ctypes.c_float) or isinstance(value, SingleFloat): - return six.int2byte(FLOAT_CODE) + _float_adjust(struct.pack(">f", value.value), True), -1 + return ( + six.int2byte(FLOAT_CODE) + + _float_adjust(struct.pack(">f", value.value), True), + -1, + ) elif isinstance(value, ctypes.c_double): - return six.int2byte(DOUBLE_CODE) + _float_adjust(struct.pack(">d", value.value), True), -1 + return ( + six.int2byte(DOUBLE_CODE) + + _float_adjust(struct.pack(">d", value.value), True), + -1, + ) elif isinstance(value, float): - return six.int2byte(DOUBLE_CODE) + _float_adjust(struct.pack(">d", value), True), -1 + return ( + six.int2byte(DOUBLE_CODE) + 
_float_adjust(struct.pack(">d", value), True), + -1, + ) elif isinstance(value, uuid.UUID): return six.int2byte(UUID_CODE) + value.bytes, -1 elif isinstance(value, bool): if value: - return b''.join([six.int2byte(TRUE_CODE)]), -1 + return b"".join([six.int2byte(TRUE_CODE)]), -1 else: - return b''.join([six.int2byte(FALSE_CODE)]), -1 + return b"".join([six.int2byte(FALSE_CODE)]), -1 elif isinstance(value, Versionstamp): version_pos = -1 if value.is_complete() else 1 return six.int2byte(VERSIONSTAMP_CODE) + value.to_bytes(), version_pos elif isinstance(value, tuple) or isinstance(value, list): - child_bytes, version_pos = _reduce_children(map(lambda x: _encode(x, True), value)) + child_bytes, version_pos = _reduce_children( + map(lambda x: _encode(x, True), value) + ) new_version_pos = -1 if version_pos < 0 else version_pos + 1 - return b''.join([six.int2byte(NESTED_CODE)] + child_bytes + [six.int2byte(0x00)]), new_version_pos + return ( + b"".join([six.int2byte(NESTED_CODE)] + child_bytes + [six.int2byte(0x00)]), + new_version_pos, + ) else: raise ValueError("Unsupported data type: " + str(type(value))) @@ -387,13 +479,13 @@ def _pack_maybe_with_versionstamp(t, prefix=None): version_pos += len(prefix) if prefix is not None else 0 bytes_list.extend(child_bytes) if fdb.is_api_version_selected() and fdb.get_api_version() < 520: - bytes_list.append(struct.pack('= 500) and isinstance(value, bool): + elif (not hasattr(fdb, "_version") or fdb._version >= 500) and isinstance( + value, bool + ): return FALSE_CODE elif isinstance(value, six.integer_types): return INT_ZERO_CODE @@ -514,8 +609,8 @@ def _compare_values(value1, value2): if code1 == NULL_CODE: return 0 elif code1 == STRING_CODE: - encoded1 = value1.encode('utf-8') - encoded2 = value2.encode('utf-8') + encoded1 = value1.encode("utf-8") + encoded2 = value2.encode("utf-8") return -1 if encoded1 < encoded2 else 0 if encoded1 == encoded2 else 1 elif code1 == FLOAT_CODE: f1 = value1 if isinstance(value1, SingleFloat) 
else SingleFloat(value1.value) diff --git a/bindings/python/tests/cancellation_timeout_tests.py b/bindings/python/tests/cancellation_timeout_tests.py index 32d11b0e34..341e6d0da4 100755 --- a/bindings/python/tests/cancellation_timeout_tests.py +++ b/bindings/python/tests/cancellation_timeout_tests.py @@ -518,7 +518,7 @@ def test_timeouts(db): for i in range(2): tr.options.set_timeout(1500) tr.set_read_version(0x7ffffffffffffff0) - x = tr[b'foo'] + _ = tr[b'foo'] try: tr.commit().wait() tr.reset() @@ -557,7 +557,7 @@ def test_db_timeouts(db): tr[b'foo'] = b'bar' tr.on_error(err).wait() # should not throw time.sleep(1) - tr[b'foo'] + _ = tr[b'foo'] try: tr.commit().wait() # should throw raise TestError("(2) Timeout didn't fire.") @@ -574,7 +574,7 @@ def test_db_timeouts(db): time.sleep(0.75) tr[b'foo'] = b'bar' tr.on_error(err).wait() # should not throw - tr[b'foo'] + _ = tr[b'foo'] time.sleep(0.75) try: tr.commit().wait() # should throw @@ -615,7 +615,7 @@ def test_db_timeouts(db): tr.reset() tr[b'foo'] = b'bar' time.sleep(0.2) - tr.on_error(err).wait() #should not throw + tr.on_error(err).wait() # should not throw tr[b'foo'] = b'bar' time.sleep(0.8) try: diff --git a/bindings/python/tests/size_limit_tests.py b/bindings/python/tests/size_limit_tests.py index b94d7ea8e4..ff6fb52cac 100644 --- a/bindings/python/tests/size_limit_tests.py +++ b/bindings/python/tests/size_limit_tests.py @@ -24,15 +24,18 @@ import sys if __name__ == '__main__': fdb.api_version(720) + @fdb.transactional def setValue(tr, key, value): tr[key] = value + @fdb.transactional def setValueWithLimit(tr, key, value, limit): tr.options.set_size_limit(limit) tr[key] = value + def test_size_limit_option(db): value = b'a' * 1024 @@ -69,6 +72,7 @@ def test_size_limit_option(db): # Reset the size limit for future tests db.options.set_transaction_size_limit(10000000) + @fdb.transactional def test_get_approximate_size(tr): tr[b'key1'] = b'value1' @@ -90,6 +94,7 @@ def test_get_approximate_size(tr): s5 = 
tr.get_approximate_size().wait() assert(s4 < s5) + # Expect a cluster file as input. This test will write to the FDB cluster, so # be aware of potential side effects. if __name__ == '__main__': diff --git a/bindings/python/tests/tenant_tests.py b/bindings/python/tests/tenant_tests.py index 7604577d62..033cf04b37 100755 --- a/bindings/python/tests/tenant_tests.py +++ b/bindings/python/tests/tenant_tests.py @@ -27,24 +27,26 @@ from fdb.tuple import pack if __name__ == '__main__': fdb.api_version(720) + def cleanup_tenant(db, tenant_name): try: tenant = db.open_tenant(tenant_name) del tenant[:] fdb.tenant_management.delete_tenant(db, tenant_name) except fdb.FDBError as e: - if e.code == 2131: # tenant not found + if e.code == 2131: # tenant not found pass else: raise + def test_tenant_tuple_name(db): - tuplename=(b'test', b'level', b'hierarchy', 3, 1.24, 'str') + tuplename = (b'test', b'level', b'hierarchy', 3, 1.24, 'str') cleanup_tenant(db, tuplename) fdb.tenant_management.create_tenant(db, tuplename) - tenant=db.open_tenant(tuplename) + tenant = db.open_tenant(tuplename) tenant[b'foo'] = b'bar' assert tenant[b'foo'] == b'bar' @@ -100,7 +102,7 @@ def test_tenant_operations(db): del tr1[:] tr1.commit().wait() except fdb.FDBError as e: - tr.on_error(e).wait() + tr1.on_error(e).wait() assert tenant1[b'tenant_test_key'] == None assert db[prefix1 + b'tenant_test_key'] == None @@ -113,7 +115,7 @@ def test_tenant_operations(db): tenant1[b'tenant_test_key'] assert False except fdb.FDBError as e: - assert e.code == 2131 # tenant not found + assert e.code == 2131 # tenant not found del tenant2[:] fdb.tenant_management.delete_tenant(db, b'tenant2') @@ -126,6 +128,7 @@ def test_tenant_operations(db): assert db[b'tenant_test_key'] == None + def test_tenant_operation_retries(db): cleanup_tenant(db, b'tenant1') cleanup_tenant(db, b'tenant2') @@ -138,7 +141,7 @@ def test_tenant_operation_retries(db): fdb.tenant_management.create_tenant(db, b'tenant1') assert False except 
fdb.FDBError as e: - assert e.code == 2132 # tenant already exists + assert e.code == 2132 # tenant already exists # Using a transaction skips the existence check tr = db.create_transaction() @@ -166,7 +169,7 @@ def test_tenant_operation_retries(db): fdb.tenant_management.delete_tenant(db, b'tenant1') assert False except fdb.FDBError as e: - assert e.code == 2131 # tenant not found + assert e.code == 2131 # tenant not found # Using a transaction skips the existence check tr = db.create_transaction() @@ -186,11 +189,13 @@ def test_tenant_operation_retries(db): except fdb.FDBError as e: tr.on_error(e).wait() + def test_tenants(db): test_tenant_tuple_name(db) test_tenant_operations(db) test_tenant_operation_retries(db) + # Expect a cluster file as input. This test will write to the FDB cluster, so # be aware of potential side effects. if __name__ == '__main__': diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index 4392d02015..18f8494ed7 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -26,7 +26,6 @@ import sys import os import struct import threading -import time import random import time import traceback @@ -136,7 +135,7 @@ def test_fdb_transactional_generator(db): def function_that_yields(tr): yield 0 assert fdb.get_api_version() < 630, "Pre-6.3, a decorator may wrap a function that yields" - except ValueError as e: + except ValueError: assert fdb.get_api_version() >= 630, "Post-6.3, a decorator should throw if wrapped function yields" @@ -144,12 +143,13 @@ def test_fdb_transactional_returns_generator(db): try: def function_that_yields(tr): yield 0 + @fdb.transactional def function_that_returns(tr): return function_that_yields(tr) function_that_returns() assert fdb.get_api_version() < 630, "Pre-6.3, returning a generator is allowed" - except ValueError as e: + except ValueError: assert fdb.get_api_version() >= 630, "Post-6.3, returning a generator should throw" @@ -400,11 +400,11 @@ class Tester: 
inst.push(f) elif inst.op == six.u("GET_ESTIMATED_RANGE_SIZE"): begin, end = inst.pop(2) - estimatedSize = obj.get_estimated_range_size_bytes(begin, end).wait() + obj.get_estimated_range_size_bytes(begin, end).wait() inst.push(b"GOT_ESTIMATED_RANGE_SIZE") elif inst.op == six.u("GET_RANGE_SPLIT_POINTS"): begin, end, chunkSize = inst.pop(3) - estimatedSize = obj.get_range_split_points(begin, end, chunkSize).wait() + obj.get_range_split_points(begin, end, chunkSize).wait() inst.push(b"GOT_RANGE_SPLIT_POINTS") elif inst.op == six.u("GET_KEY"): key, or_equal, offset, prefix = inst.pop(4) @@ -522,7 +522,7 @@ class Tester: self.last_version = inst.tr.get_committed_version() inst.push(b"GOT_COMMITTED_VERSION") elif inst.op == six.u("GET_APPROXIMATE_SIZE"): - approximate_size = inst.tr.get_approximate_size().wait() + inst.tr.get_approximate_size().wait() inst.push(b"GOT_APPROXIMATE_SIZE") elif inst.op == six.u("GET_VERSIONSTAMP"): inst.push(inst.tr.get_versionstamp()) @@ -613,9 +613,9 @@ class Tester: result += [tenant.key] try: metadata = json.loads(tenant.value) - id = metadata["id"] - prefix = metadata["prefix"] - except (json.decoder.JSONDecodeError, KeyError) as e: + _ = metadata["id"] + _ = metadata["prefix"] + except (json.decoder.JSONDecodeError, KeyError): assert False, "Invalid Tenant Metadata" inst.push(fdb.tuple.pack(tuple(result))) elif inst.op == six.u("UNIT_TESTS"): diff --git a/bindings/python/tests/tuple_tests.py b/bindings/python/tests/tuple_tests.py index 026ccd6250..88481ebf8a 100644 --- a/bindings/python/tests/tuple_tests.py +++ b/bindings/python/tests/tuple_tests.py @@ -173,7 +173,7 @@ def tupleTest(N=10000): print("Prefix not before prefixed:\n Tuple: %s\n Bytes: %s\n Other: %s\n Bytes: %s" % (t, repr(pack(t)), t2, repr(pack(t2)))) return False - print ("Tuple check %d OK" % N) + print("Tuple check %d OK" % N) return True # test: diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index 5fc6849d67..cbe0354129 100644 --- 
a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -622,3 +622,39 @@ function(add_java_test) -Djava.library.path=${CMAKE_BINARY_DIR}/lib ${T_CLASS} "@CLUSTER_FILE@") endfunction() + +# Adds a FDB test implemented by a script that does the full setup, such as creating cluster +# and running client binaries as necessary +function(add_scripted_fdb_test) + set(options DISABLED ENABLED) + set(oneValueArgs NAME TEST_TIMEOUT) + set(multiValueArgs COMMAND) + cmake_parse_arguments(T "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}") + if(OPEN_FOR_IDE) + return() + endif() + if(NOT T_ENABLED AND T_DISABLED) + return() + endif() + if(NOT T_NAME) + message(FATAL_ERROR "NAME is a required argument for add_scripted_fdb_test") + endif() + if(NOT T_COMMAND) + message(FATAL_ERROR "COMMAND is a required argument for add_scripted_fdb_test") + endif() + message(STATUS "Adding Scripted FDB test ${T_NAME}") + add_test(NAME "${T_NAME}" + COMMAND ${T_COMMAND}) + set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT + "${SANITIZER_OPTIONS};PYTHONPATH=${CMAKE_SOURCE_DIR}/tests/TestRunner:${CMAKE_BINARY_DIR}/tests/TestRunner") + if (T_TEST_TIMEOUT) + set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT ${T_TEST_TIMEOUT}) + else() + # default timeout + if(USE_SANITIZER) + set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 1200) + else() + set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 300) + endif() + endif() +endfunction() diff --git a/cmake/CompileRocksDB.cmake b/cmake/CompileRocksDB.cmake index 3fdea389ab..f257443c80 100644 --- a/cmake/CompileRocksDB.cmake +++ b/cmake/CompileRocksDB.cmake @@ -1,6 +1,6 @@ # FindRocksDB -find_package(RocksDB 6.27.3) +find_package(RocksDB 7.7.3) include(ExternalProject) @@ -49,8 +49,8 @@ if(ROCKSDB_FOUND) ${BINARY_DIR}/librocksdb.a) else() ExternalProject_Add(rocksdb - URL https://github.com/facebook/rocksdb/archive/refs/tags/v6.27.3.tar.gz - URL_HASH SHA256=ee29901749b9132692b26f0a6c1d693f47d1a9ed8e3771e60556afe80282bf58 + URL 
https://github.com/facebook/rocksdb/archive/refs/tags/v7.7.3.tar.gz + URL_HASH SHA256=b8ac9784a342b2e314c821f6d701148912215666ac5e9bdbccd93cf3767cb611 CMAKE_ARGS ${RocksDB_CMAKE_ARGS} BUILD_BYPRODUCTS /librocksdb.a INSTALL_COMMAND "" diff --git a/contrib/grv_proxy_model/grv_test.py b/contrib/grv_proxy_model/grv_test.py index 1cd0224538..b531a827e9 100755 --- a/contrib/grv_proxy_model/grv_test.py +++ b/contrib/grv_proxy_model/grv_test.py @@ -42,6 +42,7 @@ parser.add_argument('--no-graph', action='store_true', default=False, help='Disa args = parser.parse_args() + def print_choices_list(context=None): if context == 'workload' or context is None: print('Workloads:') @@ -70,6 +71,7 @@ def print_choices_list(context=None): name = name[0:-len('Limiter')] print(' %s' % name) + if args.workload is None or args.ratekeeper is None: print('ERROR: A workload (-w/--workload) and ratekeeper model (-r/--ratekeeper) must be specified.\n') print_choices_list() @@ -79,16 +81,18 @@ if args.list: print_choices_list() sys.exit(0) + def validate_class_type(var, name, superclass): cls = getattr(var, name, None) return cls is not None and inspect.isclass(cls) and issubclass(cls, superclass) -if not args.ratekeeper in ratekeeper_model.predefined_ratekeeper: + +if args.ratekeeper not in ratekeeper_model.predefined_ratekeeper: print('Invalid ratekeeper model `%s\'' % args.ratekeeper) print_choices_list('ratekeeper') sys.exit(1) -if not args.workload in workload_model.predefined_workloads: +if args.workload not in workload_model.predefined_workloads: print('Invalid workload model `%s\'' % args.workload) print_choices_list('workload') sys.exit(1) @@ -120,11 +124,11 @@ for priority in workload.priorities(): still_queued = sum([r.count for r in proxy.request_queue if r.priority == priority]) if len(latencies) > 0: - print('\n%s: %d requests in %d seconds (rate=%f). %d still queued.' 
% (priority, total_started, proxy.time, float(total_started)/proxy.time, still_queued)) - print(' Median latency: %f' % latencies[len(latencies)//2]) - print(' 90%% latency: %f' % latencies[int(0.9*len(latencies))]) - print(' 99%% latency: %f' % latencies[int(0.99*len(latencies))]) - print(' 99.9%% latency: %f' % latencies[int(0.999*len(latencies))]) + print('\n%s: %d requests in %d seconds (rate=%f). %d still queued.' % (priority, total_started, proxy.time, float(total_started) / proxy.time, still_queued)) + print(' Median latency: %f' % latencies[len(latencies) // 2]) + print(' 90%% latency: %f' % latencies[int(0.9 * len(latencies))]) + print(' 99%% latency: %f' % latencies[int(0.99 * len(latencies))]) + print(' 99.9%% latency: %f' % latencies[int(0.999 * len(latencies))]) print(' Max latency: %f' % latencies[-1]) print('') diff --git a/contrib/grv_proxy_model/plot.py b/contrib/grv_proxy_model/plot.py index 9334e2c844..7658e1f6cd 100755 --- a/contrib/grv_proxy_model/plot.py +++ b/contrib/grv_proxy_model/plot.py @@ -20,6 +20,7 @@ import matplotlib.pyplot as plt + class Plotter: def __init__(self, results): self.results = results @@ -28,13 +29,13 @@ class Plotter: out_data = {} counts = {} for t in data.keys(): - out_data.setdefault(t//time_resolution*time_resolution, 0) - counts.setdefault(t//time_resolution*time_resolution, 0) - out_data[t//time_resolution*time_resolution] += data[t] - counts[t//time_resolution*time_resolution] += 1 + out_data.setdefault(t // time_resolution * time_resolution, 0) + counts.setdefault(t // time_resolution * time_resolution, 0) + out_data[t // time_resolution * time_resolution] += data[t] + counts[t // time_resolution * time_resolution] += 1 if use_avg: - out_data = { t: v/counts[t] for t,v in out_data.items() } + out_data = {t: v / counts[t] for t, v in out_data.items()} plt.plot(list(out_data.keys()), list(out_data.values()), label=label) @@ -42,7 +43,7 @@ class Plotter: plt.plot(list(data.keys()), list(data.values()), 
label=label) def display(self, time_resolution=0.1): - plt.figure(figsize=(40,9)) + plt.figure(figsize=(40, 9)) plt.subplot(3, 3, 1) for priority in self.results.started.keys(): Plotter.add_plot(self.results.started[priority], time_resolution, priority) @@ -61,7 +62,7 @@ class Plotter: plt.subplot(3, 3, 3) for priority in self.results.unprocessed_queue_sizes.keys(): - data = {k: max(v) for (k,v) in self.results.unprocessed_queue_sizes[priority].items()} + data = {k: max(v) for (k, v) in self.results.unprocessed_queue_sizes[priority].items()} Plotter.add_plot(data, time_resolution, priority) plt.xlabel('Time (s)') @@ -71,9 +72,11 @@ class Plotter: num = 4 for priority in self.results.latencies.keys(): plt.subplot(3, 3, num) - median_latencies = {k: v[int(0.5*len(v))] if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()} - percentile90_latencies = {k: v[int(0.9*len(v))] if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()} - max_latencies = {k: max(v) if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()} + median_latencies = {k: v[int(0.5 * len(v))] if len(v) > 0 else 0 for (k, v) in + self.results.latencies[priority].items()} + percentile90_latencies = {k: v[int(0.9 * len(v))] if len(v) > 0 else 0 for (k, v) in + self.results.latencies[priority].items()} + max_latencies = {k: max(v) if len(v) > 0 else 0 for (k, v) in self.results.latencies[priority].items()} Plotter.add_plot(median_latencies, time_resolution, 'median') Plotter.add_plot(percentile90_latencies, time_resolution, '90th percentile') @@ -94,7 +97,8 @@ class Plotter: if len(self.results.limit[priority]) > 0: Plotter.add_plot(self.results.limit[priority], time_resolution, 'Limit', use_avg=True) if len(self.results.limit_and_budget[priority]) > 0: - Plotter.add_plot(self.results.limit_and_budget[priority], time_resolution, 'Limit and budget', use_avg=True) + Plotter.add_plot(self.results.limit_and_budget[priority], time_resolution, 'Limit and 
budget', + use_avg=True) if len(self.results.budget[priority]) > 0: Plotter.add_plot(self.results.budget[priority], time_resolution, 'Budget', use_avg=True) @@ -104,4 +108,3 @@ class Plotter: num += 1 plt.show() - diff --git a/contrib/grv_proxy_model/priority.py b/contrib/grv_proxy_model/priority.py index 3ba5c05f2e..594c69eb8a 100755 --- a/contrib/grv_proxy_model/priority.py +++ b/contrib/grv_proxy_model/priority.py @@ -20,6 +20,7 @@ import functools + @functools.total_ordering class Priority: def __init__(self, priority_value, label): @@ -35,6 +36,7 @@ class Priority: def __repr__(self): return repr(self.label) + Priority.SYSTEM = Priority(0, "System") Priority.DEFAULT = Priority(1, "Default") Priority.BATCH = Priority(2, "Batch") diff --git a/contrib/grv_proxy_model/proxy_model.py b/contrib/grv_proxy_model/proxy_model.py index 9ca2a39bfe..5bd440e41e 100755 --- a/contrib/grv_proxy_model/proxy_model.py +++ b/contrib/grv_proxy_model/proxy_model.py @@ -25,6 +25,7 @@ import heapq from priority import Priority from smoother import Smoother + @functools.total_ordering class Task: def __init__(self, time, fxn): @@ -34,6 +35,7 @@ class Task: def __lt__(self, other): return self.time < other.time + class Limiter: class UpdateRateParams: def __init__(self, time): @@ -79,6 +81,7 @@ class Limiter: def update_budget(self, params): pass + class OriginalLimiter(Limiter): def __init__(self, priority, limit_rate_model, proxy_model): Limiter.__init__(self, priority, limit_rate_model, proxy_model) @@ -100,6 +103,7 @@ class OriginalLimiter(Limiter): def update_budget(self, params): self.limit -= params.num_started + class PositiveBudgetLimiter(OriginalLimiter): def __init__(self, priority, limit_rate_model, proxy_model): OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model) @@ -108,6 +112,7 @@ class PositiveBudgetLimiter(OriginalLimiter): self.limit += params.elapsed * self.rate self.limit = min(self.limit, 2.0 * self.rate) + class 
ClampedBudgetLimiter(PositiveBudgetLimiter): def __init__(self, priority, limit_rate_model, proxy_model): PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model) @@ -117,6 +122,7 @@ class ClampedBudgetLimiter(PositiveBudgetLimiter): if self.limit > min_budget: self.limit = max(self.limit - params.num_started, min_budget) + class TimeLimiter(PositiveBudgetLimiter): def __init__(self, priority, limit_rate_model, proxy_model): PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model) @@ -126,15 +132,17 @@ class TimeLimiter(PositiveBudgetLimiter): return params.time >= self.locked_until and PositiveBudgetLimiter.can_start(self, params) def update_budget(self, params): - #print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch)) + # print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch)) if params.min_priority >= self.priority or params.num_started < self.limit: self.limit -= params.num_started else: self.limit = min(self.limit, max(self.limit - params.num_started, -params.last_batch)) - self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + (params.num_started - self.limit)/self.rate) + self.locked_until = min(params.time + 2.0, + max(params.time, self.locked_until) + (params.num_started - self.limit) / self.rate) + + # print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority)) - #print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, 
min_priority=%s' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority)) class TimePositiveBudgetLimiter(PositiveBudgetLimiter): def __init__(self, priority, limit_rate_model, proxy_model): @@ -149,17 +157,18 @@ class TimePositiveBudgetLimiter(PositiveBudgetLimiter): return params.num_started + params.count <= self.limit def update_budget(self, params): - #if params.num_started > 0: - #print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch)) + # if params.num_started > 0: + # print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch)) if params.num_started > self.limit: - self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + penalty/self.rate) + self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + (params.num_started - self.limit) / self.rate) self.limit = 0 else: self.limit -= params.num_started - #if params.num_started > 0: - #print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority)) + # if params.num_started > 0: + # print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority)) + class SmoothingLimiter(OriginalLimiter): def __init__(self, priority, limit_rate_model, proxy_model): @@ -177,7 +186,8 @@ class SmoothingLimiter(OriginalLimiter): self.smooth_rate_limit.set_total(params.time, 
self.rate) def update_limit(self, params): - self.limit = 2.0 * (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time)) + self.limit = 2.0 * ( + self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time)) def can_start(self, params): return params.num_started + params.count <= self.limit @@ -185,15 +195,17 @@ class SmoothingLimiter(OriginalLimiter): def update_budget(self, params): self.smooth_released.add_delta(params.time, params.num_started) + class SmoothingBudgetLimiter(SmoothingLimiter): def __init__(self, priority, limit_rate_model, proxy_model): SmoothingLimiter.__init__(self, priority, limit_rate_model, proxy_model) - #self.smooth_filled = Smoother(2) + # self.smooth_filled = Smoother(2) self.budget = 0 def update_limit(self, params): - release_rate = (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time)) - #self.smooth_filled.set_total(params.time, 1 if release_rate > 0 else 0) + release_rate = ( + self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time)) + # self.smooth_filled.set_total(params.time, 1 if release_rate > 0 else 0) self.limit = 2.0 * release_rate self.proxy_model.results.rate[self.priority][params.time] = self.smooth_rate_limit.smooth_total(params.time) @@ -202,15 +214,15 @@ class SmoothingBudgetLimiter(SmoothingLimiter): self.proxy_model.results.limit_and_budget[self.priority][params.time] = self.limit + self.budget self.proxy_model.results.budget[self.priority][params.time] = self.budget - #self.budget = max(0, self.budget + params.elapsed * self.smooth_rate_limit.smooth_total(params.time)) + # self.budget = max(0, self.budget + params.elapsed * self.smooth_rate_limit.smooth_total(params.time)) - #if self.smooth_filled.smooth_total(params.time) >= 0.1: - #self.budget += params.elapsed * self.smooth_rate_limit.smooth_total(params.time) + # if 
self.smooth_filled.smooth_total(params.time) >= 0.1: + # self.budget += params.elapsed * self.smooth_rate_limit.smooth_total(params.time) - #print('Update limit: time=%f, priority=%s, limit=%f, rate=%f, released=%f, budget=%f' % (params.time, self.priority, self.limit, self.smooth_rate_limit.smooth_total(params.time), self.smooth_released.smooth_rate(params.time), self.budget)) + # print('Update limit: time=%f, priority=%s, limit=%f, rate=%f, released=%f, budget=%f' % (params.time, self.priority, self.limit, self.smooth_rate_limit.smooth_total(params.time), self.smooth_released.smooth_rate(params.time), self.budget)) def can_start(self, params): - return params.num_started + params.count <= self.limit + self.budget #or params.num_started + params.count <= self.budget + return params.num_started + params.count <= self.limit + self.budget # or params.num_started + params.count <= self.budget def update_budget(self, params): self.budget = max(0, self.budget + (self.limit - params.num_started_at_priority) / 2 * params.elapsed) @@ -220,6 +232,7 @@ class SmoothingBudgetLimiter(SmoothingLimiter): self.smooth_released.add_delta(params.time, params.num_started_at_priority) + class ProxyModel: class Results: def __init__(self, priorities, duration): @@ -228,11 +241,11 @@ class ProxyModel: self.latencies = self.init_result(priorities, [], duration) self.unprocessed_queue_sizes = self.init_result(priorities, [], duration) - self.rate = {p:{} for p in priorities} - self.released = {p:{} for p in priorities} - self.limit = {p:{} for p in priorities} - self.limit_and_budget = {p:{} for p in priorities} - self.budget = {p:{} for p in priorities} + self.rate = {p: {} for p in priorities} + self.released = {p: {} for p in priorities} + self.limit = {p: {} for p in priorities} + self.limit_and_budget = {p: {} for p in priorities} + self.budget = {p: {} for p in priorities} def init_result(self, priorities, starting_value, duration): return {p: {s: copy.copy(starting_value) for s in 
range(0, duration)} for p in priorities} @@ -241,9 +254,10 @@ class ProxyModel: self.time = 0 self.log_time = 0 self.duration = duration - self.priority_limiters = { priority: Limiter(priority, ratekeeper_model, self) for priority in workload_model.priorities() } + self.priority_limiters = {priority: Limiter(priority, ratekeeper_model, self) for priority in + workload_model.priorities()} self.workload_model = workload_model - self.request_scheduled = { p: False for p in self.workload_model.priorities()} + self.request_scheduled = {p: False for p in self.workload_model.priorities()} self.tasks = [] self.request_queue = [] @@ -256,13 +270,14 @@ class ProxyModel: for priority in self.workload_model.priorities(): next_request = self.workload_model.next_request(self.time, priority) assert next_request is not None - heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request))) + heapq.heappush(self.tasks, Task(next_request.time, + lambda next_request=next_request: self.receive_request(next_request))) self.request_scheduled[priority] = True - while True:# or len(self.request_queue) > 0: + while True: # or len(self.request_queue) > 0: if int(self.time) > self.log_time: self.log_time = int(self.time) - #print(self.log_time) + # print(self.log_time) task = heapq.heappop(self.tasks) self.time = task.time @@ -294,14 +309,15 @@ class ProxyModel: limiter.update_limit(Limiter.UpdateLimitParams(self.time, elapsed)) current_started = 0 - started = {p:0 for p in self.workload_model.priorities()} + started = {p: 0 for p in self.workload_model.priorities()} min_priority = Priority.SYSTEM last_batch = 0 while len(self.request_queue) > 0: request = self.request_queue[0] - if not self.priority_limiters[request.priority].can_start(Limiter.CanStartParams(self.time, current_started, request.count)): + if not self.priority_limiters[request.priority].can_start( + Limiter.CanStartParams(self.time, current_started, request.count)): 
break min_priority = request.priority @@ -310,7 +326,8 @@ class ProxyModel: if self.workload_model.request_completed(request) and not self.request_scheduled[request.priority]: next_request = self.workload_model.next_request(self.time, request.priority) assert next_request is not None - heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request))) + heapq.heappush(self.tasks, Task(next_request.time, + lambda next_request=next_request: self.receive_request(next_request))) self.request_scheduled[request.priority] = True current_started += request.count @@ -318,21 +335,23 @@ class ProxyModel: heapq.heappop(self.request_queue) self.results.started[request.priority][int(self.time)] += request.count - self.results.latencies[request.priority][int(self.time)].append(self.time-request.time) + self.results.latencies[request.priority][int(self.time)].append(self.time - request.time) if len(self.request_queue) == 0: min_priority = Priority.BATCH for priority, limiter in self.priority_limiters.items(): - started_at_priority = sum([v for p,v in started.items() if p <= priority]) - limiter.update_budget(Limiter.UpdateBudgetParams(self.time, current_started, started_at_priority, min_priority, last_batch, len(self.request_queue) == 0 or self.request_queue[0].priority > priority, elapsed)) - + started_at_priority = sum([v for p, v in started.items() if p <= priority]) + limiter.update_budget( + Limiter.UpdateBudgetParams(self.time, current_started, started_at_priority, min_priority, last_batch, + len(self.request_queue) == 0 or self.request_queue[0].priority > priority, + elapsed)) + for priority in self.workload_model.priorities(): - self.results.unprocessed_queue_sizes[priority][int(self.time)].append(self.workload_model.workload_models[priority].outstanding) + self.results.unprocessed_queue_sizes[priority][int(self.time)].append( + self.workload_model.workload_models[priority].outstanding) current_time = self.time delay = 
0.001 heapq.heappush(self.tasks, Task(self.time + delay, lambda: self.process_requests(current_time))) - - diff --git a/contrib/grv_proxy_model/rate_model.py b/contrib/grv_proxy_model/rate_model.py index 1fabce2c7e..41e7ee8f16 100755 --- a/contrib/grv_proxy_model/rate_model.py +++ b/contrib/grv_proxy_model/rate_model.py @@ -20,6 +20,7 @@ import numpy + class RateModel: def __init__(self): pass @@ -27,6 +28,7 @@ class RateModel: def get_rate(self, time): pass + class FixedRateModel(RateModel): def __init__(self, rate): RateModel.__init__(self) @@ -35,10 +37,12 @@ class FixedRateModel(RateModel): def get_rate(self, time): return self.rate + class UnlimitedRateModel(FixedRateModel): def __init__(self): self.rate = 1e9 + class IntervalRateModel(RateModel): def __init__(self, intervals): self.intervals = sorted(intervals) @@ -46,16 +50,17 @@ class IntervalRateModel(RateModel): def get_rate(self, time): if len(self.intervals) == 0 or time < self.intervals[0][0]: return 0 - - target_interval = len(self.intervals)-1 + + target_interval = len(self.intervals) - 1 for i in range(1, len(self.intervals)): if time < self.intervals[i][0]: - target_interval = i-1 + target_interval = i - 1 break self.intervals = self.intervals[target_interval:] return self.intervals[0][1] + class SawtoothRateModel(RateModel): def __init__(self, low, high, frequency): self.low = low @@ -63,11 +68,12 @@ class SawtoothRateModel(RateModel): self.frequency = frequency def get_rate(self, time): - if int(2*time/self.frequency) % 2 == 0: + if int(2 * time / self.frequency) % 2 == 0: return self.low else: return self.high + class DistributionRateModel(RateModel): def __init__(self, distribution, frequency): self.distribution = distribution diff --git a/contrib/grv_proxy_model/ratekeeper_model.py b/contrib/grv_proxy_model/ratekeeper_model.py index 57125dc4c0..96a5cff2bb 100755 --- a/contrib/grv_proxy_model/ratekeeper_model.py +++ b/contrib/grv_proxy_model/ratekeeper_model.py @@ -22,6 +22,7 @@ import numpy 
import rate_model from priority import Priority + class RatekeeperModel: def __init__(self, limit_models): self.limit_models = limit_models @@ -29,39 +30,40 @@ class RatekeeperModel: def get_limit(self, time, priority): return self.limit_models[priority].get_rate(time) + predefined_ratekeeper = {} predefined_ratekeeper['default200_batch100'] = RatekeeperModel( -{ - Priority.SYSTEM: rate_model.UnlimitedRateModel(), - Priority.DEFAULT: rate_model.FixedRateModel(200), - Priority.BATCH: rate_model.FixedRateModel(100) -}) + { + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(200), + Priority.BATCH: rate_model.FixedRateModel(100) + }) predefined_ratekeeper['default_sawtooth'] = RatekeeperModel( -{ - Priority.SYSTEM: rate_model.UnlimitedRateModel(), - Priority.DEFAULT: rate_model.SawtoothRateModel(10, 200, 1), - Priority.BATCH: rate_model.FixedRateModel(0) -}) + { + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.SawtoothRateModel(10, 200, 1), + Priority.BATCH: rate_model.FixedRateModel(0) + }) predefined_ratekeeper['default_uniform_random'] = RatekeeperModel( -{ - Priority.SYSTEM: rate_model.UnlimitedRateModel(), - Priority.DEFAULT: rate_model.DistributionRateModel(lambda: numpy.random.uniform(10, 200), 1), - Priority.BATCH: rate_model.FixedRateModel(0) -}) + { + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.DistributionRateModel(lambda: numpy.random.uniform(10, 200), 1), + Priority.BATCH: rate_model.FixedRateModel(0) + }) predefined_ratekeeper['default_trickle'] = RatekeeperModel( -{ - Priority.SYSTEM: rate_model.UnlimitedRateModel(), - Priority.DEFAULT: rate_model.FixedRateModel(3), - Priority.BATCH: rate_model.FixedRateModel(0) -}) + { + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(3), + Priority.BATCH: rate_model.FixedRateModel(0) + }) predefined_ratekeeper['default1000'] = RatekeeperModel( -{ - 
Priority.SYSTEM: rate_model.UnlimitedRateModel(), - Priority.DEFAULT: rate_model.FixedRateModel(1000), - Priority.BATCH: rate_model.FixedRateModel(500) -}) + { + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(1000), + Priority.BATCH: rate_model.FixedRateModel(500) + }) diff --git a/contrib/grv_proxy_model/smoother.py b/contrib/grv_proxy_model/smoother.py index bc1b32ea12..70473f3df0 100644 --- a/contrib/grv_proxy_model/smoother.py +++ b/contrib/grv_proxy_model/smoother.py @@ -20,6 +20,7 @@ import math + class Smoother: def __init__(self, folding_time): self.folding_time = folding_time @@ -28,10 +29,10 @@ class Smoother: def reset(self, value): self.time = 0 self.total = value - self.estimate = value - + self.estimate = value + def set_total(self, time, total): - self.add_delta(time, total-self.total) + self.add_delta(time, total - self.total) def add_delta(self, time, delta): self.update(time) @@ -43,11 +44,10 @@ class Smoother: def smooth_rate(self, time): self.update(time) - return (self.total-self.estimate) / self.folding_time + return (self.total - self.estimate) / self.folding_time def update(self, time): elapsed = time - self.time if elapsed > 0: self.time = time - self.estimate += (self.total-self.estimate) * (1-math.exp(-elapsed/self.folding_time)) - + self.estimate += (self.total - self.estimate) * (1 - math.exp(-elapsed / self.folding_time)) diff --git a/contrib/grv_proxy_model/workload_model.py b/contrib/grv_proxy_model/workload_model.py index 63fb4c472e..2ec5aa4e63 100755 --- a/contrib/grv_proxy_model/workload_model.py +++ b/contrib/grv_proxy_model/workload_model.py @@ -25,6 +25,7 @@ import math import rate_model from priority import Priority + @functools.total_ordering class Request: def __init__(self, time, count, priority): @@ -35,6 +36,7 @@ class Request: def __lt__(self, other): return self.priority < other.priority + class PriorityWorkloadModel: def __init__(self, priority, rate_model, batch_model, 
generator, max_outstanding=1e9): self.priority = priority @@ -59,6 +61,7 @@ class PriorityWorkloadModel: return was_full and self.outstanding < self.max_outstanding + class WorkloadModel: def __init__(self, workload_models): self.workload_models = workload_models @@ -72,10 +75,17 @@ class WorkloadModel: def request_completed(self, request): return self.workload_models[request.priority].request_completed(request) + class Distribution: - EXPONENTIAL = lambda x: numpy.random.exponential(x) - UNIFORM = lambda x: numpy.random.uniform(0, 2.0*x) - FIXED = lambda x: x + def exponential(x): + return numpy.random.exponential(x) + + def uniform(x): + return numpy.random.uniform(0, 2.0 * x) + + def fixed(x): + return x + class BatchGenerator: def __init__(self): @@ -84,6 +94,7 @@ class BatchGenerator: def next_batch(self): pass + class DistributionBatchGenerator(BatchGenerator): def __init__(self, distribution, size): BatchGenerator.__init__(self) @@ -93,6 +104,7 @@ class DistributionBatchGenerator(BatchGenerator): def next_batch(self): return math.ceil(self.distribution(self.size)) + class RequestGenerator: def __init__(self): pass @@ -100,6 +112,7 @@ class RequestGenerator: def next_request_interval(self, rate): pass + class DistributionRequestGenerator(RequestGenerator): def __init__(self, distribution): RequestGenerator.__init__(self) @@ -109,93 +122,94 @@ class DistributionRequestGenerator(RequestGenerator): if rate == 0: return 1e9 - return self.distribution(1.0/rate) + return self.distribution(1.0 / rate) + predefined_workloads = {} predefined_workloads['slow_exponential'] = WorkloadModel( -{ - Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, - rate_model.FixedRateModel(100), - DistributionBatchGenerator(Distribution.FIXED, 1), - DistributionRequestGenerator(Distribution.EXPONENTIAL), - max_outstanding=100 - ) -}) + { + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(100), + DistributionBatchGenerator(Distribution.fixed, 
1), + DistributionRequestGenerator(Distribution.exponential), + max_outstanding=100 + ) + }) predefined_workloads['fixed_uniform'] = WorkloadModel( -{ - Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, - rate_model.FixedRateModel(0), - DistributionBatchGenerator(Distribution.FIXED, 1), - DistributionRequestGenerator(Distribution.UNIFORM), - max_outstanding=10 - ), - Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, - rate_model.FixedRateModel(95), - DistributionBatchGenerator(Distribution.FIXED, 10), - DistributionRequestGenerator(Distribution.UNIFORM), - max_outstanding=200 - ), - Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, - rate_model.FixedRateModel(1), - DistributionBatchGenerator(Distribution.UNIFORM, 500), - DistributionRequestGenerator(Distribution.UNIFORM), - max_outstanding=200 - ) -}) + { + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.fixed, 1), + DistributionRequestGenerator(Distribution.uniform), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(95), + DistributionBatchGenerator(Distribution.fixed, 10), + DistributionRequestGenerator(Distribution.uniform), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + rate_model.FixedRateModel(1), + DistributionBatchGenerator(Distribution.uniform, 500), + DistributionRequestGenerator(Distribution.uniform), + max_outstanding=200 + ) + }) predefined_workloads['batch_starvation'] = WorkloadModel( -{ - Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, - rate_model.FixedRateModel(1), - DistributionBatchGenerator(Distribution.FIXED, 1), - DistributionRequestGenerator(Distribution.UNIFORM), - max_outstanding=10 - ), - Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, - rate_model.IntervalRateModel([(0,50), (60,150), (120,90)]), - DistributionBatchGenerator(Distribution.FIXED, 1), - 
DistributionRequestGenerator(Distribution.UNIFORM), - max_outstanding=200 - ), - Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, - rate_model.FixedRateModel(100), - DistributionBatchGenerator(Distribution.FIXED, 1), - DistributionRequestGenerator(Distribution.UNIFORM), - max_outstanding=200 - ) -}) + { + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(1), + DistributionBatchGenerator(Distribution.fixed, 1), + DistributionRequestGenerator(Distribution.uniform), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.IntervalRateModel([(0, 50), (60, 150), (120, 90)]), + DistributionBatchGenerator(Distribution.fixed, 1), + DistributionRequestGenerator(Distribution.uniform), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + rate_model.FixedRateModel(100), + DistributionBatchGenerator(Distribution.fixed, 1), + DistributionRequestGenerator(Distribution.uniform), + max_outstanding=200 + ) + }) predefined_workloads['default_low_high_low'] = WorkloadModel( -{ - Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, - rate_model.FixedRateModel(0), - DistributionBatchGenerator(Distribution.FIXED, 1), - DistributionRequestGenerator(Distribution.UNIFORM), - max_outstanding=10 - ), - Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, - rate_model.IntervalRateModel([(0,100), (60,300), (120,100)]), - DistributionBatchGenerator(Distribution.FIXED, 1), - DistributionRequestGenerator(Distribution.UNIFORM), - max_outstanding=200 - ), - Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, - rate_model.FixedRateModel(0), - DistributionBatchGenerator(Distribution.FIXED, 1), - DistributionRequestGenerator(Distribution.UNIFORM), - max_outstanding=200 - ) -}) + { + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.fixed, 1), + DistributionRequestGenerator(Distribution.uniform), + 
max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.IntervalRateModel([(0, 100), (60, 300), (120, 100)]), + DistributionBatchGenerator(Distribution.fixed, 1), + DistributionRequestGenerator(Distribution.uniform), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.fixed, 1), + DistributionRequestGenerator(Distribution.uniform), + max_outstanding=200 + ) + }) for rate in [83, 100, 180, 190, 200]: predefined_workloads['default%d' % rate] = WorkloadModel( - { - Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, - rate_model.FixedRateModel(rate), - DistributionBatchGenerator(Distribution.FIXED, 1), - DistributionRequestGenerator(Distribution.EXPONENTIAL), - max_outstanding=1000 - ) - }) + { + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(rate), + DistributionBatchGenerator(Distribution.fixed, 1), + DistributionRequestGenerator(Distribution.exponential), + max_outstanding=1000 + ) + }) diff --git a/contrib/lsan.suppressions b/contrib/lsan.suppressions new file mode 100644 index 0000000000..3d3d40e59e --- /dev/null +++ b/contrib/lsan.suppressions @@ -0,0 +1,5 @@ +# LeakSanitizer suppressions file for FDB +# https://github.com/google/sanitizers/wiki/AddressSanitizerLeakSanitizer + +# Not all incoming connections are cleanly shut down in client API tests +leak:ConnectionReaderActorState diff --git a/contrib/monitoring/fdb_c_version.py b/contrib/monitoring/fdb_c_version.py index 059d1c7d9b..c56759370e 100755 --- a/contrib/monitoring/fdb_c_version.py +++ b/contrib/monitoring/fdb_c_version.py @@ -24,10 +24,12 @@ import sys import platform import os + def error(message): print(message) sys.exit(1) + def get_version_string(library_path): try: lib = ctypes.cdll.LoadLibrary(library_path) @@ -58,6 +60,7 @@ def get_version_string(library_path): return version_str + if __name__ == 
'__main__': if platform.system() == 'Linux': default_lib = 'libfdb_c.so' diff --git a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py index 79534596b5..0d99e6ff39 100644 --- a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py +++ b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py @@ -28,7 +28,6 @@ optional packages: sortedcontainers (for estimating key range read/write density) """ - import argparse from collections import defaultdict from enum import Enum @@ -55,7 +54,6 @@ supported_protocol_versions = frozenset([PROTOCOL_VERSION_5_2, PROTOCOL_VERSION_ PROTOCOL_VERSION_6_2, PROTOCOL_VERSION_6_3, PROTOCOL_VERSION_7_0, PROTOCOL_VERSION_7_1, PROTOCOL_VERSION_7_2]) - fdb.api_version(520) BASIC_FORMAT = "%(asctime)s - %(levelname)-8s %(message)s" @@ -188,6 +186,7 @@ class BaseInfo(object): """ Corresponds to FdbClientLogEvents::Event """ + def __init__(self, bb, protocol_version): # we already read the EventType, so go straight to start_timestamp self.start_timestamp = bb.get_double() @@ -197,6 +196,7 @@ class BaseInfo(object): if bb.get_bool(): self.tenant = bb.get_bytes_with_length() + class GetVersionInfo(BaseInfo): def __init__(self, bb, protocol_version): super().__init__(bb, protocol_version) @@ -206,6 +206,7 @@ class GetVersionInfo(BaseInfo): if protocol_version >= PROTOCOL_VERSION_6_3: self.read_version = bb.get_long() + class GetInfo(BaseInfo): def __init__(self, bb, protocol_version): super().__init__(bb, protocol_version) @@ -244,11 +245,11 @@ class CommitInfo(BaseInfo): self.read_snapshot_version = bb.get_long() if protocol_version >= PROTOCOL_VERSION_6_3: self.report_conflicting_keys = bb.get_bool() - + if protocol_version >= PROTOCOL_VERSION_7_1: - lock_aware = bb.get_bool() + self.lock_aware = bb.get_bool() if bb.get_bool(): - spanId = bb.get_bytes(16) + self.spanId = bb.get_bytes(16) class 
ErrorGetInfo(BaseInfo): @@ -285,9 +286,9 @@ class ErrorCommitInfo(BaseInfo): self.report_conflicting_keys = bb.get_bool() if protocol_version >= PROTOCOL_VERSION_7_1: - lock_aware = bb.get_bool() + self.lock_aware = bb.get_bool() if bb.get_bool(): - spanId = bb.get_bytes(16) + self.spanId = bb.get_bytes(16) class UnsupportedProtocolVersionError(Exception): @@ -314,52 +315,57 @@ class ClientTransactionInfo: if event == 0: # we need to read it to consume the buffer even if we don't want to store it get_version = GetVersionInfo(bb, protocol_version) - if (not type_filter or "get_version" in type_filter): + if not type_filter or "get_version" in type_filter: self.get_version = get_version elif event == 1: get = GetInfo(bb, protocol_version) - if (not type_filter or "get" in type_filter): + if not type_filter or "get" in type_filter: # because of the crappy json serializtion using __dict__ we have to set the list here otherwise # it doesn't print - if not self.gets: self.gets = [] + if not self.gets: + self.gets = [] self.gets.append(get) elif event == 2: get_range = GetRangeInfo(bb, protocol_version) - if (not type_filter or "get_range" in type_filter): - if not self.get_ranges: self.get_ranges = [] + if not type_filter or "get_range" in type_filter: + if not self.get_ranges: + self.get_ranges = [] self.get_ranges.append(get_range) elif event == 3: commit = CommitInfo(bb, protocol_version, full_output=full_output) - if (not type_filter or "commit" in type_filter): + if not type_filter or "commit" in type_filter: self.commit = commit elif event == 4: error_get = ErrorGetInfo(bb, protocol_version) - if (not type_filter or "error_gets" in type_filter): - if not self.error_gets: self.error_gets = [] + if not type_filter or "error_gets" in type_filter: + if not self.error_gets: + self.error_gets = [] self.error_gets.append(error_get) elif event == 5: error_get_range = ErrorGetRangeInfo(bb, protocol_version) - if (not type_filter or "error_get_range" in type_filter): - if 
not self.error_get_ranges: self.error_get_ranges = [] + if not type_filter or "error_get_range" in type_filter: + if not self.error_get_ranges: + self.error_get_ranges = [] self.error_get_ranges.append(error_get_range) elif event == 6: error_commit = ErrorCommitInfo(bb, protocol_version, full_output=full_output) - if (not type_filter or "error_commit" in type_filter): - if not self.error_commits: self.error_commits = [] + if not type_filter or "error_commit" in type_filter: + if not self.error_commits: + self.error_commits = [] self.error_commits.append(error_commit) else: raise Exception("Unknown event type %d" % event) def has_types(self): - return self.get_version or self.gets or self.get_ranges or self.commit or self.error_gets \ - or self.error_get_ranges or self.error_commits + return self.get_version or self.gets or self.get_ranges or self.commit \ + or self.error_gets or self.error_get_ranges or self.error_commits def to_json(self): return json.dumps(self, cls=ObjJsonEncoder, sort_keys=True) class TransactionInfoLoader(object): - max_num_chunks_to_store = 1000 # Each chunk would be 100 KB in size + max_num_chunks_to_store = 1000 # Each chunk would be 100 KB in size def __init__(self, db, full_output=True, type_filter=None, min_timestamp=None, max_timestamp=None): self.db = db @@ -433,7 +439,7 @@ class TransactionInfoLoader(object): reverse = False for k, v in tr.snapshot.get_range(start_key, end_key, limit=1, reverse=reverse): return fdb.tuple.unpack(v)[0] - return 0 if start else 0x8000000000000000 # we didn't find any timekeeper data so find the max range + return 0 if start else 0x8000000000000000 # we didn't find any timekeeper data so find the max range def fetch_transaction_info(self): if self.min_timestamp: @@ -469,12 +475,12 @@ class TransactionInfoLoader(object): streaming_mode=fdb.impl.StreamingMode.want_all) for k, v in transaction_info_range: found += 1 - #logger.debug(k) + # logger.debug(k) start_key = fdb.KeySelector.first_greater_than(k) _, 
tr_id, num_chunks, chunk_num = self.parse_key(k) - #logger.debug("num_chunks=%d, chunk_num=%d" % (num_chunks,chunk_num)) + # logger.debug("num_chunks=%d, chunk_num=%d" % (num_chunks,chunk_num)) if num_chunks == 1: assert chunk_num == 1 @@ -482,7 +488,7 @@ class TransactionInfoLoader(object): info = build_client_transaction_info(v) if info.has_types(): buffer.append(info) - except UnsupportedProtocolVersionError as e: + except UnsupportedProtocolVersionError: invalid_transaction_infos += 1 except ValueError: invalid_transaction_infos += 1 @@ -497,7 +503,8 @@ class TransactionInfoLoader(object): self._check_and_adjust_chunk_cache_size() else: if tr_id not in self.tr_info_map: - logger.error("Got a middle chunk without getting beginning part. Discarding transaction id: %s\n" % tr_id) + logger.error( + "Got a middle chunk without getting beginning part. Discarding transaction id: %s\n" % tr_id) continue c_list = self.tr_info_map[tr_id] if c_list[-1].num_chunks != num_chunks or c_list[-1].chunk_num != chunk_num - 1: @@ -513,7 +520,7 @@ class TransactionInfoLoader(object): info = build_client_transaction_info(b''.join([chunk.value for chunk in c_list])) if info.has_types(): buffer.append(info) - except UnsupportedProtocolVersionError as e: + except UnsupportedProtocolVersionError: invalid_transaction_infos += 1 except ValueError: invalid_transaction_infos += 1 @@ -553,6 +560,7 @@ def has_dateparser(): logger.warn("Can't find dateparser so disabling human date parsing") return False + class ReadCounter(object): def __init__(self): from sortedcontainers import SortedDict @@ -560,7 +568,7 @@ class ReadCounter(object): self.reads[b''] = [0, 0] self.read_counts = {} - self.hit_count=0 + self.hit_count = 0 def process(self, transaction_info): for get in transaction_info.gets: @@ -576,7 +584,7 @@ class ReadCounter(object): if end_key is not None: self.reads.setdefault(end_key, [0, 0])[1] += 1 else: - self.reads.setdefault(start_key+b'\x00', [0, 0])[1] += 1 + 
self.reads.setdefault(start_key + b'\x00', [0, 0])[1] += 1 def get_total_reads(self): return sum([v for v in self.read_counts.values()]) @@ -673,8 +681,8 @@ class ShardFinder(object): self.shard_cache = {} def _get_boundary_keys(self, begin, end): - start_pos = max(0, bisect_right(self.boundary_keys, begin)-1) - end_pos = max(0, bisect_right(self.boundary_keys, end)-1) + start_pos = max(0, bisect_right(self.boundary_keys, begin) - 1) + end_pos = max(0, bisect_right(self.boundary_keys, end) - 1) return self.boundary_keys[start_pos:end_pos] @@ -691,9 +699,9 @@ class ShardFinder(object): return len(self._get_boundary_keys(start_key, end_key)) + 1 def get_addresses_for_key(self, key): - shard = self.boundary_keys[max(0, bisect_right(self.boundary_keys, key)-1)] + shard = self.boundary_keys[max(0, bisect_right(self.boundary_keys, key) - 1)] do_load = False - if not shard in self.shard_cache: + if shard not in self.shard_cache: do_load = True elif self.shard_cache[shard].is_ready(): try: @@ -708,7 +716,7 @@ class ShardFinder(object): for f in self.outstanding: try: f.wait() - except fdb.FDBError as e: + except fdb.FDBError: pass self.outstanding = [] @@ -726,10 +734,13 @@ class ShardFinder(object): if item[addr_idx] is not None: while True: try: - ranges[index] = item[0:addr_idx] + ([a.decode('ascii') for a in item[addr_idx].wait()],) + item[addr_idx+1:] + ranges[index] = item[0:addr_idx] + ([a.decode('ascii') for a in item[addr_idx].wait()],) \ + + item[addr_idx + 1:] break - except fdb.FDBError as e: - ranges[index] = item[0:addr_idx] + (self.get_addresses_for_key(item[key_idx]),) + item[addr_idx+1:] + except fdb.FDBError: + ranges[index] = item[0:addr_idx] + (self.get_addresses_for_key(item[key_idx]),) \ + + item[addr_idx + 1:] + class WriteCounter(object): mutation_types_to_consider = frozenset([MutationType.SET_VALUE, MutationType.ADD_VALUE]) @@ -795,10 +806,11 @@ class WriteCounter(object): filter_addresses = set(filter_addresses) results = [r for r in results if 
filter_addresses.issubset(set(r[3]))][0:num] else: - results = [(key, end, count) for (count, key) in count_pairs[0:num]] + results = [(key, None, count) for (count, key) in count_pairs[0:num]] return results + def connect(cluster_file=None): db = fdb.open(cluster_file=cluster_file) return db @@ -831,22 +843,34 @@ def main(): end_time_group = parser.add_mutually_exclusive_group() end_time_group.add_argument("--max-timestamp", type=int, help="Don't return events newer than this epoch time") end_time_group.add_argument("-e", "--end-time", type=str, help="Don't return events older than this parsed time") - parser.add_argument("--num-buckets", type=int, help="The number of buckets to partition the key-space into for operation counts", default=100) - parser.add_argument("--top-requests", type=int, help="If specified will output this many top keys for reads or writes", default=0) - parser.add_argument("--exclude-ports", action="store_true", help="Print addresses without the port number. Only works in versions older than 6.3, and is required in versions older than 6.2.") - parser.add_argument("--single-shard-ranges-only", action="store_true", help="Only print range boundaries that exist in a single shard") - parser.add_argument("-a", "--filter-address", action="append", help="Only print range boundaries that include the given address. This option can used multiple times to include more than one address in the filter, in which case all addresses must match.") + parser.add_argument("--num-buckets", type=int, + help="The number of buckets to partition the key-space into for operation counts", default=100) + parser.add_argument("--top-requests", type=int, + help="If specified will output this many top keys for reads or writes", default=0) + parser.add_argument("--exclude-ports", action="store_true", + help="Print addresses without the port number. 
Only works in versions older than 6.3, and is required in versions older than 6.2.") + parser.add_argument("--single-shard-ranges-only", action="store_true", + help="Only print range boundaries that exist in a single shard") + parser.add_argument("-a", "--filter-address", action="append", + help="Only print range boundaries that include the given address. This option can used multiple times to include more than one address in the filter, in which case all addresses must match.") args = parser.parse_args() type_filter = set() - if args.filter_get_version: type_filter.add("get_version") - if args.filter_get or args.filter_reads: type_filter.add("get") - if args.filter_get_range or args.filter_reads: type_filter.add("get_range") - if args.filter_commit: type_filter.add("commit") - if args.filter_error_get: type_filter.add("error_get") - if args.filter_error_get_range: type_filter.add("error_get_range") - if args.filter_error_commit: type_filter.add("error_commit") + if args.filter_get_version: + type_filter.add("get_version") + if args.filter_get or args.filter_reads: + type_filter.add("get") + if args.filter_get_range or args.filter_reads: + type_filter.add("get_range") + if args.filter_commit: + type_filter.add("commit") + if args.filter_error_get: + type_filter.add("error_get") + if args.filter_error_get_range: + type_filter.add("error_get_range") + if args.filter_error_commit: + type_filter.add("error_commit") if (not type_filter or "commit" in type_filter): write_counter = WriteCounter() if args.num_buckets else None @@ -912,7 +936,8 @@ def main(): else: op_str = 'Key %r' % start - print(" %d. %s\n %d sampled %s (%.2f%%, %.2f%% cumulative)" % (idx+1, op_str, count, context, 100*count/total, 100*running_count/total)) + print(" %d. 
%s\n %d sampled %s (%.2f%%, %.2f%% cumulative)" % ( + idx + 1, op_str, count, context, 100 * count / total, 100 * running_count / total)) print(" shard addresses: %s\n" % ", ".join(addresses)) else: @@ -933,10 +958,10 @@ def main(): if not omit: if omit_start is not None: - if omit_start == idx-1: + if omit_start == idx - 1: print(" %d. Omitted\n" % (idx)) else: - print(" %d - %d. Omitted\n" % (omit_start+1, idx)) + print(" %d - %d. Omitted\n" % (omit_start + 1, idx)) omit_start = None if total_count is None: @@ -944,18 +969,19 @@ def main(): else: count_str = '%d sampled %s (%d intersecting)' % (start_count, context, total_count) if not shard_count: - print(" %d. [%s, %s]\n %d sampled %s\n" % (idx+1, start, end, count, context)) + print(" %d. [%s, %s]\n %s\n" % (idx + 1, start, end, count_str)) else: addresses_string = "; addresses=%s" % ', '.join(addresses) if addresses else '' - print(" %d. [%s, %s]\n %s spanning %d shard(s)%s\n" % (idx+1, start, end, count_str, shard_count, addresses_string)) + print(" %d. [%s, %s]\n %s spanning %d shard(s)%s\n" % ( + idx + 1, start, end, count_str, shard_count, addresses_string)) elif omit_start is None: omit_start = idx if omit_start is not None: - if omit_start == len(range_boundaries)-1: + if omit_start == len(range_boundaries) - 1: print(" %d. Omitted\n" % len(range_boundaries)) else: - print(" %d - %d. Omitted\n" % (omit_start+1, len(range_boundaries))) + print(" %d - %d. 
Omitted\n" % (omit_start + 1, len(range_boundaries))) shard_finder = ShardFinder(db, args.exclude_ports) @@ -963,7 +989,8 @@ def main(): if write_counter: if args.top_requests: - top_writes = write_counter.get_top_k_writes(args.top_requests, args.filter_address, shard_finder=shard_finder) + top_writes = write_counter.get_top_k_writes(args.top_requests, args.filter_address, + shard_finder=shard_finder) range_boundaries = write_counter.get_range_boundaries(args.num_buckets, shard_finder=shard_finder) num_writes = write_counter.get_total_writes() @@ -1014,5 +1041,6 @@ def main(): print("Key-space boundaries with approximately equal read counts:\n") print_range_boundaries(range_boundaries, "reads") + if __name__ == "__main__": main() diff --git a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer_tests.py b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer_tests.py index 9b90ef1c70..eed8108782 100755 --- a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer_tests.py +++ b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer_tests.py @@ -105,8 +105,8 @@ class RangeCounterTest(unittest.TestCase): assert rc_count == v, "Counts for %s mismatch. 
Expected %d got %d" % (k, v, rc_count) for _ in range(0, 100): - i = random.randint(0, len(letters)-1) - j = random.randint(0, len(letters)-2) + i = random.randint(0, len(letters) - 1) + j = random.randint(0, len(letters) - 2) if i == j: j += 1 start_index = min(i, j) @@ -123,4 +123,4 @@ class RangeCounterTest(unittest.TestCase): if __name__ == "__main__": - unittest.main() # run all tests + unittest.main() # run all tests diff --git a/documentation/sphinx/source/client-testing.rst b/documentation/sphinx/source/client-testing.rst index 95126a5711..2e130e6948 100644 --- a/documentation/sphinx/source/client-testing.rst +++ b/documentation/sphinx/source/client-testing.rst @@ -321,7 +321,7 @@ and pass the test with ``-f``: Running a Workload on an actual Cluster ======================================= -Running a workload on a cluster works basically the smae way. However, one must +Running a workload on a cluster works basically the same way. However, one must actually setup a cluster first. This cluster must run between one and many server processes with the class test. 
So above 2-step process becomes a bit more complex: diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 4efbfa32c2..70c487be0b 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -890,8 +890,18 @@ } } }, - "tenants":{ - "num_tenants":0 + "metacluster" : { + "cluster_type" : "management", // management, data, or standalone + "metacluster_name" : "metacluster1", + "metacluster_id" : 12345, + "data_cluster_name" : "data_cluster1", // data cluster only + "data_cluster_id" : 12346, // data cluster only + "num_data_clusters": 10 // management cluster only + }, + "tenants" : { + "num_tenants" : 1, // on data cluster, local count; on management cluster, total metacluster count + "num_tenant_groups" : 10, + "tenant_group_capacity" : 20, } }, "client":{ diff --git a/fdbcli/BlobRestoreCommand.actor.cpp b/fdbcli/BlobRestoreCommand.actor.cpp index fad60d7d74..5738782602 100644 --- a/fdbcli/BlobRestoreCommand.actor.cpp +++ b/fdbcli/BlobRestoreCommand.actor.cpp @@ -36,7 +36,8 @@ ACTOR Future blobRestoreCommandActor(Database localDb, std::vectorblobRestore(normalKeys))); if (success) { - fmt::print("Started blob restore for the full cluster. Please use 'status' command to check progress.\n"); + fmt::print( + "Started blob restore for the full cluster. 
Please use 'status details' command to check progress.\n"); } else { fmt::print("Fail to start a new blob restore while there is a pending one.\n"); } diff --git a/fdbcli/ConfigureCommand.actor.cpp b/fdbcli/ConfigureCommand.actor.cpp index 26a3da9876..cf1b0b4b73 100644 --- a/fdbcli/ConfigureCommand.actor.cpp +++ b/fdbcli/ConfigureCommand.actor.cpp @@ -326,7 +326,7 @@ CommandFactory configureFactory( "count=|perpetual_storage_wiggle=|perpetual_storage_wiggle_locality=" "<:|0>|storage_migration_type={disabled|gradual|aggressive}" "|tenant_mode={disabled|optional_experimental|required_experimental}|blob_granules_enabled={0|1}" - "|encryption_at_rest_mode={disabled|aes_256_ctr}", + "|encryption_at_rest_mode={disabled|domain_aware|cluster_aware}", "change the database configuration", "The `new' option, if present, initializes a new database with the given configuration rather than changing " "the configuration of an existing one. When used, both a redundancy mode and a storage engine must be " @@ -360,7 +360,8 @@ CommandFactory configureFactory( "tenant_mode=: Sets the tenant mode for the cluster. If " "optional, then transactions can be run with or without specifying tenants. If required, all data must be " "accessed using tenants.\n\n" - "encryption_at_rest_mode=: Sets the cluster encryption data at-rest support for the " + "encryption_at_rest_mode=: Sets the cluster encryption data at-rest " + "support for the " "database. 
The configuration can be updated ONLY at the time of database creation and once set can't be " "updated for the lifetime of the database.\n\n" diff --git a/fdbcli/StatusCommand.actor.cpp b/fdbcli/StatusCommand.actor.cpp index 494da41c38..03f523e489 100644 --- a/fdbcli/StatusCommand.actor.cpp +++ b/fdbcli/StatusCommand.actor.cpp @@ -1125,6 +1125,15 @@ void printStatus(StatusObjectReader statusObj, outputString += "\n Number of Workers - " + format("%d", numWorkers); auto numKeyRanges = statusObjBlobGranules["number_of_key_ranges"].get_int(); outputString += "\n Number of Key Ranges - " + format("%d", numKeyRanges); + if (statusObjCluster.has("blob_restore")) { + StatusObjectReader statusObjBlobRestore = statusObjCluster["blob_restore"]; + std::string restoreStatus = statusObjBlobRestore["blob_full_restore_phase"].get_str(); + if (statusObjBlobRestore.has("blob_full_restore_progress")) { + auto progress = statusObjBlobRestore["blob_full_restore_progress"].get_int(); + restoreStatus += " " + format("%d%%", progress); + } + outputString += "\n Full Restore - " + restoreStatus; + } } } diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 18abda3702..fb0680fb31 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -294,6 +294,7 @@ void ClientKnobs::initialize(Randomize randomize) { init( METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY, 1.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY = deterministicRandom()->random01() * 60; init( METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT, 10.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT = 1 + deterministicRandom()->random01() * 59; init( TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); + init( CLIENT_ENABLE_USING_CLUSTER_ID_KEY, false ); init( ENABLE_ENCRYPTION_CPU_TIME_LOGGING, false ); // clang-format on diff --git 
a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index e2ae1142ed..ca18a35df4 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -206,10 +206,12 @@ std::map configForToken(std::string const& mode) { EncryptionAtRestMode mode; if (value == "disabled") { mode = EncryptionAtRestMode::DISABLED; - } else if (value == "aes_256_ctr") { - mode = EncryptionAtRestMode::AES_256_CTR; + } else if (value == "domain_aware") { + mode = EncryptionAtRestMode::DOMAIN_AWARE; + } else if (value == "cluster_aware") { + mode = EncryptionAtRestMode::CLUSTER_AWARE; } else { - printf("Error: Only disabled|aes_256_ctr are valid for encryption_at_rest_mode.\n"); + printf("Error: Only disabled|domain_aware|cluster_aware are valid for encryption_at_rest_mode.\n"); return out; } out[p + key] = format("%d", mode); @@ -465,6 +467,168 @@ bool isCompleteConfiguration(std::map const& options) options.count(p + "storage_engine") == 1; } +/* + - Validates encryption and tenant mode configurations + - During cluster creation (configure new) we allow the following: + - If encryption mode is disabled/cluster_aware then any tenant mode is allowed + - If the encryption mode is domain_aware then the only allowed tenant mode is required + - During cluster configuration changes the following is allowed: + - Encryption mode cannot be changed (can only be set during creation) + - If the encryption mode is disabled/cluster_aware then any tenant mode changes are allowed + - If the encryption mode is domain_aware then tenant mode changes are not allowed (as the only supported mode is + required) +*/ +bool isEncryptionAtRestModeConfigValid(Optional oldConfiguration, + std::map newConfig, + bool creating) { + EncryptionAtRestMode encryptMode; + TenantMode tenantMode; + if (creating) { + if (newConfig.count(encryptionAtRestModeConfKey.toString()) != 0) { + encryptMode = EncryptionAtRestMode::fromValueRef( + 
ValueRef(newConfig.find(encryptionAtRestModeConfKey.toString())->second)); + // check if the tenant mode is being set during configure new (otherwise assume tenants are disabled) + if (newConfig.count(tenantModeConfKey.toString()) != 0) { + tenantMode = TenantMode::fromValue(ValueRef(newConfig.find(tenantModeConfKey.toString())->second)); + } + } + } else { + ASSERT(oldConfiguration.present()); + encryptMode = oldConfiguration.get().encryptionAtRestMode; + if (newConfig.count(tenantModeConfKey.toString()) != 0) { + tenantMode = TenantMode::fromValue(ValueRef(newConfig.find(tenantModeConfKey.toString())->second)); + } else { + // Tenant mode and encryption mode didn't change + return true; + } + } + TraceEvent(SevDebug, "EncryptAndTenantModes") + .detail("EncryptMode", encryptMode.toString()) + .detail("TenantMode", tenantMode.toString()); + + if (encryptMode.mode == EncryptionAtRestMode::DOMAIN_AWARE && tenantMode != TenantMode::REQUIRED) { + // For domain aware encryption only the required tenant mode is currently supported + TraceEvent(SevWarnAlways, "InvalidEncryptAndTenantConfiguration") + .detail("EncryptMode", encryptMode.toString()) + .detail("TenantMode", tenantMode.toString()); + return false; + } + + return true; +} + +bool isTenantModeModeConfigValid(DatabaseConfiguration oldConfiguration, DatabaseConfiguration newConfiguration) { + TenantMode oldTenantMode = oldConfiguration.tenantMode; + TenantMode newTenantMode = newConfiguration.tenantMode; + TraceEvent(SevDebug, "TenantModes") + .detail("OldTenantMode", oldTenantMode.toString()) + .detail("NewTenantMode", newTenantMode.toString()); + if (oldTenantMode != TenantMode::REQUIRED && newTenantMode == TenantMode::REQUIRED) { + // TODO: Changing from optional/disabled to required tenant mode should be allowed if there is no non-tenant + // data present + TraceEvent(SevWarnAlways, "InvalidTenantConfiguration") + .detail("OldTenantMode", oldTenantMode.toString()) + .detail("NewTenantMode", 
newTenantMode.toString()); + return false; + } + return true; +} + +TEST_CASE("/ManagementAPI/ChangeConfig/TenantMode") { + DatabaseConfiguration oldConfig; + DatabaseConfiguration newConfig; + std::vector tenantModes = { TenantMode::DISABLED, TenantMode::OPTIONAL_TENANT, TenantMode::REQUIRED }; + // required tenant mode can change to any other tenant mode + oldConfig.tenantMode = TenantMode::REQUIRED; + newConfig.tenantMode = deterministicRandom()->randomChoice(tenantModes); + ASSERT(isTenantModeModeConfigValid(oldConfig, newConfig)); + // optional/disabled tenant mode can switch to optional/disabled tenant mode + oldConfig.tenantMode = deterministicRandom()->coinflip() ? TenantMode::DISABLED : TenantMode::OPTIONAL_TENANT; + newConfig.tenantMode = deterministicRandom()->coinflip() ? TenantMode::DISABLED : TenantMode::OPTIONAL_TENANT; + ASSERT(isTenantModeModeConfigValid(oldConfig, newConfig)); + // optional/disabled tenant mode CANNOT switch to required tenant mode + oldConfig.tenantMode = deterministicRandom()->coinflip() ? 
TenantMode::DISABLED : TenantMode::OPTIONAL_TENANT; + newConfig.tenantMode = TenantMode::REQUIRED; + ASSERT(!isTenantModeModeConfigValid(oldConfig, newConfig)); + + return Void(); +} + +// unit test for changing encryption/tenant mode config options +TEST_CASE("/ManagementAPI/ChangeConfig/TenantAndEncryptMode") { + std::map newConfig; + std::string encryptModeKey = encryptionAtRestModeConfKey.toString(); + std::string tenantModeKey = tenantModeConfKey.toString(); + std::vector tenantModes = { TenantMode::DISABLED, TenantMode::OPTIONAL_TENANT, TenantMode::REQUIRED }; + std::vector encryptionModes = { EncryptionAtRestMode::DISABLED, + EncryptionAtRestMode::CLUSTER_AWARE, + EncryptionAtRestMode::DOMAIN_AWARE }; + // configure new test cases + + // encryption disabled checks + newConfig[encryptModeKey] = std::to_string(EncryptionAtRestMode::DISABLED); + newConfig[tenantModeKey] = std::to_string(deterministicRandom()->randomChoice(tenantModes)); + ASSERT(isEncryptionAtRestModeConfigValid(Optional(), newConfig, true)); + + // cluster aware encryption checks + newConfig[encryptModeKey] = std::to_string(EncryptionAtRestMode::CLUSTER_AWARE); + newConfig[tenantModeKey] = std::to_string(deterministicRandom()->randomChoice(tenantModes)); + ASSERT(isEncryptionAtRestModeConfigValid(Optional(), newConfig, true)); + + // domain aware encryption checks + newConfig[encryptModeKey] = std::to_string(EncryptionAtRestMode::DOMAIN_AWARE); + newConfig[tenantModeKey] = + std::to_string(deterministicRandom()->coinflip() ? 
TenantMode::DISABLED : TenantMode::OPTIONAL_TENANT); + ASSERT(!isEncryptionAtRestModeConfigValid(Optional(), newConfig, true)); + newConfig[tenantModeKey] = std::to_string(TenantMode::REQUIRED); + ASSERT(isEncryptionAtRestModeConfigValid(Optional(), newConfig, true)); + + // no encrypt mode present + newConfig.erase(encryptModeKey); + newConfig[tenantModeKey] = std::to_string(deterministicRandom()->randomChoice(tenantModes)); + ASSERT(isEncryptionAtRestModeConfigValid(Optional(), newConfig, true)); + + // no tenant mode present + newConfig.erase(tenantModeKey); + newConfig[encryptModeKey] = std::to_string(EncryptionAtRestMode::DOMAIN_AWARE); + ASSERT(!isEncryptionAtRestModeConfigValid(Optional(), newConfig, true)); + newConfig[encryptModeKey] = std::to_string(EncryptionAtRestMode::CLUSTER_AWARE); + ASSERT(isEncryptionAtRestModeConfigValid(Optional(), newConfig, true)); + + // change config test cases + DatabaseConfiguration oldConfig; + + // encryption disabled checks + oldConfig.encryptionAtRestMode = EncryptionAtRestMode::DISABLED; + oldConfig.tenantMode = deterministicRandom()->randomChoice(tenantModes); + newConfig[tenantModeKey] = std::to_string(deterministicRandom()->randomChoice(tenantModes)); + ASSERT(isEncryptionAtRestModeConfigValid(oldConfig, newConfig, false)); + + // domain aware encryption checks + oldConfig.encryptionAtRestMode = EncryptionAtRestMode::DOMAIN_AWARE; + oldConfig.tenantMode = TenantMode::REQUIRED; + newConfig[tenantModeKey] = + std::to_string(deterministicRandom()->coinflip() ? 
TenantMode::DISABLED : TenantMode::OPTIONAL_TENANT); + ASSERT(!isEncryptionAtRestModeConfigValid(oldConfig, newConfig, false)); + newConfig[tenantModeKey] = std::to_string(TenantMode::REQUIRED); + ASSERT(isEncryptionAtRestModeConfigValid(oldConfig, newConfig, false)); + + // cluster aware encryption checks + oldConfig.encryptionAtRestMode = EncryptionAtRestMode::CLUSTER_AWARE; + // required tenant mode can switch to any other tenant mode with cluster aware encryption + oldConfig.tenantMode = deterministicRandom()->randomChoice(tenantModes); + newConfig[tenantModeKey] = std::to_string(deterministicRandom()->randomChoice(tenantModes)); + ASSERT(isEncryptionAtRestModeConfigValid(oldConfig, newConfig, false)); + + // no tenant mode present + newConfig.erase(tenantModeKey); + oldConfig.tenantMode = deterministicRandom()->randomChoice(tenantModes); + oldConfig.encryptionAtRestMode = deterministicRandom()->randomChoice(encryptionModes); + ASSERT(isEncryptionAtRestModeConfigValid(oldConfig, newConfig, false)); + + return Void(); +} + ACTOR Future getDatabaseConfiguration(Transaction* tr) { tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); @@ -962,6 +1126,14 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, if (!disableConfigDB) { wait(verifyConfigurationDatabaseAlive(tr->getDatabase())); } + if (BUGGIFY_WITH_PROB(0.1)) { + // Introduce a random delay in simulation to allow processes to be + // killed before previousCoordinatorKeys has been reset. This will + // help test scenarios where the previous configuration database + // state has been transferred to the new coordinators but the + // broadcaster thinks it has not been transferred. 
+ wait(delay(deterministicRandom()->random01() * 10)); + } wait(resetPreviousCoordinatorsKey(tr->getDatabase())); return CoordinatorsResult::SAME_NETWORK_ADDRESSES; } diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index d351bc29d2..a353e22e63 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -1548,17 +1548,19 @@ ThreadFuture MultiVersionTransaction::onError(Error const& e) { auto f = tr.transaction ? tr.transaction->onError(e) : makeTimeout(); f = abortableFuture(f, tr.onChange); - return flatMapThreadFuture(f, [this, e](ErrorOr ready) { - if (!ready.isError() || ready.getError().code() != error_code_cluster_version_changed) { - if (ready.isError()) { - return ErrorOr>(ready.getError()); - } - + return flatMapThreadFuture(f, [this](ErrorOr ready) { + if (ready.isError() && ready.getError().code() == error_code_cluster_version_changed) { + // In case of a cluster version change, upgrade (or downgrade) the transaction + // and let it to be retried independently of the original error + updateTransaction(); + return ErrorOr>(Void()); + } + // In all other cases forward the result of the inner onError call + if (ready.isError()) { + return ErrorOr>(ready.getError()); + } else { return ErrorOr>(Void()); } - - updateTransaction(); - return ErrorOr>(onError(e)); }); } } @@ -2968,7 +2970,7 @@ ACTOR Future updateClusterSharedStateMapImpl(MultiVersionApi* self, // The cluster ID will be the connection record string (either a filename or the connection string itself) // in versions before we could read the cluster ID. 
state std::string clusterId = connectionRecord.toString(); - if (dbProtocolVersion.hasClusterIdSpecialKey()) { + if (CLIENT_KNOBS->CLIENT_ENABLE_USING_CLUSTER_ID_KEY && dbProtocolVersion.hasClusterIdSpecialKey()) { state Reference tr = db->createTransaction(); loop { try { diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 7e1a64cb25..1534b1d8e9 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -7110,11 +7110,11 @@ ACTOR Future readVersionBatcher(DatabaseContext* cx, state Reference batchIntervalDist = Histogram::getHistogram("GrvBatcher"_sr, "ClientGrvBatchInterval"_sr, - Histogram::Unit::microseconds, + Histogram::Unit::milliseconds, 0, CLIENT_KNOBS->GRV_BATCH_TIMEOUT * 1000000 * 2); state Reference grvReplyLatencyDist = - Histogram::getHistogram("GrvBatcher"_sr, "ClientGrvReplyLatency"_sr, Histogram::Unit::microseconds); + Histogram::getHistogram("GrvBatcher"_sr, "ClientGrvReplyLatency"_sr, Histogram::Unit::milliseconds); state double lastRequestTime = now(); state TransactionTagMap tags; @@ -10732,12 +10732,13 @@ ACTOR Future purgeBlobGranulesActor(Reference db, // must be aligned to blob range(s) state Future>> blobbifiedBegin = - getBlobRanges(&tr, KeyRangeRef(purgeRange.begin, purgeRange.begin), 2); + getBlobRanges(&tr, KeyRangeRef(purgeRange.begin, keyAfter(purgeRange.begin)), 1); state Future>> blobbifiedEnd = - getBlobRanges(&tr, KeyRangeRef(purgeRange.end, purgeRange.end), 2); + getBlobRanges(&tr, KeyRangeRef(purgeRange.end, keyAfter(purgeRange.end)), 1); wait(success(blobbifiedBegin) && success(blobbifiedEnd)); + // If there are no blob ranges on the boundary that's okay as we allow purging of multiple full ranges. 
if ((!blobbifiedBegin.get().empty() && blobbifiedBegin.get().front().begin < purgeRange.begin) || - (!blobbifiedEnd.get().empty() && blobbifiedEnd.get().back().end > purgeRange.end)) { + (!blobbifiedEnd.get().empty() && blobbifiedEnd.get().front().begin < purgeRange.end)) { TraceEvent("UnalignedPurge") .detail("Range", range) .detail("Version", purgeVersion) @@ -10941,8 +10942,7 @@ ACTOR Future blobRestoreActor(Reference cx, KeyRange rang return false; // stop if there is in-progress restore. } } - Standalone status; - status.progress = 0; + BlobRestoreStatus status(BlobRestorePhase::INIT); Value newValue = blobRestoreCommandValueFor(status); tr->set(key, newValue); wait(tr->commit()); diff --git a/fdbclient/PaxosConfigTransaction.actor.cpp b/fdbclient/PaxosConfigTransaction.actor.cpp index e2068636dd..a1604cb5fb 100644 --- a/fdbclient/PaxosConfigTransaction.actor.cpp +++ b/fdbclient/PaxosConfigTransaction.actor.cpp @@ -218,8 +218,12 @@ class GetGenerationQuorum { if (self->coordinatorsChangedFuture.isReady()) { throw coordinators_changed(); } - wait(delayJittered(std::clamp( - 0.005 * (1 << std::min(retries, 30)), 0.0, CLIENT_KNOBS->TIMEOUT_RETRY_UPPER_BOUND))); + if (deterministicRandom()->random01() < 0.95) { + // Add some random jitter to prevent clients from + // contending. 
+ wait(delayJittered(std::clamp( + 0.006 * (1 << std::min(retries, 30)), 0.0, CLIENT_KNOBS->TIMEOUT_RETRY_UPPER_BOUND))); + } if (deterministicRandom()->random01() < 0.05) { // Randomly inject a delay of at least the generation // reply timeout, to try to prevent contention between diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index c902ab309d..efe3606f7d 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -855,7 +855,8 @@ const KeyRef JSONSchemas::statusSchema = R"statusSchema( "encryption_at_rest_mode": { "$enum":[ "disabled", - "aes_256_ctr" + "domain_aware", + "cluster_aware" ]} }, "consistency_scan_info":{ @@ -963,11 +964,18 @@ const KeyRef JSONSchemas::statusSchema = R"statusSchema( } } }, - "tenants":{ - "num_tenants":0 - }, "metacluster" : { - "cluster_type" : "standalone" + "cluster_type" : "management", + "metacluster_name":"metacluster1", + "metacluster_id":12345, + "data_cluster_name" : "data_cluster1", + "data_cluster_id" : 12346, + "num_data_clusters":10 + }, + "tenants":{ + "num_tenants":0, + "num_tenant_groups":10, + "tenant_group_capacity":20 } }, "client":{ diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index ed283b7b41..eb818d9789 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -301,6 +301,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); + init( TENANT_CACHE_STORAGE_USAGE_TRACE_INTERVAL, 300 ); init( CP_FETCH_TENANTS_OVER_STORAGE_QUOTA_INTERVAL, 5 ); if( 
randomize && BUGGIFY ) CP_FETCH_TENANTS_OVER_STORAGE_QUOTA_INTERVAL = deterministicRandom()->randomInt(1, 10); // TeamRemover @@ -390,19 +391,22 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // If true, do not process and store RocksDB logs init( ROCKSDB_MUTE_LOGS, true ); // Use a smaller memtable in simulation to avoid OOMs. - int64_t memtableBytes = isSimulated ? 32 * 1024 : 512 * 1024 * 1024; + int64_t memtableBytes = isSimulated ? 1024 * 1024 : 512 * 1024 * 1024; init( ROCKSDB_MEMTABLE_BYTES, memtableBytes ); init( ROCKSDB_LEVEL_STYLE_COMPACTION, true ); init( ROCKSDB_UNSAFE_AUTO_FSYNC, false ); init( ROCKSDB_PERIODIC_COMPACTION_SECONDS, 0 ); init( ROCKSDB_PREFIX_LEN, 0 ); // If rocksdb block cache size is 0, the default 8MB is used. - int64_t blockCacheSize = isSimulated ? 0 : 1024 * 1024 * 1024 /* 1GB */; + int64_t blockCacheSize = isSimulated ? 16 * 1024 * 1024 : 1024 * 1024 * 1024 /* 1GB */; init( ROCKSDB_BLOCK_CACHE_SIZE, blockCacheSize ); init( ROCKSDB_METRICS_DELAY, 60.0 ); - init( ROCKSDB_READ_VALUE_TIMEOUT, isSimulated ? 5.0 : 200.0 ); - init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, isSimulated ? 5.0 : 200.0 ); - init( ROCKSDB_READ_RANGE_TIMEOUT, isSimulated ? 5.0 : 200.0 ); + // ROCKSDB_READ_VALUE_TIMEOUT, ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, ROCKSDB_READ_RANGE_TIMEOUT knobs: + // In simulation, increasing the read operation timeouts to 5 minutes, as some of the tests have + // very high load and single read thread cannot process all the load within the timeouts. 
+ init( ROCKSDB_READ_VALUE_TIMEOUT, 5.0 ); if (isSimulated) ROCKSDB_READ_VALUE_TIMEOUT = 5 * 60; + init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, 5.0 ); if (isSimulated) ROCKSDB_READ_VALUE_PREFIX_TIMEOUT = 5 * 60; + init( ROCKSDB_READ_RANGE_TIMEOUT, 5.0 ); if (isSimulated) ROCKSDB_READ_RANGE_TIMEOUT = 5 * 60; init( ROCKSDB_READ_QUEUE_WAIT, 1.0 ); init( ROCKSDB_READ_QUEUE_HARD_MAX, 1000 ); init( ROCKSDB_READ_QUEUE_SOFT_MAX, 500 ); @@ -436,6 +440,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, 200000 ); // 200KB init( ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip(); // ROCKSDB_STATS_LEVEL=1 indicates rocksdb::StatsLevel::kExceptHistogramOrTimers + // Refer StatsLevel: https://github.com/facebook/rocksdb/blob/main/include/rocksdb/statistics.h#L594 init( ROCKSDB_STATS_LEVEL, 1 ); if( randomize && BUGGIFY ) ROCKSDB_STATS_LEVEL = deterministicRandom()->randomInt(0, 6); // Can commit will delay ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD seconds for // ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD times, if rocksdb overloaded. 
@@ -555,7 +560,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BACKUP_TIMEOUT, 0.4 ); init( BACKUP_NOOP_POP_DELAY, 5.0 ); init( BACKUP_FILE_BLOCK_BYTES, 1024 * 1024 ); - init( BACKUP_LOCK_BYTES, 3e9 ); if(randomize && BUGGIFY) BACKUP_LOCK_BYTES = deterministicRandom()->randomInt(1024, 4096) * 15 * 1024; + init( BACKUP_LOCK_BYTES, 3e9 ); if(randomize && BUGGIFY) BACKUP_LOCK_BYTES = deterministicRandom()->randomInt(1024, 4096) * 30 * 1024; init( BACKUP_UPLOAD_DELAY, 10.0 ); if(randomize && BUGGIFY) BACKUP_UPLOAD_DELAY = deterministicRandom()->random01() * 60; //Cluster Controller diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index e50c18fffe..c01424b940 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -876,6 +876,7 @@ const KeyRef triggerDDTeamInfoPrintKey("\xff/triggerDDTeamInfoPrint"_sr); const KeyRef consistencyScanInfoKey = "\xff/consistencyScanInfo"_sr; const KeyRef encryptionAtRestModeConfKey("\xff/conf/encryption_at_rest_mode"_sr); +const KeyRef tenantModeConfKey("\xff/conf/tenant_mode"_sr); const KeyRangeRef excludedServersKeys("\xff/conf/excluded/"_sr, "\xff/conf/excluded0"_sr); const KeyRef excludedServersPrefix = excludedServersKeys.begin; diff --git a/fdbclient/Tracing.actor.cpp b/fdbclient/Tracing.actor.cpp index 6f33bbbd04..a6a1130305 100644 --- a/fdbclient/Tracing.actor.cpp +++ b/fdbclient/Tracing.actor.cpp @@ -355,21 +355,25 @@ Span& Span::operator=(Span&& o) { g_tracer->trace(*this); } arena = std::move(o.arena); - context = o.context; - parentContext = o.parentContext; - begin = o.begin; - end = o.end; - location = o.location; - links = std::move(o.links); - events = std::move(o.events); - status = o.status; - kind = o.kind; - o.context = SpanContext(); - o.parentContext = SpanContext(); - o.kind = SpanKind::INTERNAL; - o.begin = 0.0; - o.end = 0.0; - o.status = SpanStatus::UNSET; + // All memory referenced in *Ref fields of Span is now (potentially) + // invalid, 
and o no longer has ownership of any memory referenced by *Ref + // fields of o. We must ensure that o no longer references any memory it no + // longer owns, and that *this no longer references any memory it no longer + // owns. Not every field references arena memory, but this std::exchange + // pattern provides a nice template for getting this right in a concise way + // should we add more fields to Span. + + attributes = std::exchange(o.attributes, decltype(o.attributes)()); + begin = std::exchange(o.begin, decltype(o.begin)()); + context = std::exchange(o.context, decltype(o.context)()); + end = std::exchange(o.end, decltype(o.end)()); + events = std::exchange(o.events, decltype(o.events)()); + kind = std::exchange(o.kind, decltype(o.kind)()); + links = std::exchange(o.links, decltype(o.links)()); + location = std::exchange(o.location, decltype(o.location)()); + parentContext = std::exchange(o.parentContext, decltype(o.parentContext)()); + status = std::exchange(o.status, decltype(o.status)()); + return *this; } diff --git a/fdbclient/include/fdbclient/BlobGranuleCommon.h b/fdbclient/include/fdbclient/BlobGranuleCommon.h index 23abc7d974..6f8bfd4eff 100644 --- a/fdbclient/include/fdbclient/BlobGranuleCommon.h +++ b/fdbclient/include/fdbclient/BlobGranuleCommon.h @@ -314,13 +314,19 @@ struct BlobManifest { }; // Defines blob restore status +enum BlobRestorePhase { INIT = 0, LOAD_MANIFEST = 1, MANIFEST_DONE = 2, MIGRATE = 3, APPLY_MLOGS = 4, DONE = 5 }; struct BlobRestoreStatus { constexpr static FileIdentifier file_identifier = 378657; + BlobRestorePhase phase; int progress; + BlobRestoreStatus() : phase(BlobRestorePhase::INIT){}; + BlobRestoreStatus(BlobRestorePhase pha) : phase(pha), progress(0){}; + BlobRestoreStatus(BlobRestorePhase pha, int prog) : phase(pha), progress(prog){}; + template void serialize(Ar& ar) { - serializer(ar, progress); + serializer(ar, phase, progress); } }; diff --git a/fdbclient/include/fdbclient/ClientKnobs.h 
b/fdbclient/include/fdbclient/ClientKnobs.h index 7532007541..90ac004621 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -289,6 +289,7 @@ public: double METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY; double METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT; int TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantEntryCache is refreshed + bool CLIENT_ENABLE_USING_CLUSTER_ID_KEY; // Encryption-at-rest bool ENABLE_ENCRYPTION_CPU_TIME_LOGGING; diff --git a/fdbclient/include/fdbclient/FDBTypes.h b/fdbclient/include/fdbclient/FDBTypes.h index 780b6ff1fc..80277bb0f7 100644 --- a/fdbclient/include/fdbclient/FDBTypes.h +++ b/fdbclient/include/fdbclient/FDBTypes.h @@ -1464,7 +1464,7 @@ struct TenantMode { struct EncryptionAtRestMode { // These enumerated values are stored in the database configuration, so can NEVER be changed. Only add new ones // just before END. - enum Mode { DISABLED = 0, AES_256_CTR = 1, END = 2 }; + enum Mode { DISABLED = 0, DOMAIN_AWARE = 1, CLUSTER_AWARE = 2, END = 3 }; EncryptionAtRestMode() : mode(DISABLED) {} EncryptionAtRestMode(Mode mode) : mode(mode) { @@ -1483,14 +1483,30 @@ struct EncryptionAtRestMode { switch (mode) { case DISABLED: return "disabled"; - case AES_256_CTR: - return "aes_256_ctr"; + case DOMAIN_AWARE: + return "domain_aware"; + case CLUSTER_AWARE: + return "cluster_aware"; default: ASSERT(false); } return ""; } + static EncryptionAtRestMode fromString(std::string mode) { + if (mode == "disabled") { + return EncryptionAtRestMode::DISABLED; + } else if (mode == "cluster_aware") { + return EncryptionAtRestMode::CLUSTER_AWARE; + } else if (mode == "domain_aware") { + return EncryptionAtRestMode::DOMAIN_AWARE; + } else { + TraceEvent(SevError, "UnknownEncryptMode").detail("EncryptMode", mode); + ASSERT(false); + throw internal_error(); + } + } + Value toValue() const { return ValueRef(format("%d", (int)mode)); } bool isEquals(const EncryptionAtRestMode& e) const { return 
this->mode == e.mode; } diff --git a/fdbclient/include/fdbclient/GenericManagementAPI.actor.h b/fdbclient/include/fdbclient/GenericManagementAPI.actor.h index 352c32b1b3..63bbecf303 100644 --- a/fdbclient/include/fdbclient/GenericManagementAPI.actor.h +++ b/fdbclient/include/fdbclient/GenericManagementAPI.actor.h @@ -133,6 +133,11 @@ bool isCompleteConfiguration(std::map const& options); ConfigureAutoResult parseConfig(StatusObject const& status); +bool isEncryptionAtRestModeConfigValid(Optional oldConfiguration, + std::map newConfig, + bool creating); +bool isTenantModeModeConfigValid(DatabaseConfiguration oldConfiguration, DatabaseConfiguration newConfiguration); + // Management API written in template code to support both IClientAPI and NativeAPI namespace ManagementAPI { @@ -276,6 +281,9 @@ Future changeConfig(Reference db, std::map(), m, creating)) { + return ConfigurationResult::INVALID_CONFIGURATION; + } } else if (m.count(encryptionAtRestModeConfKey.toString()) != 0) { // Encryption data at-rest mode can be set only at the time of database creation return ConfigurationResult::ENCRYPTION_AT_REST_MODE_ALREADY_SET; @@ -322,6 +330,12 @@ Future changeConfig(Reference db, std::mapattributeKeys().count("dcid") && newConfig.regions.size() > 0) { return ConfigurationResult::REGION_REPLICATION_MISMATCH; diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index ed7a788f30..eb8b057db3 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -244,6 +244,8 @@ public: // in the TenantCache int TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL; // How often the storage quota allocated to each tenant is // refreshed in the TenantCache + int TENANT_CACHE_STORAGE_USAGE_TRACE_INTERVAL; // The minimum interval between consecutive trace events logging the + // storage bytes used by a tenant group int CP_FETCH_TENANTS_OVER_STORAGE_QUOTA_INTERVAL; // How often the commit proxies send requests 
to the data // distributor to fetch the list of tenants over storage quota @@ -313,7 +315,7 @@ public: // KeyValueStoreRocksDB bool ROCKSDB_SET_READ_TIMEOUT; bool ROCKSDB_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES; - int ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE; + bool ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE; int ROCKSDB_READ_RANGE_ROW_LIMIT; int ROCKSDB_READER_THREAD_PRIORITY; int ROCKSDB_WRITER_THREAD_PRIORITY; diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index 27d19f0bef..e0366e8890 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -284,6 +284,9 @@ extern const KeyRef triggerDDTeamInfoPrintKey; // Encryption data at-rest config key extern const KeyRef encryptionAtRestModeConfKey; +// Tenant mode config key +extern const KeyRef tenantModeConfKey; + // The differences between excluded and failed can be found in "command-line-interface.rst" // and in the help message of the fdbcli command "exclude". 
diff --git a/fdbclient/include/fdbclient/Tenant.h b/fdbclient/include/fdbclient/Tenant.h index d2b1e34c40..6e0204fa80 100644 --- a/fdbclient/include/fdbclient/Tenant.h +++ b/fdbclient/include/fdbclient/Tenant.h @@ -235,7 +235,6 @@ struct TenantNameUniqueSet { return tenantNames.empty(); } }; - -class TenantPrefixIndex : public VersionedMap, public ReferenceCounted {}; +typedef VersionedMap TenantPrefixIndex; #endif diff --git a/fdbrpc/HTTP.actor.cpp b/fdbrpc/HTTP.actor.cpp index cf0bf6e157..aec87c5a50 100644 --- a/fdbrpc/HTTP.actor.cpp +++ b/fdbrpc/HTTP.actor.cpp @@ -243,7 +243,7 @@ ACTOR Future read_http_response(Reference r, Referenceheaders.find("Content-Length"); if (i != r->headers.end()) - r->contentLen = atoi(i->second.c_str()); + r->contentLen = strtoll(i->second.c_str(), NULL, 10); else r->contentLen = -1; // Content length unknown @@ -481,7 +481,7 @@ ACTOR Future> doRequest(Reference conn, } if (FLOW_KNOBS->HTTP_VERBOSE_LEVEL > 0) { - printf("[%s] HTTP %scode=%d early=%d, time=%fs %s %s contentLen=%d [%d out, response content len %d]\n", + printf("[%s] HTTP %scode=%d early=%d, time=%fs %s %s contentLen=%d [%d out, response content len %lld]\n", conn->getDebugID().toString().c_str(), (err.present() ? format("*ERROR*=%s ", err.get().name()).c_str() : ""), r->code, @@ -491,7 +491,7 @@ ACTOR Future> doRequest(Reference conn, resource.c_str(), contentLen, total_sent, - (int)r->contentLen); + r->contentLen); } if (FLOW_KNOBS->HTTP_VERBOSE_LEVEL > 2) { printf("[%s] HTTP RESPONSE: %s %s\n%s\n", diff --git a/fdbrpc/include/fdbrpc/AsyncFileReadAhead.actor.h b/fdbrpc/include/fdbrpc/AsyncFileReadAhead.actor.h index 144cfcf1f3..b8028af360 100644 --- a/fdbrpc/include/fdbrpc/AsyncFileReadAhead.actor.h +++ b/fdbrpc/include/fdbrpc/AsyncFileReadAhead.actor.h @@ -102,7 +102,7 @@ public: // If not found, start the read. 
if (i == f->m_blocks.end() || (i->second.isValid() && i->second.isError())) { // printf("starting read of %s block %d\n", f->getFilename().c_str(), blockNum); - fblock = readBlock(f.getPtr(), f->m_block_size, f->m_block_size * blockNum); + fblock = readBlock(f.getPtr(), f->m_block_size, (int64_t)f->m_block_size * blockNum); f->m_blocks[blockNum] = fblock; } else fblock = i->second; @@ -121,7 +121,7 @@ public: // Calculate the block-relative read range. It's a given that the offset / length range touches this block // so readStart will never be greater than blocksize (though it could be past the actual end of a short // block). - int64_t blockStart = blockNum * f->m_block_size; + int64_t blockStart = (int64_t)blockNum * f->m_block_size; int64_t readStart = std::max(0, offset - blockStart); int64_t readEnd = std::min(f->m_block_size, offset + length - blockStart); int rlen = readEnd - readStart; diff --git a/fdbrpc/include/fdbrpc/simulator.h b/fdbrpc/include/fdbrpc/simulator.h index 93a8ad973a..e4abefa073 100644 --- a/fdbrpc/include/fdbrpc/simulator.h +++ b/fdbrpc/include/fdbrpc/simulator.h @@ -56,7 +56,7 @@ public: FailDisk, RebootAndDelete, RebootProcessAndDelete, - RebootProcessAndSwitch, + RebootProcessAndSwitch, // Reboot and switch cluster file Reboot, RebootProcess, None diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 845f1380e7..38963d4143 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -63,7 +63,8 @@ ISimulator::ISimulator() : desiredCoordinators(1), physicalDatacenters(1), processesPerMachine(0), listenersPerProcess(1), usableRegions(1), allowLogSetKills(true), tssMode(TSSMode::Disabled), configDBType(ConfigDBType::DISABLED), isStopped(false), lastConnectionFailure(0), connectionFailuresDisableDuration(0), speedUpSimulation(false), - backupAgents(BackupAgentType::WaitForType), drAgents(BackupAgentType::WaitForType), allSwapsDisabled(false) {} + backupAgents(BackupAgentType::WaitForType), 
drAgents(BackupAgentType::WaitForType), allSwapsDisabled(false), + blobGranulesEnabled(false) {} ISimulator::~ISimulator() = default; bool simulator_should_inject_fault(const char* context, const char* file, int line, int error_code) { diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp index 488d35b3c3..f0aa4db6fa 100644 --- a/fdbserver/BackupWorker.actor.cpp +++ b/fdbserver/BackupWorker.actor.cpp @@ -49,8 +49,8 @@ struct VersionedMessage { Arena decryptArena; // Arena used for decrypt buffer. size_t bytes; // arena's size when inserted, which can grow afterwards - VersionedMessage(LogMessageVersion v, StringRef m, const VectorRef& t, const Arena& a) - : version(v), message(m), tags(t), arena(a), bytes(a.getSize()) {} + VersionedMessage(LogMessageVersion v, StringRef m, const VectorRef& t, const Arena& a, size_t n) + : version(v), message(m), tags(t), arena(a), bytes(n) {} Version getVersion() const { return version.version; } uint32_t getSubVersion() const { return version.sub; } @@ -977,15 +977,17 @@ ACTOR Future pullAsyncData(BackupData* self) { // Note we aggressively peek (uncommitted) messages, but only committed // messages/mutations will be flushed to disk/blob in uploadData(). while (r->hasMessage()) { + state size_t takeBytes = 0; if (!prev.sameArena(r->arena())) { TraceEvent(SevDebugMemory, "BackupWorkerMemory", self->myId) .detail("Take", r->arena().getSize()) .detail("Current", self->lock->activePermits()); - wait(self->lock->take(TaskPriority::DefaultYield, r->arena().getSize())); + takeBytes = r->arena().getSize(); // more bytes can be allocated after the wait. 
+ wait(self->lock->take(TaskPriority::DefaultYield, takeBytes)); prev = r->arena(); } - self->messages.emplace_back(r->version(), r->getMessage(), r->getTags(), r->arena()); + self->messages.emplace_back(r->version(), r->getMessage(), r->getTags(), r->arena(), takeBytes); r->nextMessage(); } diff --git a/fdbserver/BlobManager.actor.cpp b/fdbserver/BlobManager.actor.cpp index 419c3f4634..2f564dbea0 100644 --- a/fdbserver/BlobManager.actor.cpp +++ b/fdbserver/BlobManager.actor.cpp @@ -3547,10 +3547,16 @@ ACTOR Future recoverBlobManager(Reference bmData) { bool isFullRestore = wait(isFullRestoreMode(bmData->db, normalKeys)); bmData->isFullRestoreMode = isFullRestore; if (bmData->isFullRestoreMode) { + BlobRestoreStatus initStatus(BlobRestorePhase::LOAD_MANIFEST); + wait(updateRestoreStatus(bmData->db, normalKeys, initStatus)); + wait(loadManifest(bmData->db, bmData->bstore)); int64_t epoc = wait(lastBlobEpoc(bmData->db, bmData->bstore)); wait(updateEpoch(bmData, epoc + 1)); + + BlobRestoreStatus completedStatus(BlobRestorePhase::MANIFEST_DONE); + wait(updateRestoreStatus(bmData->db, normalKeys, completedStatus)); } state Reference tr = makeReference(bmData->db); diff --git a/fdbserver/BlobManifest.actor.cpp b/fdbserver/BlobManifest.actor.cpp index 45f4496b98..16c2d9ac99 100644 --- a/fdbserver/BlobManifest.actor.cpp +++ b/fdbserver/BlobManifest.actor.cpp @@ -545,7 +545,7 @@ ACTOR Future isFullRestoreMode(Database db, KeyRangeRef keys) { KeyRange keyRange = decodeBlobRestoreCommandKeyFor(r.key); if (keyRange.contains(keys)) { Standalone status = decodeBlobRestoreStatus(r.value); - return status.progress < 100; // progress is less than 100 + return status.phase < BlobRestorePhase::DONE; } } if (!ranges.more) { @@ -563,3 +563,44 @@ ACTOR Future isFullRestoreMode(Database db, KeyRangeRef keys) { } } } + +// Update restore status +ACTOR Future updateRestoreStatus(Database db, KeyRangeRef range, BlobRestoreStatus status) { + state Transaction tr(db); + loop { + try { + 
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + Key key = blobRestoreCommandKeyFor(range); + Value value = blobRestoreCommandValueFor(status); + tr.set(key, value); + wait(tr.commit()); + return Void(); + } catch (Error& e) { + wait(tr.onError(e)); + } + } +} + +// Get restore status +ACTOR Future> getRestoreStatus(Database db, KeyRangeRef range) { + state Transaction tr(db); + loop { + try { + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + Key key = blobRestoreCommandKeyFor(range); + Optional value = wait(tr.get(key)); + Optional result; + if (value.present()) { + Standalone status = decodeBlobRestoreStatus(value.get()); + result = status; + } + return result; + } catch (Error& e) { + wait(tr.onError(e)); + } + } +} diff --git a/fdbserver/BlobMigrator.actor.cpp b/fdbserver/BlobMigrator.actor.cpp index 0c23ed7904..5a8f2b78ab 100644 --- a/fdbserver/BlobMigrator.actor.cpp +++ b/fdbserver/BlobMigrator.actor.cpp @@ -18,6 +18,7 @@ * limitations under the License. 
*/ +#include "fdbclient/BlobGranuleCommon.h" #include "flow/ActorCollection.h" #include "flow/FastRef.h" #include "flow/IRandom.h" @@ -75,8 +76,8 @@ private: // Check if blob manifest is loaded so that blob migration can start ACTOR static Future checkIfReadyForMigration(Reference self) { loop { - bool isFullRestore = wait(isFullRestoreMode(self->db_, normalKeys)); - if (isFullRestore) { + Optional status = wait(getRestoreStatus(self->db_, normalKeys)); + if (canStartMigration(status)) { BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_)); if (!granules.empty()) { self->blobGranules_ = granules; @@ -87,6 +88,9 @@ private: .detail("Version", granule.version) .detail("SizeInBytes", granule.sizeInBytes); } + + BlobRestoreStatus status(BlobRestorePhase::MIGRATE, 0); + wait(updateRestoreStatus(self->db_, normalKeys, status)); return Void(); } } @@ -94,6 +98,15 @@ private: } } + // Check if we should start migration. Migration can be started after manifest is fully loaded + static bool canStartMigration(Optional status) { + if (status.present()) { + BlobRestoreStatus value = status.get(); + return value.phase == BlobRestorePhase::MANIFEST_DONE; // manifest is loaded successfully + } + return false; + } + // Prepare for data migration for given key range. 
ACTOR static Future prepare(Reference self, KeyRangeRef keys) { // Register as a storage server, so that DataDistributor could start data movement after @@ -120,8 +133,8 @@ private: tr.setOption(FDBTransactionOptions::LOCK_AWARE); try { state Value value = keyServersValue(std::vector({ serverUID }), std::vector(), UID(), UID()); - wait(krmSetRange(&tr, keyServersPrefix, keys, value)); - wait(krmSetRange(&tr, serverKeysPrefixFor(serverUID), keys, serverKeysTrue)); + wait(krmSetRangeCoalescing(&tr, keyServersPrefix, keys, allKeys, value)); + wait(krmSetRangeCoalescing(&tr, serverKeysPrefixFor(serverUID), keys, allKeys, serverKeysTrue)); wait(tr.commit()); dprint("Assign {} to server {}\n", normalKeys.toString(), serverUID.toString()); return Void(); @@ -152,7 +165,7 @@ private: } } if (owning) { - wait(krmSetRange(&tr, serverKeysPrefixFor(id), keys, serverKeysFalse)); + wait(krmSetRangeCoalescing(&tr, serverKeysPrefixFor(id), keys, allKeys, serverKeysFalse)); dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString()); TraceEvent("UnassignKeys").detail("Keys", keys.toString()).detail("From", id.toString()); } @@ -169,8 +182,12 @@ private: ACTOR static Future logProgress(Reference self) { loop { bool done = wait(checkProgress(self)); - if (done) + if (done) { + BlobRestoreStatus status(BlobRestorePhase::DONE); + wait(updateRestoreStatus(self->db_, normalKeys, status)); + return Void(); + } wait(delay(SERVER_KNOBS->BLOB_MIGRATOR_CHECK_INTERVAL)); } } @@ -205,7 +222,8 @@ private: state bool done = incompleted == 0; dprint("Migration progress :{}%. 
done {}\n", progress, done); TraceEvent("BlobMigratorProgress").detail("Progress", progress).detail("Done", done); - wait(updateProgress(self, normalKeys, progress)); + BlobRestoreStatus status(BlobRestorePhase::MIGRATE, progress); + wait(updateRestoreStatus(self->db_, normalKeys, status)); return done; } catch (Error& e) { wait(tr.onError(e)); @@ -213,32 +231,6 @@ private: } } - // Update restore progress - ACTOR static Future updateProgress(Reference self, KeyRangeRef range, int progress) { - state Transaction tr(self->db_); - loop { - try { - tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr.setOption(FDBTransactionOptions::LOCK_AWARE); - state Key key = blobRestoreCommandKeyFor(range); - Optional value = wait(tr.get(key)); - if (value.present()) { - Standalone status = decodeBlobRestoreStatus(value.get()); - if (progress > status.progress) { - status.progress = progress; - Value updatedValue = blobRestoreCommandValueFor(status); - tr.set(key, updatedValue); - wait(tr.commit()); - } - } - return Void(); - } catch (Error& e) { - wait(tr.onError(e)); - } - } - } - // Advance version, so that future commits will have a larger version than the restored data ACTOR static Future advanceVersion(Reference self) { state Transaction tr(self->db_); diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index cc23e68f8d..d3e5b72af5 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -26,6 +26,7 @@ #include #include +#include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/SystemData.h" #include "fdbclient/DatabaseContext.h" @@ -2565,8 +2566,8 @@ ACTOR Future watchBlobRestoreCommand(ClusterControllerData* self) { Optional blobRestoreCommand = wait(tr->get(blobRestoreCommandKey)); if (blobRestoreCommand.present()) { Standalone status = 
decodeBlobRestoreStatus(blobRestoreCommand.get()); - TraceEvent("WatchBlobRestoreCommand").detail("Progress", status.progress); - if (status.progress == 0) { + TraceEvent("WatchBlobRestoreCommand").detail("Progress", status.progress).detail("Phase", status.phase); + if (status.phase == BlobRestorePhase::INIT) { self->db.blobRestoreEnabled.set(true); if (self->db.blobGranulesEnabled.get()) { const auto& blobManager = self->db.serverInfo->get().blobManager; diff --git a/fdbserver/ClusterRecovery.actor.cpp b/fdbserver/ClusterRecovery.actor.cpp index 9b525cf54e..346a59ce87 100644 --- a/fdbserver/ClusterRecovery.actor.cpp +++ b/fdbserver/ClusterRecovery.actor.cpp @@ -435,7 +435,7 @@ namespace { EncryptionAtRestMode getEncryptionAtRest() { // TODO: Use db-config encryption config to determine cluster encryption status if (SERVER_KNOBS->ENABLE_ENCRYPTION) { - return EncryptionAtRestMode(EncryptionAtRestMode::Mode::AES_256_CTR); + return EncryptionAtRestMode(EncryptionAtRestMode::Mode::DOMAIN_AWARE); } else { return EncryptionAtRestMode(); } diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index 67cbee409e..bda2b5069c 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -2910,7 +2910,7 @@ ACTOR Future commitProxyServerCore(CommitProxyInterface proxy, ASSERT(commitData.resolvers.size() != 0); for (int i = 0; i < commitData.resolvers.size(); ++i) { commitData.stats.resolverDist.push_back(Histogram::getHistogram( - "CommitProxy"_sr, "ToResolver_" + commitData.resolvers[i].id().toString(), Histogram::Unit::microseconds)); + "CommitProxy"_sr, "ToResolver_" + commitData.resolvers[i].id().toString(), Histogram::Unit::milliseconds)); } // Initialize keyResolvers map diff --git a/fdbserver/ConfigBroadcaster.actor.cpp b/fdbserver/ConfigBroadcaster.actor.cpp index bf49f8e58a..f675a3fe8f 100644 --- a/fdbserver/ConfigBroadcaster.actor.cpp +++ b/fdbserver/ConfigBroadcaster.actor.cpp @@ -92,10 +92,10 
@@ class ConfigBroadcasterImpl { // Used to read a snapshot from the previous coordinators after a change // coordinators command. - Version maxLastSeenVersion = ::invalidVersion; Future> previousCoordinatorsFuture; std::unique_ptr previousCoordinatorsConsumer; Future previousCoordinatorsSnapshotFuture; + Version largestConfigNodeVersion{ ::invalidVersion }; UID id; CounterCollection cc; @@ -106,6 +106,7 @@ class ConfigBroadcasterImpl { Future logger; int coordinators = 0; + std::unordered_set registeredConfigNodes; std::unordered_set activeConfigNodes; std::unordered_set registrationResponses; std::unordered_set registrationResponsesUnregistered; @@ -268,7 +269,7 @@ class ConfigBroadcasterImpl { // Ask the registering ConfigNode whether it has registered in the past. state ConfigBroadcastRegisteredReply reply = wait( brokenPromiseToNever(configBroadcastInterface.registered.getReply(ConfigBroadcastRegisteredRequest{}))); - self->maxLastSeenVersion = std::max(self->maxLastSeenVersion, reply.lastSeenVersion); + self->largestConfigNodeVersion = std::max(self->largestConfigNodeVersion, reply.lastSeenVersion); state bool registered = reply.registered; TraceEvent("ConfigBroadcasterRegisterNodeReceivedRegistrationReply", self->id) .detail("Address", address) @@ -302,6 +303,7 @@ class ConfigBroadcasterImpl { int nodesTillQuorum = self->coordinators / 2 + 1 - (int)self->activeConfigNodes.size(); if (registered) { + self->registeredConfigNodes.insert(address); self->activeConfigNodes.insert(address); self->disallowUnregistered = true; } else if ((self->activeConfigNodes.size() < self->coordinators / 2 + 1 && !self->disallowUnregistered) || @@ -365,6 +367,52 @@ class ConfigBroadcasterImpl { state bool sendSnapshot = self->previousCoordinatorsConsumer && reply.lastSeenVersion <= self->mostRecentVersion; + + // If a coordinator change is ongoing, a quorum of ConfigNodes are + // already registered and the largest version at least one of those + // ConfigNodes knows about is 
greater than the version of the latest + // snapshot the broadcaster has, don't send a snapshot to any + // ConfigNodes. This could end up overwriting committed data. Consider + // the following scenario, with three ConfigNodes: + // + // T=0: + // A: v5 + // T=1: + // change coordinators, new coordinators are B, C, D + // T=2: + // B: v5, C: v5, D: v5 + // T=3: + // B: v5, C: v10, D: v10 + // (some commits happen on only C and D) + // (previousCoordinatorsKey has not been cleared yet) + // T=4: + // D dies and loses its data + // T=5: + // D starts + // B: v5 (registered=yes), C: v10 (registered=yes), D: v0 (registered=no) + // Broadcaster: has an old snapshot, only knows about v5 + // self->mostRecentVersion=5 + // T=6: + // B, C, D (re-)register with broadcaster + // + // At T=5, the broadcaster would send snapshots to B and D because the + // largest version they know about (5) is less than or equal to + // self->mostRecentVersion (5). But this would cause a majority of + // nodes to think v5 is the latest committed version, causing C to be + // rolled back, and losing commit data between versions 5 and 10. + // + // This is a special case where the coordinators are being changed. + // During a coordinator change, a majority of ConfigNodes being + // registered means the coordinator change already took place, and it + // is being retried due to some failure. In that case, we don't want to + // resend snapshots if a majority of the new ConfigNodes are + // registered, because they could have been accepting commits. Instead, + // let the rollback/rollforward algorithm update the out of date nodes. + if (self->previousCoordinatorsConsumer && self->largestConfigNodeVersion > self->mostRecentVersion && + self->registeredConfigNodes.size() >= self->coordinators / 2 + 1) { + sendSnapshot = false; + } + // Unregistered nodes need to wait for either: // 1. 
A quorum of registered nodes to register and send their // snapshots, so the unregistered nodes can be rolled forward, or diff --git a/fdbserver/ConfigNode.actor.cpp b/fdbserver/ConfigNode.actor.cpp index 8c2db21872..82ce05e590 100644 --- a/fdbserver/ConfigNode.actor.cpp +++ b/fdbserver/ConfigNode.actor.cpp @@ -234,10 +234,13 @@ class ConfigNodeImpl { req.reply.sendError(process_behind()); // Reuse the process_behind error return Void(); } + if (BUGGIFY) { + wait(delay(deterministicRandom()->random01() * 2)); + } state Standalone> versionedMutations = - wait(getMutations(self, req.lastSeenVersion + 1, committedVersion)); + wait(getMutations(self, req.lastSeenVersion + 1, req.mostRecentVersion)); state Standalone> versionedAnnotations = - wait(getAnnotations(self, req.lastSeenVersion + 1, committedVersion)); + wait(getAnnotations(self, req.lastSeenVersion + 1, req.mostRecentVersion)); TraceEvent(SevInfo, "ConfigNodeSendingChanges", self->id) .detail("ReqLastSeenVersion", req.lastSeenVersion) .detail("ReqMostRecentVersion", req.mostRecentVersion) @@ -245,7 +248,7 @@ class ConfigNodeImpl { .detail("NumMutations", versionedMutations.size()) .detail("NumCommits", versionedAnnotations.size()); ++self->successfulChangeRequests; - req.reply.send(ConfigFollowerGetChangesReply{ committedVersion, versionedMutations, versionedAnnotations }); + req.reply.send(ConfigFollowerGetChangesReply{ versionedMutations, versionedAnnotations }); return Void(); } @@ -520,6 +523,18 @@ class ConfigNodeImpl { ObjectReader::fromStringRef(kv.value, IncludeVersion()); } wait(store(reply.snapshotVersion, getLastCompactedVersion(self))); + if (req.mostRecentVersion < reply.snapshotVersion) { + // The version in the request can be less than the last compacted + // version in certain circumstances where the coordinators are + // being changed and the consumer reads the latest committed + // version from a majority of ConfigNodes before they have received + // up to date snapshots. 
This should be fine, it just means the + // consumer needs to fetch the latest version and retry its + // request. + CODE_PROBE(true, "ConfigNode ahead of consumer", probe::decoration::rare); + req.reply.sendError(version_already_compacted()); + return Void(); + } wait(store(reply.changes, getMutations(self, reply.snapshotVersion + 1, req.mostRecentVersion))); wait(store(reply.annotations, getAnnotations(self, reply.snapshotVersion + 1, req.mostRecentVersion))); TraceEvent(SevInfo, "ConfigNodeGettingSnapshot", self->id) diff --git a/fdbserver/DDRelocationQueue.actor.cpp b/fdbserver/DDRelocationQueue.actor.cpp index ea5eef4848..5484299ceb 100644 --- a/fdbserver/DDRelocationQueue.actor.cpp +++ b/fdbserver/DDRelocationQueue.actor.cpp @@ -1548,14 +1548,20 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, if (enableShardMove && tciIndex == 1) { ASSERT(physicalShardIDCandidate != UID().first() && physicalShardIDCandidate != anonymousShardId.first()); - Optional remoteTeamWithPhysicalShard = + std::pair, bool> remoteTeamWithPhysicalShard = self->physicalShardCollection->tryGetAvailableRemoteTeamWith( physicalShardIDCandidate, metrics, debugID); - // TODO: when we know that `physicalShardIDCandidate` exists, remote team must also exists. - if (remoteTeamWithPhysicalShard.present()) { + if (!remoteTeamWithPhysicalShard.second) { + // Physical shard with `physicalShardIDCandidate` is not available. Retry selecting new + // dst physical shard. 
+ self->retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]++; + foundTeams = false; + break; + } + if (remoteTeamWithPhysicalShard.first.present()) { // Exists a remoteTeam in the mapping that has the physicalShardIDCandidate // use the remoteTeam with the physicalShard as the bestTeam - req = GetTeamRequest(remoteTeamWithPhysicalShard.get().servers); + req = GetTeamRequest(remoteTeamWithPhysicalShard.first.get().servers); } } @@ -1853,19 +1859,35 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, state Error error = success(); state Promise dataMovementComplete; // Move keys from source to destination by changing the serverKeyList and keyServerList system keys - state Future doMoveKeys = - self->txnProcessor->moveKeys(MoveKeysParams{ rd.dataMoveId, - rd.keys, - destIds, - healthyIds, - self->lock, - dataMovementComplete, - &self->startMoveKeysParallelismLock, - &self->finishMoveKeysParallelismLock, - self->teamCollections.size() > 1, - relocateShardInterval.pairID, - ddEnabledState, - CancelConflictingDataMoves::False }); + std::unique_ptr params; + if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { + params = std::make_unique(rd.dataMoveId, + std::vector{ rd.keys }, + destIds, + healthyIds, + self->lock, + dataMovementComplete, + &self->startMoveKeysParallelismLock, + &self->finishMoveKeysParallelismLock, + self->teamCollections.size() > 1, + relocateShardInterval.pairID, + ddEnabledState, + CancelConflictingDataMoves::False); + } else { + params = std::make_unique(rd.dataMoveId, + rd.keys, + destIds, + healthyIds, + self->lock, + dataMovementComplete, + &self->startMoveKeysParallelismLock, + &self->finishMoveKeysParallelismLock, + self->teamCollections.size() > 1, + relocateShardInterval.pairID, + ddEnabledState, + CancelConflictingDataMoves::False); + } + state Future doMoveKeys = self->txnProcessor->moveKeys(*params); state Future pollHealth = signalledTransferComplete ? 
Never() : delay(SERVER_KNOBS->HEALTH_POLL_TIME, TaskPriority::DataDistributionLaunch); @@ -1878,19 +1900,35 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, healthyIds.insert(healthyIds.end(), extraIds.begin(), extraIds.end()); extraIds.clear(); ASSERT(totalIds == destIds.size()); // Sanity check the destIDs before we move keys - doMoveKeys = - self->txnProcessor->moveKeys(MoveKeysParams{ rd.dataMoveId, - rd.keys, - destIds, - healthyIds, - self->lock, - Promise(), - &self->startMoveKeysParallelismLock, - &self->finishMoveKeysParallelismLock, - self->teamCollections.size() > 1, - relocateShardInterval.pairID, - ddEnabledState, - CancelConflictingDataMoves::False }); + std::unique_ptr params; + if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { + params = std::make_unique(rd.dataMoveId, + std::vector{ rd.keys }, + destIds, + healthyIds, + self->lock, + Promise(), + &self->startMoveKeysParallelismLock, + &self->finishMoveKeysParallelismLock, + self->teamCollections.size() > 1, + relocateShardInterval.pairID, + ddEnabledState, + CancelConflictingDataMoves::False); + } else { + params = std::make_unique(rd.dataMoveId, + rd.keys, + destIds, + healthyIds, + self->lock, + Promise(), + &self->startMoveKeysParallelismLock, + &self->finishMoveKeysParallelismLock, + self->teamCollections.size() > 1, + relocateShardInterval.pairID, + ddEnabledState, + CancelConflictingDataMoves::False); + } + doMoveKeys = self->txnProcessor->moveKeys(*params); } else { self->fetchKeysComplete.insert(rd); if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { diff --git a/fdbserver/DDShardTracker.actor.cpp b/fdbserver/DDShardTracker.actor.cpp index fddbf25f14..69bb6c853a 100644 --- a/fdbserver/DDShardTracker.actor.cpp +++ b/fdbserver/DDShardTracker.actor.cpp @@ -1756,7 +1756,7 @@ InOverSizePhysicalShard PhysicalShardCollection::isInOverSizePhysicalShard(KeyRa } // May return a problematic remote team -Optional PhysicalShardCollection::tryGetAvailableRemoteTeamWith( +std::pair, bool> 
PhysicalShardCollection::tryGetAvailableRemoteTeamWith( uint64_t inputPhysicalShardID, StorageMetrics const& moveInMetrics, uint64_t debugID) { @@ -1764,10 +1764,10 @@ Optional PhysicalShardCollection::tryGetAvail ASSERT(SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD); ASSERT(inputPhysicalShardID != anonymousShardId.first() && inputPhysicalShardID != UID().first()); if (physicalShardInstances.count(inputPhysicalShardID) == 0) { - return Optional(); + return { Optional(), true }; } if (!checkPhysicalShardAvailable(inputPhysicalShardID, moveInMetrics)) { - return Optional(); + return { Optional(), false }; } for (auto team : physicalShardInstances[inputPhysicalShardID].teams) { if (team.primary == false) { @@ -1777,10 +1777,12 @@ Optional PhysicalShardCollection::tryGetAvail .detail("TeamSize", team.servers.size()) .detail("PhysicalShardsOfTeam", convertIDsToString(teamPhysicalShardIDs[team])) .detail("DebugID", debugID);*/ - return team; + return { team, true }; } } - UNREACHABLE(); + // In this case, the physical shard may not be populated in the remote region yet, e.g., we are making a + // configuration change to turn a single region cluster into HA mode. + return { Optional(), true }; } // The update of PhysicalShardToTeams, Collection, keyRangePhysicalShardIDMap should be atomic diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 9907f22784..404782717d 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -723,6 +723,17 @@ struct DDMockTxnProcessorImpl { return Void(); } + static Future rawCheckFetchingState(DDMockTxnProcessor* self, const MoveKeysParams& params) { + if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { + ASSERT(params.ranges.present()); + // TODO: make startMoveShards work with multiple ranges. 
+ ASSERT(params.ranges.get().size() == 1); + return checkFetchingState(self, params.destinationTeam, params.ranges.get().at(0)); + } + ASSERT(params.keys.present()); + return checkFetchingState(self, params.destinationTeam, params.keys.get()); + } + ACTOR static Future moveKeys(DDMockTxnProcessor* self, MoveKeysParams params) { state std::map tssMapping; // Because SFBTF::Team requires the ID is ordered @@ -732,7 +743,7 @@ struct DDMockTxnProcessorImpl { wait(self->rawStartMovement(params, tssMapping)); ASSERT(tssMapping.empty()); - wait(checkFetchingState(self, params.destinationTeam, params.keys)); + wait(rawCheckFetchingState(self, params)); wait(self->rawFinishMovement(params, tssMapping)); if (!params.dataMovementComplete.isSet()) @@ -915,6 +926,16 @@ Future> DDMockTxnProcessor::getWorkers() const { ACTOR Future rawStartMovement(std::shared_ptr mgs, MoveKeysParams params, std::map tssMapping) { + state KeyRange keys; + if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { + ASSERT(params.ranges.present()); + // TODO: make startMoveShards work with multiple ranges. + ASSERT(params.ranges.get().size() == 1); + keys = params.ranges.get().at(0); + } else { + ASSERT(params.keys.present()); + keys = params.keys.get(); + } // There won’t be parallel rawStart or rawFinish in mock world due to the fact the following *mock* transaction code // will always finish without coroutine switch. ASSERT(params.startMoveKeysParallelismLock->activePermits() == 0); @@ -925,15 +946,15 @@ ACTOR Future rawStartMovement(std::shared_ptr mgs, destTeams.emplace_back(params.destinationTeam, true); // invariant: the splitting and merge operation won't happen at the same moveKeys action. For example, if [a,c) [c, // e) exists, the params.keys won't be [b, d). - auto intersectRanges = mgs->shardMapping->intersectingRanges(params.keys); + auto intersectRanges = mgs->shardMapping->intersectingRanges(keys); // 1. splitting or just move a range. 
The new boundary need to be defined in startMovement - if (intersectRanges.begin().range().contains(params.keys)) { - mgs->shardMapping->defineShard(params.keys); + if (intersectRanges.begin().range().contains(keys)) { + mgs->shardMapping->defineShard(keys); } // 2. merge ops will coalesce the boundary in finishMovement; - intersectRanges = mgs->shardMapping->intersectingRanges(params.keys); - ASSERT(params.keys.begin == intersectRanges.begin().begin()); - ASSERT(params.keys.end == intersectRanges.end().begin()); + intersectRanges = mgs->shardMapping->intersectingRanges(keys); + ASSERT(keys.begin == intersectRanges.begin().begin()); + ASSERT(keys.end == intersectRanges.end().begin()); for (auto it = intersectRanges.begin(); it != intersectRanges.end(); ++it) { auto teamPair = mgs->shardMapping->getTeamsFor(it->begin()); @@ -945,8 +966,8 @@ ACTOR Future rawStartMovement(std::shared_ptr mgs, deterministicRandom()->randomInt64(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES); for (auto& id : params.destinationTeam) { auto& server = mgs->allServers.at(id); - server.setShardStatus(params.keys, MockShardStatus::INFLIGHT, mgs->restrictSize); - server.signalFetchKeys(params.keys, randomRangeSize); + server.setShardStatus(keys, MockShardStatus::INFLIGHT, mgs->restrictSize); + server.signalFetchKeys(keys, randomRangeSize); } return Void(); } @@ -959,6 +980,17 @@ Future DDMockTxnProcessor::rawStartMovement(const MoveKeysParams& params, ACTOR Future rawFinishMovement(std::shared_ptr mgs, MoveKeysParams params, std::map tssMapping) { + state KeyRange keys; + if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { + ASSERT(params.ranges.present()); + // TODO: make startMoveShards work with multiple ranges. 
+ ASSERT(params.ranges.get().size() == 1); + keys = params.ranges.get().at(0); + } else { + ASSERT(params.keys.present()); + keys = params.keys.get(); + } + // There won’t be parallel rawStart or rawFinish in mock world due to the fact the following *mock* transaction code // will always finish without coroutine switch. ASSERT(params.finishMoveKeysParallelismLock->activePermits() == 0); @@ -966,7 +998,7 @@ ACTOR Future rawFinishMovement(std::shared_ptr mgs, state FlowLock::Releaser releaser(*params.finishMoveKeysParallelismLock); // get source and dest teams - auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsForFirstShard(params.keys); + auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsForFirstShard(keys); ASSERT_EQ(destTeams.size(), 1); // Will the multi-region or dynamic replica make destTeam.size() > 1? if (destTeams.front() != ShardsAffectedByTeamFailure::Team{ params.destinationTeam, true }) { @@ -978,7 +1010,7 @@ ACTOR Future rawFinishMovement(std::shared_ptr mgs, } for (auto& id : params.destinationTeam) { - mgs->allServers.at(id).setShardStatus(params.keys, MockShardStatus::COMPLETED, mgs->restrictSize); + mgs->allServers.at(id).setShardStatus(keys, MockShardStatus::COMPLETED, mgs->restrictSize); } // remove destination servers from source servers @@ -986,11 +1018,11 @@ ACTOR Future rawFinishMovement(std::shared_ptr mgs, for (auto& id : srcTeams.front().servers) { // the only caller moveKeys will always make sure the UID are sorted if (!std::binary_search(params.destinationTeam.begin(), params.destinationTeam.end(), id)) { - mgs->allServers.at(id).removeShard(params.keys); + mgs->allServers.at(id).removeShard(keys); } } - mgs->shardMapping->finishMove(params.keys); - mgs->shardMapping->defineShard(params.keys); // coalesce for merge + mgs->shardMapping->finishMove(keys); + mgs->shardMapping->defineShard(keys); // coalesce for merge return Void(); } diff --git a/fdbserver/GrvProxyServer.actor.cpp b/fdbserver/GrvProxyServer.actor.cpp index 
49fbe4445b..cac2bc24d1 100644 --- a/fdbserver/GrvProxyServer.actor.cpp +++ b/fdbserver/GrvProxyServer.actor.cpp @@ -134,9 +134,9 @@ struct GrvProxyStats { recentRequests(0), lastBucketBegin(now()), bucketInterval(FLOW_KNOBS->BASIC_LOAD_BALANCE_UPDATE_RATE / FLOW_KNOBS->BASIC_LOAD_BALANCE_BUCKETS), grvConfirmEpochLiveDist( - Histogram::getHistogram("GrvProxy"_sr, "GrvConfirmEpochLive"_sr, Histogram::Unit::microseconds)), + Histogram::getHistogram("GrvProxy"_sr, "GrvConfirmEpochLive"_sr, Histogram::Unit::milliseconds)), grvGetCommittedVersionRpcDist( - Histogram::getHistogram("GrvProxy"_sr, "GrvGetCommittedVersionRpc"_sr, Histogram::Unit::microseconds)) { + Histogram::getHistogram("GrvProxy"_sr, "GrvGetCommittedVersionRpc"_sr, Histogram::Unit::milliseconds)) { // The rate at which the limit(budget) is allowed to grow. specialCounter(cc, "SystemGRVQueueSize", [this]() { return this->systemGRVQueueSize; }); specialCounter(cc, "DefaultGRVQueueSize", [this]() { return this->defaultGRVQueueSize; }); diff --git a/fdbserver/KeyValueStoreRocksDB.actor.cpp b/fdbserver/KeyValueStoreRocksDB.actor.cpp index b7f435d967..3053e38e31 100644 --- a/fdbserver/KeyValueStoreRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreRocksDB.actor.cpp @@ -68,12 +68,9 @@ #ifdef SSD_ROCKSDB_EXPERIMENTAL -// Enforcing rocksdb version to be 6.27.3 or greater. -static_assert(ROCKSDB_MAJOR >= 6, "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version"); -static_assert(ROCKSDB_MAJOR == 6 ? ROCKSDB_MINOR >= 27 : true, - "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version"); -static_assert((ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR == 27) ? ROCKSDB_PATCH >= 3 : true, - "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version"); +// Enforcing rocksdb version to be 7.7.3. +static_assert((ROCKSDB_MAJOR == 7 && ROCKSDB_MINOR == 7 && ROCKSDB_PATCH == 3), + "Unsupported rocksdb version. 
Update the rocksdb to 7.7.3 version"); namespace { using rocksdb::BackgroundErrorReason; @@ -901,6 +898,7 @@ ACTOR Future rocksDBMetricLogger(UID id, }; // To control the rocksdb::StatsLevel, use ROCKSDB_STATS_LEVEL knob. + // Refer StatsLevel: https://github.com/facebook/rocksdb/blob/main/include/rocksdb/statistics.h#L594 state std::vector> histogramStats = { { "CompactionTime", rocksdb::COMPACTION_TIME }, // enabled if rocksdb::StatsLevel > kExceptTimers(2) { "CompactionCPUTime", rocksdb::COMPACTION_CPU_TIME }, // enabled if rocksdb::StatsLevel > kExceptTimers(2) @@ -970,6 +968,7 @@ ACTOR Future rocksDBMetricLogger(UID id, } // None of the histogramStats are enabled unless the ROCKSDB_STATS_LEVEL > kExceptHistogramOrTimers(1) + // Refer StatsLevel: https://github.com/facebook/rocksdb/blob/main/include/rocksdb/statistics.h#L594 if (SERVER_KNOBS->ROCKSDB_STATS_LEVEL > rocksdb::kExceptHistogramOrTimers) { for (auto& [name, histogram] : histogramStats) { rocksdb::HistogramData histogram_data; @@ -1031,7 +1030,10 @@ void logRocksDBError(UID id, Optional sev = Optional()) { Severity level = sev.present() ? sev.get() : (status.IsTimedOut() ? 
SevWarn : SevError); TraceEvent e(level, "RocksDBError", id); - e.detail("Error", status.ToString()).detail("Method", method).detail("RocksDBSeverity", status.severity()); + e.setMaxFieldLength(10000) + .detail("Error", status.ToString()) + .detail("Method", method) + .detail("RocksDBSeverity", status.severity()); if (status.IsIOError()) { e.detail("SubCode", status.subcode()); } @@ -1253,15 +1255,18 @@ struct RocksDBKeyValueStore : IKeyValueStore { std::make_pair(ROCKSDB_COMMIT_QUEUEWAIT_HISTOGRAM.toString(), commitBeginTime - a.startTime)); } Standalone> deletes; - DeleteVisitor dv(deletes, deletes.arena()); - rocksdb::Status s = a.batchToCommit->Iterate(&dv); - if (!s.ok()) { - logRocksDBError(id, s, "CommitDeleteVisitor"); - a.done.sendError(statusToError(s)); - return; + if (SERVER_KNOBS->ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE) { + DeleteVisitor dv(deletes, deletes.arena()); + rocksdb::Status s = a.batchToCommit->Iterate(&dv); + if (!s.ok()) { + logRocksDBError(id, s, "CommitDeleteVisitor"); + a.done.sendError(statusToError(s)); + return; + } + // If there are any range deletes, we should have added them to be deleted. + ASSERT(!deletes.empty() || !a.batchToCommit->HasDeleteRange()); } - // If there are any range deletes, we should have added them to be deleted. - ASSERT(!deletes.empty() || !a.batchToCommit->HasDeleteRange()); + rocksdb::WriteOptions options; options.sync = !SERVER_KNOBS->ROCKSDB_UNSAFE_AUTO_FSYNC; if (SERVER_KNOBS->ROCKSDB_DISABLE_WAL_EXPERIMENTAL) { @@ -1275,7 +1280,7 @@ struct RocksDBKeyValueStore : IKeyValueStore { // Request for batchToCommit bytes. If this request cannot be satisfied, the call is blocked. 
rateLimiter->Request(a.batchToCommit->GetDataSize() /* bytes */, rocksdb::Env::IO_HIGH); } - s = db->Write(options, a.batchToCommit.get()); + rocksdb::Status s = db->Write(options, a.batchToCommit.get()); readIterPool->update(); double currTime = timer_monotonic(); sharedState->dbWriteLatency.addMeasurement(currTime - writeBeginTime); @@ -1402,17 +1407,11 @@ struct RocksDBKeyValueStore : IKeyValueStore { ThreadReturnPromiseStream>* metricPromiseStream) : id(id), db(db), cf(cf), sharedState(sharedState), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics), metricPromiseStream(metricPromiseStream), threadIndex(threadIndex) { - if (g_network->isSimulated()) { - // In simulation, increasing the read operation timeouts to 5 minutes, as some of the tests have - // very high load and single read thread cannot process all the load within the timeouts. - readValueTimeout = 5 * 60; - readValuePrefixTimeout = 5 * 60; - readRangeTimeout = 5 * 60; - } else { - readValueTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_TIMEOUT; - readValuePrefixTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_PREFIX_TIMEOUT; - readRangeTimeout = SERVER_KNOBS->ROCKSDB_READ_RANGE_TIMEOUT; - } + + readValueTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_TIMEOUT; + readValuePrefixTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_PREFIX_TIMEOUT; + readRangeTimeout = SERVER_KNOBS->ROCKSDB_READ_RANGE_TIMEOUT; + if (SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE) { // Enable perf context on the same thread with the db thread rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex); @@ -1792,39 +1791,39 @@ struct RocksDBKeyValueStore : IKeyValueStore { ACTOR Future updateHistogram(FutureStream> metricFutureStream) { state Reference commitLatencyHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_LATENCY_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds); state Reference 
commitActionHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_ACTION_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_ACTION_HISTOGRAM, Histogram::Unit::milliseconds); state Reference commitQueueWaitHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds); state Reference writeHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_WRITE_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_WRITE_HISTOGRAM, Histogram::Unit::milliseconds); state Reference deleteCompactRangeHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_DELETE_COMPACTRANGE_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_DELETE_COMPACTRANGE_HISTOGRAM, Histogram::Unit::milliseconds); state Reference readRangeLatencyHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_LATENCY_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds); state Reference readValueLatencyHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_LATENCY_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds); state Reference readPrefixLatencyHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_LATENCY_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds); state Reference readRangeActionHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, 
ROCKSDB_READRANGE_ACTION_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_ACTION_HISTOGRAM, Histogram::Unit::milliseconds); state Reference readValueActionHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_ACTION_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_ACTION_HISTOGRAM, Histogram::Unit::milliseconds); state Reference readPrefixActionHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_ACTION_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_ACTION_HISTOGRAM, Histogram::Unit::milliseconds); state Reference readRangeQueueWaitHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds); state Reference readValueQueueWaitHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds); state Reference readPrefixQueueWaitHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds); state Reference readRangeNewIteratorHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM, Histogram::Unit::milliseconds); state Reference readValueGetHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_GET_HISTOGRAM, 
Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_GET_HISTOGRAM, Histogram::Unit::milliseconds); state Reference readPrefixGetHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_GET_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_GET_HISTOGRAM, Histogram::Unit::milliseconds); loop { choose { when(std::pair measure = waitNext(metricFutureStream)) { diff --git a/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp b/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp index 53edcc1d95..fec2bf6167 100644 --- a/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp @@ -41,12 +41,9 @@ #ifdef SSD_ROCKSDB_EXPERIMENTAL -// Enforcing rocksdb version to be 6.27.3 or greater. -static_assert(ROCKSDB_MAJOR >= 6, "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version"); -static_assert(ROCKSDB_MAJOR == 6 ? ROCKSDB_MINOR >= 27 : true, - "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version"); -static_assert((ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR == 27) ? ROCKSDB_PATCH >= 3 : true, - "Unsupported rocksdb version. Update the rocksdb to 6.27.3 version"); +// Enforcing rocksdb version to be 7.7.3. +static_assert((ROCKSDB_MAJOR == 7 && ROCKSDB_MINOR == 7 && ROCKSDB_PATCH == 3), + "Unsupported rocksdb version. Update the rocksdb to 7.7.3 version"); const std::string rocksDataFolderSuffix = "-data"; const std::string METADATA_SHARD_ID = "kvs-metadata"; @@ -170,7 +167,10 @@ std::string getShardMappingKey(KeyRef key, StringRef prefix) { void logRocksDBError(const rocksdb::Status& status, const std::string& method) { auto level = status.IsTimedOut() ? 
SevWarn : SevError; TraceEvent e(level, "ShardedRocksDBError"); - e.detail("Error", status.ToString()).detail("Method", method).detail("ShardedRocksDBSeverity", status.severity()); + e.setMaxFieldLength(10000) + .detail("Error", status.ToString()) + .detail("Method", method) + .detail("ShardedRocksDBSeverity", status.severity()); if (status.IsIOError()) { e.detail("SubCode", status.subcode()); } @@ -449,7 +449,8 @@ struct DataShard { // PhysicalShard represent a collection of logical shards. A PhysicalShard could have one or more DataShards. A // PhysicalShard is stored as a column family in rocksdb. Each PhysicalShard has its own iterator pool. struct PhysicalShard { - PhysicalShard(rocksdb::DB* db, std::string id) : db(db), id(id), isInitialized(false) {} + PhysicalShard(rocksdb::DB* db, std::string id, const rocksdb::ColumnFamilyOptions& options) + : db(db), id(id), cfOptions(options), isInitialized(false) {} PhysicalShard(rocksdb::DB* db, std::string id, rocksdb::ColumnFamilyHandle* handle) : db(db), id(id), cf(handle), isInitialized(true) { ASSERT(cf); @@ -460,7 +461,7 @@ struct PhysicalShard { if (cf) { return rocksdb::Status::OK(); } - auto status = db->CreateColumnFamily(getCFOptions(), id, &cf); + auto status = db->CreateColumnFamily(cfOptions, id, &cf); if (!status.ok()) { logRocksDBError(status, "AddCF"); return status; @@ -516,6 +517,7 @@ struct PhysicalShard { rocksdb::DB* db; std::string id; + rocksdb::ColumnFamilyOptions cfOptions; rocksdb::ColumnFamilyHandle* cf = nullptr; std::unordered_map> dataShards; std::shared_ptr readIterPool; @@ -586,7 +588,8 @@ int readRangeInDb(PhysicalShard* shard, const KeyRangeRef range, int rowLimit, i // Manages physical shards and maintains logical shard mapping. 
class ShardManager { public: - ShardManager(std::string path, UID logId) : path(path), logId(logId), dataShardMap(nullptr, specialKeys.end) {} + ShardManager(std::string path, UID logId, const rocksdb::Options& options) + : path(path), logId(logId), dbOptions(options), dataShardMap(nullptr, specialKeys.end) {} ACTOR static Future shardMetricsLogger(std::shared_ptr rState, Future openFuture, @@ -637,31 +640,31 @@ public: return Void(); } - rocksdb::Status init(rocksdb::Options options) { + rocksdb::Status init() { // Open instance. TraceEvent(SevInfo, "ShardedRocksShardManagerInitBegin", this->logId).detail("DataPath", path); std::vector columnFamilies; - rocksdb::Status status = rocksdb::DB::ListColumnFamilies(options, path, &columnFamilies); + rocksdb::Status status = rocksdb::DB::ListColumnFamilies(dbOptions, path, &columnFamilies); - rocksdb::ColumnFamilyOptions cfOptions = getCFOptions(); std::vector descriptors; bool foundMetadata = false; for (const auto& name : columnFamilies) { if (name == METADATA_SHARD_ID) { foundMetadata = true; } - descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, cfOptions }); + descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, rocksdb::ColumnFamilyOptions(dbOptions) }); } ASSERT(foundMetadata || descriptors.size() == 0); // Add default column family if it's a newly opened database. if (descriptors.size() == 0) { - descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ "default", cfOptions }); + descriptors.push_back( + rocksdb::ColumnFamilyDescriptor{ "default", rocksdb::ColumnFamilyOptions(dbOptions) }); } std::vector handles; - status = rocksdb::DB::Open(options, path, descriptors, &handles, &db); + status = rocksdb::DB::Open(dbOptions, path, descriptors, &handles, &db); if (!status.ok()) { logRocksDBError(status, "Open"); return status; @@ -766,7 +769,8 @@ public: physicalShards[defaultShard->id] = defaultShard; // Create metadata shard. 
- auto metadataShard = std::make_shared(db, METADATA_SHARD_ID); + auto metadataShard = + std::make_shared(db, METADATA_SHARD_ID, rocksdb::ColumnFamilyOptions(dbOptions)); metadataShard->init(); columnFamilyMap[metadataShard->cf->GetID()] = metadataShard->cf; physicalShards[METADATA_SHARD_ID] = metadataShard; @@ -832,7 +836,8 @@ public: } } - auto [it, inserted] = physicalShards.emplace(id, std::make_shared(db, id)); + auto [it, inserted] = physicalShards.emplace( + id, std::make_shared(db, id, rocksdb::ColumnFamilyOptions(dbOptions))); std::shared_ptr& shard = it->second; activePhysicalShardIds.emplace(id); @@ -1146,6 +1151,7 @@ public: private: const std::string path; const UID logId; + rocksdb::Options dbOptions; rocksdb::DB* db = nullptr; std::unordered_map> physicalShards; std::unordered_set activePhysicalShardIds; @@ -1421,40 +1427,40 @@ RocksDBMetrics::RocksDBMetrics(UID debugID, std::shared_ptr } for (int i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; i++) { readRangeLatencyHistograms.push_back(Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_LATENCY_HISTOGRAM, Histogram::Unit::microseconds)); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds)); readValueLatencyHistograms.push_back(Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_LATENCY_HISTOGRAM, Histogram::Unit::microseconds)); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds)); readPrefixLatencyHistograms.push_back(Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_LATENCY_HISTOGRAM, Histogram::Unit::microseconds)); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds)); readRangeActionHistograms.push_back(Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_ACTION_HISTOGRAM, Histogram::Unit::microseconds)); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, 
ROCKSDB_READRANGE_ACTION_HISTOGRAM, Histogram::Unit::milliseconds)); readValueActionHistograms.push_back(Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_ACTION_HISTOGRAM, Histogram::Unit::microseconds)); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_ACTION_HISTOGRAM, Histogram::Unit::milliseconds)); readPrefixActionHistograms.push_back(Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_ACTION_HISTOGRAM, Histogram::Unit::microseconds)); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_ACTION_HISTOGRAM, Histogram::Unit::milliseconds)); readRangeQueueWaitHistograms.push_back(Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds)); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds)); readValueQueueWaitHistograms.push_back(Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds)); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds)); readPrefixQueueWaitHistograms.push_back(Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds)); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds)); readRangeNewIteratorHistograms.push_back(Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM, Histogram::Unit::microseconds)); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM, Histogram::Unit::milliseconds)); readValueGetHistograms.push_back(Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_GET_HISTOGRAM, Histogram::Unit::microseconds)); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READVALUE_GET_HISTOGRAM, Histogram::Unit::milliseconds)); 
readPrefixGetHistograms.push_back(Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_GET_HISTOGRAM, Histogram::Unit::microseconds)); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READPREFIX_GET_HISTOGRAM, Histogram::Unit::milliseconds)); } commitLatencyHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_LATENCY_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_LATENCY_HISTOGRAM, Histogram::Unit::milliseconds); commitActionHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_ACTION_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_ACTION_HISTOGRAM, Histogram::Unit::milliseconds); commitQueueWaitHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_QUEUEWAIT_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_COMMIT_QUEUEWAIT_HISTOGRAM, Histogram::Unit::milliseconds); writeHistogram = - Histogram::getHistogram(ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_WRITE_HISTOGRAM, Histogram::Unit::microseconds); + Histogram::getHistogram(ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_WRITE_HISTOGRAM, Histogram::Unit::milliseconds); deleteCompactRangeHistogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_DELETE_COMPACTRANGE_HISTOGRAM, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_DELETE_COMPACTRANGE_HISTOGRAM, Histogram::Unit::milliseconds); } void RocksDBMetrics::logStats(rocksdb::DB* db) { @@ -1689,7 +1695,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { Future readyToStart, std::unordered_map>* physicalShards) { state Reference histogram = Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, "TimeSpentRefreshIterators"_sr, Histogram::Unit::microseconds); + ROCKSDBSTORAGE_HISTOGRAM_GROUP, "TimeSpentRefreshIterators"_sr, Histogram::Unit::milliseconds); if 
(SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) { try { @@ -1755,7 +1761,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { struct OpenAction : TypedAction { ShardManager* shardManager; - rocksdb::Options dbOptions; ThreadReturnPromise done; Optional>& metrics; const FlowLock* readLock; @@ -1763,19 +1768,18 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { std::shared_ptr errorListener; OpenAction(ShardManager* shardManager, - rocksdb::Options dbOptions, Optional>& metrics, const FlowLock* readLock, const FlowLock* fetchLock, std::shared_ptr errorListener) - : shardManager(shardManager), dbOptions(dbOptions), metrics(metrics), readLock(readLock), - fetchLock(fetchLock), errorListener(errorListener) {} + : shardManager(shardManager), metrics(metrics), readLock(readLock), fetchLock(fetchLock), + errorListener(errorListener) {} double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; } }; void action(OpenAction& a) { - auto status = a.shardManager->init(a.dbOptions); + auto status = a.shardManager->init(); if (!status.ok()) { logRocksDBError(status, "Open"); @@ -1886,21 +1890,23 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { rocksdb::DB* db, std::vector>* deletes, bool sample) { - DeleteVisitor dv(deletes); - rocksdb::Status s = batch->Iterate(&dv); - if (!s.ok()) { - logRocksDBError(s, "CommitDeleteVisitor"); - return s; - } + if (SERVER_KNOBS->ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE) { + DeleteVisitor dv(deletes); + rocksdb::Status s = batch->Iterate(&dv); + if (!s.ok()) { + logRocksDBError(s, "CommitDeleteVisitor"); + return s; + } - // If there are any range deletes, we should have added them to be deleted. - ASSERT(!deletes->empty() || !batch->HasDeleteRange()); + // If there are any range deletes, we should have added them to be deleted. 
+ ASSERT(!deletes->empty() || !batch->HasDeleteRange()); + } rocksdb::WriteOptions options; options.sync = !SERVER_KNOBS->ROCKSDB_UNSAFE_AUTO_FSYNC; double writeBeginTime = sample ? timer_monotonic() : 0; - s = db->Write(options, batch); + rocksdb::Status s = db->Write(options, batch); if (sample) { rocksDBMetrics->getWriteHistogram()->sampleSeconds(timer_monotonic() - writeBeginTime); } @@ -2280,7 +2286,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { numReadWaiters(SERVER_KNOBS->ROCKSDB_READ_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX), numFetchWaiters(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX), errorListener(std::make_shared()), errorFuture(errorListener->getFuture()), - shardManager(path, id), dbOptions(getOptions()), + dbOptions(getOptions()), shardManager(path, id, dbOptions), rocksDBMetrics(std::make_shared(id, dbOptions.statistics)) { // In simluation, run the reader/writer threads as Coro threads (i.e. in the network thread. The storage // engine is still multi-threaded as background compaction threads are still present. Reads/writes to disk @@ -2347,7 +2353,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { // mapping data. 
} else { auto a = std::make_unique( - &shardManager, dbOptions, metrics, &readSemaphore, &fetchSemaphore, errorListener); + &shardManager, metrics, &readSemaphore, &fetchSemaphore, errorListener); openFuture = a->done.getFuture(); this->metrics = ShardManager::shardMetricsLogger(this->rState, openFuture, &shardManager) && rocksDBAggregatedMetricsLogger(this->rState, openFuture, rocksDBMetrics, &shardManager); @@ -2581,8 +2587,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { std::vector> getDataMapping() { return shardManager.getDataMapping(); } std::shared_ptr rState; - ShardManager shardManager; rocksdb::Options dbOptions; + ShardManager shardManager; std::shared_ptr rocksDBMetrics; std::string path; UID id; diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 3c0cc68a70..5f2b616df6 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -138,7 +138,7 @@ struct LogRouterData { : dbgid(dbgid), logSystem(new AsyncVar>()), version(req.startVersion - 1), minPopped(0), startVersion(req.startVersion), minKnownCommittedVersion(0), poppedVersion(0), routerTag(req.routerTag), allowPops(false), foundEpochEnd(false), generation(req.recoveryCount), - peekLatencyDist(Histogram::getHistogram("LogRouter"_sr, "PeekTLogLatency"_sr, Histogram::Unit::microseconds)), + peekLatencyDist(Histogram::getHistogram("LogRouter"_sr, "PeekTLogLatency"_sr, Histogram::Unit::milliseconds)), cc("LogRouter", dbgid.toString()), getMoreCount("GetMoreCount", cc), getMoreBlockedCount("GetMoreBlockedCount", cc) { // setup just enough of a logSet to be able to call getPushLocations diff --git a/fdbserver/LogSystem.cpp b/fdbserver/LogSystem.cpp index d9dcd7da63..f99af16768 100644 --- a/fdbserver/LogSystem.cpp +++ b/fdbserver/LogSystem.cpp @@ -375,7 +375,7 @@ bool LogPushData::writeTransactionInfo(int location, uint32_t subseq) { // parent->child. 
SpanContextMessage contextMessage; if (spanContext.isSampled()) { - CODE_PROBE(true, "Converting OTELSpanContextMessage to traced SpanContextMessage", probe::decoration::rare); + CODE_PROBE(true, "Converting OTELSpanContextMessage to traced SpanContextMessage"); contextMessage = SpanContextMessage(UID(spanContext.traceID.first(), spanContext.traceID.second())); } else { CODE_PROBE(true, "Converting OTELSpanContextMessage to untraced SpanContextMessage"); diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 5090005c3c..97080ab14c 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -1241,7 +1241,7 @@ ACTOR static Future finishMoveKeys(Database occ, // Set dataMoves[dataMoveId] = DataMoveMetaData. ACTOR static Future startMoveShards(Database occ, UID dataMoveId, - KeyRange keys, + std::vector ranges, std::vector servers, MoveKeysLock lock, FlowLock* startMoveKeysLock, @@ -1257,8 +1257,11 @@ ACTOR static Future startMoveShards(Database occ, TraceEvent(SevDebug, "StartMoveShardsBegin", relocationIntervalId) .detail("DataMoveID", dataMoveId) - .detail("TargetRange", keys); + .detail("TargetRange", describe(ranges)); + // TODO: make startMoveShards work with multiple ranges. + ASSERT(ranges.size() == 1); + state KeyRangeRef keys = ranges[0]; try { state Key begin = keys.begin; state KeyRange currentKeys = keys; @@ -1576,7 +1579,7 @@ ACTOR static Future checkDataMoveComplete(Database occ, UID dataMoveId, Ke // Clear dataMoves[dataMoveId]. 
ACTOR static Future finishMoveShards(Database occ, UID dataMoveId, - KeyRange targetKeys, + std::vector targetRanges, std::vector destinationTeam, MoveKeysLock lock, FlowLock* finishMoveKeysParallelismLock, @@ -1585,7 +1588,10 @@ ACTOR static Future finishMoveShards(Database occ, std::map tssMapping, const DDEnabledState* ddEnabledState) { ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA); - state KeyRange keys = targetKeys; + + // TODO: make startMoveShards work with multiple ranges. + ASSERT(targetRanges.size() == 1); + state KeyRange keys = targetRanges[0]; state Future warningLogger = logWarningAfter("FinishMoveShardsTooLong", 600, destinationTeam); state int retries = 0; state DataMoveMetaData dataMove; @@ -1636,7 +1642,7 @@ ACTOR static Future finishMoveShards(Database occ, } else { TraceEvent(SevWarn, "FinishMoveShardsDataMoveDeleted", relocationIntervalId) .detail("DataMoveID", dataMoveId); - wait(checkDataMoveComplete(occ, dataMoveId, targetKeys, relocationIntervalId)); + wait(checkDataMoveComplete(occ, dataMoveId, keys, relocationIntervalId)); return Void(); } @@ -2485,9 +2491,10 @@ Future rawStartMovement(Database occ, const MoveKeysParams& params, std::map& tssMapping) { if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { + ASSERT(params.ranges.present()); return startMoveShards(std::move(occ), params.dataMoveId, - params.keys, + params.ranges.get(), params.destinationTeam, params.lock, params.startMoveKeysParallelismLock, @@ -2495,8 +2502,9 @@ Future rawStartMovement(Database occ, params.ddEnabledState, params.cancelConflictingDataMoves); } + ASSERT(params.keys.present()); return startMoveKeys(std::move(occ), - params.keys, + params.keys.get(), params.destinationTeam, params.lock, params.startMoveKeysParallelismLock, @@ -2505,13 +2513,37 @@ Future rawStartMovement(Database occ, params.ddEnabledState); } +Future rawCheckFetchingState(const Database& cx, + const MoveKeysParams& params, + const std::map& tssMapping) { + if 
(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { + ASSERT(params.ranges.present()); + // TODO: make startMoveShards work with multiple ranges. + ASSERT(params.ranges.get().size() == 1); + return checkFetchingState(cx, + params.healthyDestinations, + params.ranges.get().at(0), + params.dataMovementComplete, + params.relocationIntervalId, + tssMapping); + } + ASSERT(params.keys.present()); + return checkFetchingState(cx, + params.healthyDestinations, + params.keys.get(), + params.dataMovementComplete, + params.relocationIntervalId, + tssMapping); +} + Future rawFinishMovement(Database occ, const MoveKeysParams& params, const std::map& tssMapping) { if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { + ASSERT(params.ranges.present()); return finishMoveShards(std::move(occ), params.dataMoveId, - params.keys, + params.ranges.get(), params.destinationTeam, params.lock, params.finishMoveKeysParallelismLock, @@ -2520,8 +2552,9 @@ Future rawFinishMovement(Database occ, tssMapping, params.ddEnabledState); } + ASSERT(params.keys.present()); return finishMoveKeys(std::move(occ), - params.keys, + params.keys.get(), params.destinationTeam, params.lock, params.finishMoveKeysParallelismLock, @@ -2539,12 +2572,7 @@ ACTOR Future moveKeys(Database occ, MoveKeysParams params) { wait(rawStartMovement(occ, params, tssMapping)); - state Future completionSignaller = checkFetchingState(occ, - params.healthyDestinations, - params.keys, - params.dataMovementComplete, - params.relocationIntervalId, - tssMapping); + state Future completionSignaller = rawCheckFetchingState(occ, params, tssMapping); wait(rawFinishMovement(occ, params, tssMapping)); diff --git a/fdbserver/MutationTracking.cpp b/fdbserver/MutationTracking.cpp index 888a8052b0..6d8eebd9d2 100644 --- a/fdbserver/MutationTracking.cpp +++ b/fdbserver/MutationTracking.cpp @@ -98,7 +98,6 @@ TraceEvent debugTagsAndMessageEnabled(const char* context, Version version, Stri SpanContextMessage scm; br >> scm; } else if 
(OTELSpanContextMessage::startsOTELSpanContextMessage(mutationType)) { - CODE_PROBE(true, "MutationTracking reading OTELSpanContextMessage", probe::decoration::rare); BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion())); OTELSpanContextMessage scm; br >> scm; diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index 65d181873a..0dc214d29f 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -1633,7 +1633,7 @@ ACTOR Future tLog(IKeyValueStore* persistentData, TraceEvent("SharedTlog", tlogId).detail("Version", "4.6"); try { - wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION)); + wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, "TLogInit")); wait(restorePersistentState(&self, locality)); self.sharedActors.send(cleanupPeekTrackers(&self)); diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index a346d214ff..4ebac5dc6d 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1484,7 +1484,7 @@ ACTOR Future doQueueCommit(TLogData* self, self->largeDiskQueueCommitBytes.set(false); wait(ioDegradedOrTimeoutError( - c, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME, self->degraded, SERVER_KNOBS->TLOG_DEGRADED_DURATION)); + c, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME, self->degraded, SERVER_KNOBS->TLOG_DEGRADED_DURATION, "TLogCommit")); if (g_network->isSimulated() && !g_simulator->speedUpSimulation && BUGGIFY_WITH_PROB(0.0001)) { wait(delay(6.0)); } @@ -1701,7 +1701,7 @@ ACTOR Future initPersistentState(TLogData* self, Reference logDat } TraceEvent("TLogInitCommit", logData->logId).log(); - wait(ioTimeoutError(self->persistentData->commit(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION)); + wait(ioTimeoutError(self->persistentData->commit(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, "TLogCommit")); return Void(); } @@ -2801,13 +2801,13 @@ ACTOR 
Future tLog(IKeyValueStore* persistentData, TraceEvent("SharedTlog", tlogId).detail("Version", "6.0"); try { - wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION)); + wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, "TLogInit")); if (restoreFromDisk) { wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests)); } else { - wait(ioTimeoutError(checkEmptyQueue(&self) && checkRecovered(&self), - SERVER_KNOBS->TLOG_MAX_CREATE_DURATION)); + wait(ioTimeoutError( + checkEmptyQueue(&self) && checkRecovered(&self), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, "TLogInit")); } // Disk errors need a chance to kill this actor. diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index c0261622bb..248bfe6b3e 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -3291,7 +3291,7 @@ ACTOR Future tLog(IKeyValueStore* persistentData, TraceEvent("SharedTlog", tlogId).detail("Version", "6.2"); try { - wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION)); + wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, "TLogInit")); if (restoreFromDisk) { wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests)); diff --git a/fdbserver/PaxosConfigConsumer.actor.cpp b/fdbserver/PaxosConfigConsumer.actor.cpp index 3515c1db5e..1d476cd749 100644 --- a/fdbserver/PaxosConfigConsumer.actor.cpp +++ b/fdbserver/PaxosConfigConsumer.actor.cpp @@ -487,12 +487,12 @@ class PaxosConfigConsumerImpl { .detail("LargestLiveVersion", self->getCommittedVersionQuorum.getLargestLive()) .detail("SmallestCommitted", smallestCommitted); ASSERT_GE(committedVersion, self->lastSeenVersion); - self->lastSeenVersion = committedVersion; + self->lastSeenVersion = std::max(self->lastSeenVersion, committedVersion); self->compactionVersion = std::max(self->compactionVersion, 
smallestCommitted); broadcaster->applySnapshotAndChanges(std::move(reply.snapshot), reply.snapshotVersion, reply.changes, - committedVersion, + self->lastSeenVersion, reply.annotations, self->getCommittedVersionQuorum.getReadReplicas(), self->getCommittedVersionQuorum.getLargestLive(), @@ -534,6 +534,13 @@ class PaxosConfigConsumerImpl { if (committedVersion > self->lastSeenVersion) { ASSERT(self->getCommittedVersionQuorum.getReadReplicas().size() >= self->cfis.size() / 2 + 1 || self->getCommittedVersionQuorum.isSpecialZeroQuorum()); + if (BUGGIFY) { + // Inject a random delay between getting the committed + // version and reading any changes. The goal is to + // allow attrition to occasionally kill ConfigNodes in + // this in-between state. + wait(delay(deterministicRandom()->random01() * 5)); + } state std::vector readReplicas = self->getCommittedVersionQuorum.getReadReplicas(); std::vector> fs; @@ -567,7 +574,7 @@ class PaxosConfigConsumerImpl { Version smallestCommitted = self->getCommittedVersionQuorum.getSmallestCommitted(); self->compactionVersion = std::max(self->compactionVersion, smallestCommitted); broadcaster->applyChanges(reply.changes, - committedVersion, + self->lastSeenVersion, reply.annotations, self->getCommittedVersionQuorum.getReadReplicas()); } else if (committedVersion == self->lastSeenVersion) { diff --git a/fdbserver/RocksDBCheckpointUtils.actor.cpp b/fdbserver/RocksDBCheckpointUtils.actor.cpp index 53f41085e8..006a67aefc 100644 --- a/fdbserver/RocksDBCheckpointUtils.actor.cpp +++ b/fdbserver/RocksDBCheckpointUtils.actor.cpp @@ -43,9 +43,9 @@ #include "flow/actorcompiler.h" // has to be last include #ifdef SSD_ROCKSDB_EXPERIMENTAL -// Enforcing rocksdb version to be 6.22.1 or greater. -static_assert(ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR >= 22 && ROCKSDB_PATCH >= 1, - "Unsupported rocksdb version. Update the rocksdb to at least 6.22.1 version"); +// Enforcing rocksdb version to be 7.7.3. 
+static_assert((ROCKSDB_MAJOR == 7 && ROCKSDB_MINOR == 7 && ROCKSDB_PATCH == 3), + "Unsupported rocksdb version. Update the rocksdb to 7.7.3 version"); namespace { diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 0af18bc32f..2cc7941683 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -328,6 +328,13 @@ class TestConfig : public BasicTestConfig { if (attrib == "disableEncryption") { disableEncryption = strcmp(value.c_str(), "true") == 0; } + if (attrib == "encryptModes") { + std::stringstream ss(value); + std::string token; + while (std::getline(ss, token, ',')) { + encryptModes.push_back(token); + } + } if (attrib == "restartInfoLocation") { isFirstTestInRestart = true; } @@ -397,6 +404,9 @@ public: bool disableRemoteKVS = false; // 7.2 cannot be downgraded to 7.1 or below after enabling encryption-at-rest. bool disableEncryption = false; + // By default, encryption mode is set randomly (based on the tenant mode) + // If provided, set using EncryptionAtRestMode::fromString + std::vector encryptModes; // Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig // 0 = "ssd" // 1 = "memory" @@ -474,6 +484,7 @@ public: .add("disableHostname", &disableHostname) .add("disableRemoteKVS", &disableRemoteKVS) .add("disableEncryption", &disableEncryption) + .add("encryptModes", &encryptModes) .add("simpleConfig", &simpleConfig) .add("generateFearless", &generateFearless) .add("datacenters", &datacenters) @@ -1274,6 +1285,7 @@ ACTOR Future restartSimulatedSystem(std::vector>* systemActor g_knobs.setKnob("remote_kv_store", KnobValueRef::create(bool{ false })); TraceEvent(SevDebug, "DisableRemoteKVS"); } + // TODO: Remove this code when encryption knobs are removed if (testConfig->disableEncryption) { g_knobs.setKnob("enable_encryption", KnobValueRef::create(bool{ false })); g_knobs.setKnob("enable_tlog_encryption", KnobValueRef::create(bool{ false })); @@ -2052,6 
+2064,19 @@ void setupSimulatedSystem(std::vector>* systemActors, simconfig.db.tenantMode = tenantMode; simconfig.db.encryptionAtRestMode = EncryptionAtRestMode::DISABLED; + if (!testConfig.encryptModes.empty()) { + simconfig.db.encryptionAtRestMode = + EncryptionAtRestMode::fromString(deterministicRandom()->randomChoice(testConfig.encryptModes)); + } else if (!testConfig.disableEncryption && deterministicRandom()->coinflip()) { + if (tenantMode == TenantMode::DISABLED || tenantMode == TenantMode::OPTIONAL_TENANT || + deterministicRandom()->coinflip()) { + // optional and disabled tenant modes currently only support cluster aware encryption + simconfig.db.encryptionAtRestMode = EncryptionAtRestMode::CLUSTER_AWARE; + } else { + simconfig.db.encryptionAtRestMode = EncryptionAtRestMode::DOMAIN_AWARE; + } + } + TraceEvent("SimulatedClusterEncryptionMode").detail("Mode", simconfig.db.encryptionAtRestMode.toString()); g_simulator->blobGranulesEnabled = simconfig.db.blobGranulesEnabled; @@ -2065,6 +2090,7 @@ void setupSimulatedSystem(std::vector>* systemActors, g_knobs.setKnob("remote_kv_store", KnobValueRef::create(bool{ false })); TraceEvent(SevDebug, "DisableRemoteKVS"); } + // TODO: Remove this code once encryption knobs are removed if (testConfig.disableEncryption) { g_knobs.setKnob("enable_encryption", KnobValueRef::create(bool{ false })); g_knobs.setKnob("enable_tlog_encryption", KnobValueRef::create(bool{ false })); diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index ea973496ac..911d940dd6 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -19,6 +19,8 @@ */ #include +#include "fdbclient/BlobGranuleCommon.h" +#include "fdbserver/BlobGranuleServerCommon.actor.h" #include "fmt/format.h" #include "fdbclient/BackupAgent.actor.h" #include "fdbclient/BlobWorkerInterface.h" @@ -2443,6 +2445,47 @@ ACTOR static Future blobWorkerStatusFetcher( return statusObj; } +ACTOR static Future blobRestoreStatusFetcher(Database db, 
std::set* incompleteReason) { + + state JsonBuilderObject statusObj; + state std::vector>> futures; + + try { + Optional status = wait(getRestoreStatus(db, normalKeys)); + if (status.present()) { + switch (status.get().phase) { + case BlobRestorePhase::INIT: + statusObj["blob_full_restore_phase"] = "Initializing"; + break; + case BlobRestorePhase::LOAD_MANIFEST: + statusObj["blob_full_restore_phase"] = "Loading manifest"; + break; + case BlobRestorePhase::MANIFEST_DONE: + statusObj["blob_full_restore_phase"] = "Manifest loaded"; + break; + case BlobRestorePhase::MIGRATE: + statusObj["blob_full_restore_phase"] = "Copying data"; + statusObj["blob_full_restore_progress"] = status.get().progress; + break; + case BlobRestorePhase::APPLY_MLOGS: + statusObj["blob_full_restore_phase"] = "Applying mutation logs"; + statusObj["blob_full_restore_progress"] = status.get().progress; + break; + case BlobRestorePhase::DONE: + statusObj["blob_full_restore_phase"] = "Completed"; + break; + default: + statusObj["blob_full_restore_phase"] = "Unexpected phase"; + } + } + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) + throw; + incompleteReason->insert("Unable to query blob restore status"); + } + return statusObj; +} + static JsonBuilderObject tlogFetcher(int* logFaultTolerance, const std::vector& tLogs, std::unordered_map const& address_workers) { @@ -3409,6 +3452,8 @@ ACTOR Future clusterGetStatus( JsonBuilderObject blobGranuelsStatus = wait(blobWorkerStatusFetcher(blobWorkers, address_workers, &status_incomplete_reasons)); statusObj["blob_granules"] = blobGranuelsStatus; + JsonBuilderObject blobRestoreStatus = wait(blobRestoreStatusFetcher(cx, &status_incomplete_reasons)); + statusObj["blob_restore"] = blobRestoreStatus; } JsonBuilderArray incompatibleConnectionsArray; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 793f01628e..3ec15bda8f 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ 
-375,7 +375,7 @@ struct TLogData : NonCopyable { peekMemoryLimiter(SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES), concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS), ignorePopDeadline(0), dataFolder(folder), degraded(degraded), - commitLatencyDist(Histogram::getHistogram("tLog"_sr, "commit"_sr, Histogram::Unit::microseconds)) { + commitLatencyDist(Histogram::getHistogram("tLog"_sr, "commit"_sr, Histogram::Unit::milliseconds)) { cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True); } }; @@ -1098,7 +1098,7 @@ ACTOR Future updatePersistentData(TLogData* self, Reference logDa } // SOMEDAY: This seems to be running pretty often, should we slow it down??? // This needs a timeout since nothing prevents I/O operations from hanging indefinitely. - wait(ioTimeoutError(self->persistentData->commit(), tLogMaxCreateDuration)); + wait(ioTimeoutError(self->persistentData->commit(), tLogMaxCreateDuration, "TLogCommit")); wait(delay(0, TaskPriority::UpdateStorage)); @@ -2160,7 +2160,7 @@ ACTOR Future doQueueCommit(TLogData* self, self->largeDiskQueueCommitBytes.set(false); wait(ioDegradedOrTimeoutError( - c, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME, self->degraded, SERVER_KNOBS->TLOG_DEGRADED_DURATION)); + c, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME, self->degraded, SERVER_KNOBS->TLOG_DEGRADED_DURATION, "TLogCommit")); if (g_network->isSimulated() && !g_simulator->speedUpSimulation && BUGGIFY_WITH_PROB(0.0001)) { wait(delay(6.0)); } @@ -3464,7 +3464,8 @@ ACTOR Future tLogStart(TLogData* self, InitializeTLogRequest req, Locality logData->unpoppedRecoveredTagCount = req.allTags.size(); logData->unpoppedRecoveredTags = std::set(req.allTags.begin(), req.allTags.end()); wait(ioTimeoutError(initPersistentState(self, logData) || logData->removed, - SERVER_KNOBS->TLOG_MAX_CREATE_DURATION)); + SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, + "TLogInit")); TraceEvent("TLogRecover", self->dbgid) .detail("LogId", logData->logId) @@ -3529,7 +3530,8 @@ 
ACTOR Future tLogStart(TLogData* self, InitializeTLogRequest req, Locality } else { // Brand new tlog, initialization has already been done by caller wait(ioTimeoutError(initPersistentState(self, logData) || logData->removed, - SERVER_KNOBS->TLOG_MAX_CREATE_DURATION)); + SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, + "TLogInit")); if (logData->recoveryComplete.isSet()) { throw worker_removed(); @@ -3600,13 +3602,14 @@ ACTOR Future tLog(IKeyValueStore* persistentData, TraceEvent("SharedTlog", tlogId); try { - wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION)); + wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, "TLogInit")); if (restoreFromDisk) { wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests)); } else { wait(ioTimeoutError(checkEmptyQueue(&self) && initPersistentStorage(&self), - SERVER_KNOBS->TLOG_MAX_CREATE_DURATION)); + SERVER_KNOBS->TLOG_MAX_CREATE_DURATION, + "TLogInit")); } // Disk errors need a chance to kill this actor. 
diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index bacbe8ac66..01b31d7433 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -546,7 +546,7 @@ Future TagPartitionedLogSystem::push(Version prevVersion, it->tlogPushDistTrackers.push_back( Histogram::getHistogram("ToTlog_" + it->logServers[i]->get().interf().uniqueID.toString(), it->logServers[i]->get().interf().address().toString(), - Histogram::Unit::microseconds)); + Histogram::Unit::milliseconds)); } } std::vector> tLogCommitResults; diff --git a/fdbserver/TenantCache.actor.cpp b/fdbserver/TenantCache.actor.cpp index 100ae28682..3025f5cf94 100644 --- a/fdbserver/TenantCache.actor.cpp +++ b/fdbserver/TenantCache.actor.cpp @@ -124,9 +124,17 @@ public: state int refreshInterval = SERVER_KNOBS->TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL; state double lastTenantListFetchTime = now(); + state double lastTraceTime = 0; loop { state double fetchStartTime = now(); + + state bool toTrace = false; + if (fetchStartTime - lastTraceTime > SERVER_KNOBS->TENANT_CACHE_STORAGE_USAGE_TRACE_INTERVAL) { + toTrace = true; + lastTraceTime = fetchStartTime; + } + state std::vector groups; for (const auto& [group, storage] : tenantCache->tenantStorageMap) { groups.push_back(group); @@ -159,6 +167,14 @@ public: } } tenantCache->tenantStorageMap[group].usage = usage; + + if (toTrace) { + // Trace the storage used by all tenant groups for visibility. 
+ TraceEvent(SevInfo, "StorageUsageUpdated", tenantCache->id()) + .detail("TenantGroup", group) + .detail("Quota", tenantCache->tenantStorageMap[group].quota) + .detail("Usage", tenantCache->tenantStorageMap[group].usage); + } } lastTenantListFetchTime = now(); diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 2ff0008695..c8fe984a86 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -459,7 +459,13 @@ public: // Since cursors can have async operations pending which modify their state they can't be copied cleanly Cursor(const Cursor& other) = delete; - ~Cursor() { writeOperations.cancel(); } + ~Cursor() { cancel(); } + + // Cancel outstanding operations. Further use of cursor is not allowed. + void cancel() { + nextPageReader.cancel(); + writeOperations.cancel(); + } // A read cursor can be initialized from a pop cursor void initReadOnly(const Cursor& c, bool readExtents = false) { @@ -921,7 +927,15 @@ public: public: FIFOQueue() : pager(nullptr) {} - ~FIFOQueue() { newTailPage.cancel(); } + ~FIFOQueue() { cancel(); } + + // Cancel outstanding operations. Further use of queue is not allowed. 
+ void cancel() { + headReader.cancel(); + tailWriter.cancel(); + headWriter.cancel(); + newTailPage.cancel(); + } FIFOQueue(const FIFOQueue& other) = delete; void operator=(const FIFOQueue& rhs) = delete; @@ -3627,6 +3641,13 @@ public: } self->operations.clear(); + debug_printf("DWALPager(%s) shutdown cancel queues\n", self->filename.c_str()); + self->freeList.cancel(); + self->delayedFreeList.cancel(); + self->remapQueue.cancel(); + self->extentFreeList.cancel(); + self->extentUsedList.cancel(); + debug_printf("DWALPager(%s) shutdown destroy page cache\n", self->filename.c_str()); wait(self->extentCache.clear()); wait(self->pageCache.clear()); @@ -4697,21 +4718,15 @@ public: if (domainId.present()) { ASSERT(keyProvider && keyProvider->enableEncryptionDomain()); - // Temporarily disabling the check, since if a tenant is removed, where the key provider - // would not find the domain, the data for the tenant may still be in Redwood and being read. - // TODO(yiwu): re-enable the check. - /* - if (domainId.get() != keyProvider->getDefaultEncryptionDomainId() && - !keyProvider->keyFitsInDomain(domainId.get(), lowerBound, false)) { - fprintf(stderr, - "Page lower bound not in domain: %s %s, domain id %s, lower bound '%s'\n", - ::toString(id).c_str(), - ::toString(v).c_str(), - ::toString(domainId).c_str(), - lowerBound.printable().c_str()); - return false; + if (!keyProvider->keyFitsInDomain(domainId.get(), lowerBound, true)) { + fprintf(stderr, + "Page lower bound not in domain: %s %s, domain id %s, lower bound '%s'\n", + ::toString(id).c_str(), + ::toString(v).c_str(), + ::toString(domainId).c_str(), + lowerBound.printable().c_str()); + return false; } - */ } auto& b = boundariesByPageID[id.front()][v]; @@ -4759,45 +4774,27 @@ public: ::toString(b->second.domainId).c_str()); return false; } - // Temporarily disabling the check, since if a tenant is removed, where the key provider - // would not find the domain, the data for the tenant may still be in Redwood and being 
read. - // TODO(yiwu): re-enable the check. - /* ASSERT(domainId.present()); auto checkKeyFitsInDomain = [&]() -> bool { - if (!keyProvider->keyFitsInDomain(domainId.get(), cursor.get().key, b->second.height > 1)) { - fprintf(stderr, - "Encryption domain mismatch on %s, %s, domain: %s, key %s\n", - ::toString(id).c_str(), - ::toString(v).c_str(), - ::toString(domainId).c_str(), - cursor.get().key.printable().c_str()); - return false; - } - return true; + if (!keyProvider->keyFitsInDomain(domainId.get(), cursor.get().key, b->second.height > 1)) { + fprintf(stderr, + "Encryption domain mismatch on %s, %s, domain: %s, key %s\n", + ::toString(id).c_str(), + ::toString(v).c_str(), + ::toString(domainId).c_str(), + cursor.get().key.printable().c_str()); + return false; + } + return true; }; - if (domainId.get() != keyProvider->getDefaultEncryptionDomainId()) { - cursor.moveFirst(); - if (cursor.valid() && !checkKeyFitsInDomain()) { - return false; - } - cursor.moveLast(); - if (cursor.valid() && !checkKeyFitsInDomain()) { - return false; - } - } else { - if (deterministicRandom()->random01() < domainPrefixScanProbability) { - cursor.moveFirst(); - while (cursor.valid()) { - if (!checkKeyFitsInDomain()) { - return false; - } - cursor.moveNext(); - } - domainPrefixScanCount++; - } + cursor.moveFirst(); + if (cursor.valid() && !checkKeyFitsInDomain()) { + return false; + } + cursor.moveLast(); + if (cursor.valid() && !checkKeyFitsInDomain()) { + return false; } - */ } return true; @@ -5674,8 +5671,8 @@ private: int64_t defaultDomainId = keyProvider->getDefaultEncryptionDomainId(); int64_t currentDomainId; size_t prefixLength; - if (count == 0 || (splitByDomain && count > 0)) { - std::tie(currentDomainId, prefixLength) = keyProvider->getEncryptionDomain(rec.key, domainId); + if (count == 0 || splitByDomain) { + std::tie(currentDomainId, prefixLength) = keyProvider->getEncryptionDomain(rec.key); } if (count == 0) { domainId = currentDomainId; @@ -5886,12 +5883,18 @@ private: 
if (useEncryptionDomain) { ASSERT(pagesToBuild[0].domainId.present()); int64_t domainId = pagesToBuild[0].domainId.get(); - // We need to make sure we use the domain prefix as the page lower bound, for the first page - // of a non-default domain on a level. That way we ensure that pages for a domain form a full subtree - // (i.e. have a single root) in the B-tree. - if (domainId != self->m_keyProvider->getDefaultEncryptionDomainId() && - !self->m_keyProvider->keyFitsInDomain(domainId, pageLowerBound.key, false)) { - pageLowerBound = RedwoodRecordRef(entries[0].key.substr(0, pagesToBuild[0].domainPrefixLength)); + // We make sure the page lower bound fits in the domain of the page. + // If the page domain is the default domain, we make sure the page doesn't fall within a domain + // specific subtree. + // If the page domain is non-default, in addition, we make the first page of the domain on a level + // use the domain prefix as the lower bound. Such a lower bound will ensure that pages for a domain + // form a full subtree (i.e. have a single root) in the B-tree. 
+ if (!self->m_keyProvider->keyFitsInDomain(domainId, pageLowerBound.key, true)) { + if (domainId == self->m_keyProvider->getDefaultEncryptionDomainId()) { + pageLowerBound = RedwoodRecordRef(entries[0].key); + } else { + pageLowerBound = RedwoodRecordRef(entries[0].key.substr(0, pagesToBuild[0].domainPrefixLength)); + } } } diff --git a/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h b/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h index 4e39e53ee9..6a37876313 100644 --- a/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h +++ b/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h @@ -163,7 +163,8 @@ ACTOR Future printRestoreSummary(Database db, Reference listBlobGranules(Database db, Reference blobConn); ACTOR Future lastBlobEpoc(Database db, Reference blobConn); ACTOR Future isFullRestoreMode(Database db, KeyRangeRef range); - +ACTOR Future updateRestoreStatus(Database db, KeyRangeRef range, BlobRestoreStatus status); +ACTOR Future> getRestoreStatus(Database db, KeyRangeRef range); #include "flow/unactorcompiler.h" #endif diff --git a/fdbserver/include/fdbserver/ClusterController.actor.h b/fdbserver/include/fdbserver/ClusterController.actor.h index a4ceea4592..a0079e584a 100644 --- a/fdbserver/include/fdbserver/ClusterController.actor.h +++ b/fdbserver/include/fdbserver/ClusterController.actor.h @@ -920,7 +920,7 @@ public: } if (fitness == ProcessClass::NeverAssign) { logWorkerUnavailable( - SevDebug, id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + SevDebug, id, "simple", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); continue; } if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) { @@ -1072,7 +1072,7 @@ public: } if (fitness == ProcessClass::NeverAssign) { logWorkerUnavailable( - SevDebug, id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + SevDebug, id, "deprecated", "Worker's fitness is NeverAssign", 
worker_details, fitness, dcIds); continue; } if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) { diff --git a/fdbserver/include/fdbserver/ConfigFollowerInterface.h b/fdbserver/include/fdbserver/ConfigFollowerInterface.h index aef438255c..ed1e263573 100644 --- a/fdbserver/include/fdbserver/ConfigFollowerInterface.h +++ b/fdbserver/include/fdbserver/ConfigFollowerInterface.h @@ -110,8 +110,7 @@ struct ConfigFollowerGetChangesReply { Standalone> annotations; ConfigFollowerGetChangesReply() = default; - explicit ConfigFollowerGetChangesReply(Version mostRecentVersion, - Standalone> const& changes, + explicit ConfigFollowerGetChangesReply(Standalone> const& changes, Standalone> const& annotations) : changes(changes), annotations(annotations) {} diff --git a/fdbserver/include/fdbserver/DataDistribution.actor.h b/fdbserver/include/fdbserver/DataDistribution.actor.h index 9054ab55a3..40143c3109 100644 --- a/fdbserver/include/fdbserver/DataDistribution.actor.h +++ b/fdbserver/include/fdbserver/DataDistribution.actor.h @@ -284,12 +284,12 @@ public: const std::unordered_set& excludedPhysicalShards, uint64_t debugID); - // Step 2: get a remote team which has the input physical shard - // Return empty if no such remote team - // May return a problematic remote team, and re-selection is required for this case - Optional tryGetAvailableRemoteTeamWith(uint64_t inputPhysicalShardID, - StorageMetrics const& moveInMetrics, - uint64_t debugID); + // Step 2: get a remote team which has the input physical shard. + // Second field in the returned pair indicates whether this physical shard is available or not. + // Return empty if no such remote team. + // May return a problematic remote team, and re-selection is required for this case. 
+ std::pair, bool> + tryGetAvailableRemoteTeamWith(uint64_t inputPhysicalShardID, StorageMetrics const& moveInMetrics, uint64_t debugID); // Invariant: // (1) If forceToUseNewPhysicalShard is set, use the bestTeams selected by getTeam(), and create a new physical // shard for the teams diff --git a/fdbserver/include/fdbserver/IPageEncryptionKeyProvider.actor.h b/fdbserver/include/fdbserver/IPageEncryptionKeyProvider.actor.h index aa8a2c1e6b..ae2d52e113 100644 --- a/fdbserver/include/fdbserver/IPageEncryptionKeyProvider.actor.h +++ b/fdbserver/include/fdbserver/IPageEncryptionKeyProvider.actor.h @@ -90,21 +90,11 @@ public: virtual int64_t getDefaultEncryptionDomainId() const { throw not_implemented(); } // Get encryption domain from a key. Return the domain id, and the size of the encryption domain prefix. - // It is assumed that all keys with the same encryption domain prefix as the given key falls in the same encryption - // domain. If possibleDomainId is given, it is a valid domain id previously returned by the key provider, - // potentially for a different key. The possibleDomainId parm is used by TenantAwareEncryptionKeyProvider to speed - // up encryption domain lookup. - virtual std::tuple getEncryptionDomain(const KeyRef& key, - Optional possibleDomainId = Optional()) { - throw not_implemented(); - } + virtual std::tuple getEncryptionDomain(const KeyRef& key) { throw not_implemented(); } // Get encryption domain of a page given encoding header. virtual int64_t getEncryptionDomainIdFromHeader(const void* encodingHeader) { throw not_implemented(); } - // Setting tenant prefix to tenant name map. Used by TenantAwareEncryptionKeyProvider. - virtual void setTenantPrefixIndex(Reference tenantPrefixIndex) {} - // Helper methods. // Check if a key fits in an encryption domain. 
@@ -220,7 +210,7 @@ public: int64_t getDefaultEncryptionDomainId() const override { return FDB_DEFAULT_ENCRYPT_DOMAIN_ID; } - std::tuple getEncryptionDomain(const KeyRef& key, Optional) override { + std::tuple getEncryptionDomain(const KeyRef& key) override { int64_t domainId; if (key.size() < PREFIX_LENGTH) { domainId = getDefaultEncryptionDomainId(); @@ -291,6 +281,8 @@ class TenantAwareEncryptionKeyProvider : public IPageEncryptionKeyProvider { public: using EncodingHeader = ArenaPage::AESEncryptionV1Encoder::Header; + const StringRef systemKeysPrefix = systemKeys.begin; + TenantAwareEncryptionKeyProvider(Reference const> db) : db(db) {} virtual ~TenantAwareEncryptionKeyProvider() = default; @@ -337,10 +329,10 @@ public: int64_t getDefaultEncryptionDomainId() const override { return FDB_DEFAULT_ENCRYPT_DOMAIN_ID; } - std::tuple getEncryptionDomain(const KeyRef& key, Optional possibleDomainId) override { + std::tuple getEncryptionDomain(const KeyRef& key) override { // System key. - if (key.startsWith(systemKeys.begin)) { - return { SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, 2 }; + if (key.startsWith(systemKeysPrefix)) { + return { SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, systemKeysPrefix.size() }; } // Key smaller than tenant prefix in size belongs to the default domain. if (key.size() < TENANT_PREFIX_SIZE) { @@ -352,21 +344,7 @@ public: if (tenantId < 0) { return { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, 0 }; } - // Optimization: Caller guarantee possibleDomainId is a valid domain id that we previously returned. - // We can return immediately without checking with tenant map. - if (possibleDomainId.present() && possibleDomainId.get() == tenantId) { - return { tenantId, TENANT_PREFIX_SIZE }; - } - if (tenantPrefixIndex.isValid()) { - auto view = tenantPrefixIndex->atLatest(); - auto itr = view.find(prefix); - if (itr != view.end()) { - // Tenant not found. Tenant must be disabled, or in optional mode. 
- return { tenantId, TENANT_PREFIX_SIZE }; - } - } - // The prefix does not belong to any tenant. The key belongs to the default domain. - return { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, 0 }; + return { tenantId, TENANT_PREFIX_SIZE }; } int64_t getEncryptionDomainIdFromHeader(const void* encodingHeader) override { @@ -375,13 +353,8 @@ public: return header->cipherTextDetails.encryptDomainId; } - void setTenantPrefixIndex(Reference tenantPrefixIndex) override { - this->tenantPrefixIndex = tenantPrefixIndex; - } - private: Reference const> db; - Reference tenantPrefixIndex; }; #include "flow/unactorcompiler.h" diff --git a/fdbserver/include/fdbserver/MoveKeys.actor.h b/fdbserver/include/fdbserver/MoveKeys.actor.h index ed027a29fa..5c5c929df5 100644 --- a/fdbserver/include/fdbserver/MoveKeys.actor.h +++ b/fdbserver/include/fdbserver/MoveKeys.actor.h @@ -58,7 +58,12 @@ public: struct MoveKeysParams { UID dataMoveId; - KeyRange keys; + + // Only one of `keys` and `ranges` can be set. `ranges` is created mainly for physical shard moves to move a full + // physical shard with multiple key ranges. 
+ Optional keys; + Optional> ranges; + std::vector destinationTeam, healthyDestinations; MoveKeysLock lock; Promise dataMovementComplete; @@ -68,6 +73,46 @@ struct MoveKeysParams { UID relocationIntervalId; const DDEnabledState* ddEnabledState = nullptr; CancelConflictingDataMoves cancelConflictingDataMoves = CancelConflictingDataMoves::False; + + MoveKeysParams() {} + + MoveKeysParams(UID dataMoveId, + const KeyRange& keys, + const std::vector& destinationTeam, + const std::vector& healthyDestinations, + const MoveKeysLock& lock, + const Promise& dataMovementComplete, + FlowLock* startMoveKeysParallelismLock, + FlowLock* finishMoveKeysParallelismLock, + bool hasRemote, + UID relocationIntervalId, + const DDEnabledState* ddEnabledState, + CancelConflictingDataMoves cancelConflictingDataMoves) + : dataMoveId(dataMoveId), keys(keys), destinationTeam(destinationTeam), healthyDestinations(healthyDestinations), + lock(lock), dataMovementComplete(dataMovementComplete), + startMoveKeysParallelismLock(startMoveKeysParallelismLock), + finishMoveKeysParallelismLock(finishMoveKeysParallelismLock), hasRemote(hasRemote), + relocationIntervalId(relocationIntervalId), ddEnabledState(ddEnabledState), + cancelConflictingDataMoves(cancelConflictingDataMoves) {} + + MoveKeysParams(UID dataMoveId, + const std::vector& ranges, + const std::vector& destinationTeam, + const std::vector& healthyDestinations, + const MoveKeysLock& lock, + const Promise& dataMovementComplete, + FlowLock* startMoveKeysParallelismLock, + FlowLock* finishMoveKeysParallelismLock, + bool hasRemote, + UID relocationIntervalId, + const DDEnabledState* ddEnabledState, + CancelConflictingDataMoves cancelConflictingDataMoves) + : dataMoveId(dataMoveId), ranges(ranges), destinationTeam(destinationTeam), + healthyDestinations(healthyDestinations), lock(lock), dataMovementComplete(dataMovementComplete), + startMoveKeysParallelismLock(startMoveKeysParallelismLock), + 
finishMoveKeysParallelismLock(finishMoveKeysParallelismLock), hasRemote(hasRemote), + relocationIntervalId(relocationIntervalId), ddEnabledState(ddEnabledState), + cancelConflictingDataMoves(cancelConflictingDataMoves) {} }; // read the lock value in system keyspace but do not change anything diff --git a/fdbserver/include/fdbserver/ProxyCommitData.actor.h b/fdbserver/include/fdbserver/ProxyCommitData.actor.h index d8db57a650..f40aa64285 100644 --- a/fdbserver/include/fdbserver/ProxyCommitData.actor.h +++ b/fdbserver/include/fdbserver/ProxyCommitData.actor.h @@ -137,16 +137,16 @@ struct ProxyStats { SERVER_KNOBS->LATENCY_SKETCH_ACCURACY), maxComputeNS(0), minComputeNS(1e12), commitBatchQueuingDist( - Histogram::getHistogram("CommitProxy"_sr, "CommitBatchQueuing"_sr, Histogram::Unit::microseconds)), + Histogram::getHistogram("CommitProxy"_sr, "CommitBatchQueuing"_sr, Histogram::Unit::milliseconds)), getCommitVersionDist( - Histogram::getHistogram("CommitProxy"_sr, "GetCommitVersion"_sr, Histogram::Unit::microseconds)), - resolutionDist(Histogram::getHistogram("CommitProxy"_sr, "Resolution"_sr, Histogram::Unit::microseconds)), + Histogram::getHistogram("CommitProxy"_sr, "GetCommitVersion"_sr, Histogram::Unit::milliseconds)), + resolutionDist(Histogram::getHistogram("CommitProxy"_sr, "Resolution"_sr, Histogram::Unit::milliseconds)), postResolutionDist( - Histogram::getHistogram("CommitProxy"_sr, "PostResolutionQueuing"_sr, Histogram::Unit::microseconds)), + Histogram::getHistogram("CommitProxy"_sr, "PostResolutionQueuing"_sr, Histogram::Unit::milliseconds)), processingMutationDist( - Histogram::getHistogram("CommitProxy"_sr, "ProcessingMutation"_sr, Histogram::Unit::microseconds)), - tlogLoggingDist(Histogram::getHistogram("CommitProxy"_sr, "TlogLogging"_sr, Histogram::Unit::microseconds)), - replyCommitDist(Histogram::getHistogram("CommitProxy"_sr, "ReplyCommit"_sr, Histogram::Unit::microseconds)) { + Histogram::getHistogram("CommitProxy"_sr, "ProcessingMutation"_sr, 
Histogram::Unit::milliseconds)), + tlogLoggingDist(Histogram::getHistogram("CommitProxy"_sr, "TlogLogging"_sr, Histogram::Unit::milliseconds)), + replyCommitDist(Histogram::getHistogram("CommitProxy"_sr, "ReplyCommit"_sr, Histogram::Unit::milliseconds)) { specialCounter(cc, "LastAssignedCommitVersion", [this]() { return this->lastCommitVersionAssigned; }); specialCounter(cc, "Version", [pVersion]() { return pVersion->get(); }); specialCounter(cc, "CommittedVersion", [pCommittedVersion]() { return pCommittedVersion->get(); }); diff --git a/fdbserver/include/fdbserver/WorkerInterface.actor.h b/fdbserver/include/fdbserver/WorkerInterface.actor.h index 2755968190..f3af110042 100644 --- a/fdbserver/include/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/include/fdbserver/WorkerInterface.actor.h @@ -1284,7 +1284,7 @@ ACTOR Future tLog(IKeyValueStore* persistentData, typedef decltype(&tLog) TLogFn; ACTOR template -Future ioTimeoutError(Future what, double time) { +Future ioTimeoutError(Future what, double time, const char* context = nullptr) { // Before simulation is sped up, IO operations can take a very long time so limit timeouts // to not end until at least time after simulation is sped up. 
if (g_network->isSimulated() && !g_simulator->speedUpSimulation) { @@ -1298,7 +1298,12 @@ Future ioTimeoutError(Future what, double time) { if (g_network->isSimulated() && !g_simulator->getCurrentProcess()->isReliable()) { err = err.asInjectedFault(); } - TraceEvent(SevError, "IoTimeoutError").error(err); + TraceEvent e(SevError, "IoTimeoutError"); + e.error(err); + if (context != nullptr) { + e.detail("Context", context); + } + e.log(); throw err; } } @@ -1308,7 +1313,8 @@ ACTOR template Future ioDegradedOrTimeoutError(Future what, double errTime, Reference> degraded, - double degradedTime) { + double degradedTime, + const char* context = nullptr) { // Before simulation is sped up, IO operations can take a very long time so limit timeouts // to not end until at least time after simulation is sped up. if (g_network->isSimulated() && !g_simulator->speedUpSimulation) { @@ -1337,7 +1343,12 @@ Future ioDegradedOrTimeoutError(Future what, if (g_network->isSimulated() && !g_simulator->getCurrentProcess()->isReliable()) { err = err.asInjectedFault(); } - TraceEvent(SevError, "IoTimeoutError").error(err); + TraceEvent e(SevError, "IoTimeoutError"); + e.error(err); + if (context != nullptr) { + e.detail("Context", context); + } + e.log(); throw err; } } diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 91a68cf6a9..3af9a5b905 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -159,8 +159,7 @@ bool canReplyWith(Error e) { #define PERSIST_PREFIX "\xff\xff" -FDB_DECLARE_BOOLEAN_PARAM(UnlimitedCommitBytes); -FDB_DEFINE_BOOLEAN_PARAM(UnlimitedCommitBytes); +FDB_BOOLEAN_PARAM(UnlimitedCommitBytes); // Immutable static const KeyValueRef persistFormat(PERSIST_PREFIX "Format"_sr, "FoundationDB/StorageServer/1/4"_sr); @@ -786,7 +785,7 @@ public: std::map> pendingCheckpoints; // Pending checkpoint requests std::unordered_map checkpoints; // Existing and deleting checkpoints TenantMap tenantMap; - Reference 
tenantPrefixIndex; + TenantPrefixIndex tenantPrefixIndex; std::map> pendingAddRanges; // Pending requests to add ranges to physical shards std::map> @@ -805,7 +804,7 @@ public: FetchKeysHistograms() : latency(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, FETCH_KEYS_LATENCY_HISTOGRAM, - Histogram::Unit::microseconds)), + Histogram::Unit::milliseconds)), bytes(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, FETCH_KEYS_BYTES_HISTOGRAM, Histogram::Unit::bytes)), @@ -1369,31 +1368,31 @@ public: Reference const> const& db, StorageServerInterface const& ssi, Reference encryptionKeyProvider) - : tenantPrefixIndex(makeReference()), encryptionKeyProvider(encryptionKeyProvider), - shardAware(false), tlogCursorReadsLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, - TLOG_CURSOR_READS_LATENCY_HISTOGRAM, - Histogram::Unit::microseconds)), + : encryptionKeyProvider(encryptionKeyProvider), shardAware(false), + tlogCursorReadsLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, + TLOG_CURSOR_READS_LATENCY_HISTOGRAM, + Histogram::Unit::milliseconds)), ssVersionLockLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, SS_VERSION_LOCK_LATENCY_HISTOGRAM, - Histogram::Unit::microseconds)), + Histogram::Unit::milliseconds)), eagerReadsLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, EAGER_READS_LATENCY_HISTOGRAM, - Histogram::Unit::microseconds)), + Histogram::Unit::milliseconds)), fetchKeysPTreeUpdatesLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, FETCH_KEYS_PTREE_UPDATES_LATENCY_HISTOGRAM, - Histogram::Unit::microseconds)), + Histogram::Unit::milliseconds)), tLogMsgsPTreeUpdatesLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, TLOG_MSGS_PTREE_UPDATES_LATENCY_HISTOGRAM, - Histogram::Unit::microseconds)), + Histogram::Unit::milliseconds)), storageUpdatesDurableLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, 
STORAGE_UPDATES_DURABLE_LATENCY_HISTOGRAM, - Histogram::Unit::microseconds)), + Histogram::Unit::milliseconds)), storageCommitLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, STORAGE_COMMIT_LATENCY_HISTOGRAM, - Histogram::Unit::microseconds)), + Histogram::Unit::milliseconds)), ssDurableVersionUpdateLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, SS_DURABLE_VERSION_UPDATE_LATENCY_HISTOGRAM, - Histogram::Unit::microseconds)), + Histogram::Unit::milliseconds)), readRangeBytesReturnedHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, SS_READ_RANGE_BYTES_RETURNED_HISTOGRAM, Histogram::Unit::bytes)), @@ -5111,7 +5110,7 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe throw tenant_name_required(); } - if (rangeIntersectsAnyTenant(*(data->tenantPrefixIndex), KeyRangeRef(begin, end), req.version)) { + if (rangeIntersectsAnyTenant(data->tenantPrefixIndex, KeyRangeRef(begin, end), req.version)) { throw tenant_name_required(); } } @@ -8616,11 +8615,11 @@ private: bool StorageServer::insertTenant(TenantNameRef tenantName, TenantMapEntry tenantEntry, Version version) { if (version >= tenantMap.getLatestVersion()) { tenantMap.createNewVersion(version); - tenantPrefixIndex->createNewVersion(version); + tenantPrefixIndex.createNewVersion(version); tenantMap.insert(tenantName, tenantEntry); - auto view = tenantPrefixIndex->at(version); + auto view = tenantPrefixIndex.at(version); auto itr = view.find(tenantEntry.prefix); TenantNameUniqueSet nameSet; if (itr != view.end()) { @@ -8628,7 +8627,7 @@ bool StorageServer::insertTenant(TenantNameRef tenantName, TenantMapEntry tenant } nameSet.insert(tenantName); - tenantPrefixIndex->insert(tenantEntry.prefix, nameSet); + tenantPrefixIndex.insert(tenantEntry.prefix, nameSet); TraceEvent("InsertTenant", thisServerID).detail("Tenant", tenantName).detail("Version", version); return true; @@ -8648,20 +8647,20 @@ void 
StorageServer::insertTenant(TenantNameRef tenantName, ValueRef value, Versi void StorageServer::clearTenants(TenantNameRef startTenant, TenantNameRef endTenant, Version version) { if (version >= tenantMap.getLatestVersion()) { tenantMap.createNewVersion(version); - tenantPrefixIndex->createNewVersion(version); + tenantPrefixIndex.createNewVersion(version); auto view = tenantMap.at(version); for (auto itr = view.lower_bound(startTenant); itr != view.lower_bound(endTenant); ++itr) { - auto indexView = tenantPrefixIndex->at(version); + auto indexView = tenantPrefixIndex.at(version); // Trigger any watches on the prefix associated with the tenant. watches.triggerRange(itr->prefix, strinc(itr->prefix)); auto indexItr = indexView.find(itr->prefix); ASSERT(indexItr != indexView.end()); TenantNameUniqueSet nameSet = *indexItr; if (nameSet.remove(itr.key())) { - tenantPrefixIndex->erase(itr->prefix); + tenantPrefixIndex.erase(itr->prefix); } else { - tenantPrefixIndex->insert(itr->prefix, nameSet); + tenantPrefixIndex.insert(itr->prefix, nameSet); } TraceEvent("EraseTenant", thisServerID).detail("Tenant", itr.key()).detail("Version", version); } @@ -9348,7 +9347,7 @@ ACTOR Future updateStorage(StorageServer* data) { newOldestVersion, desiredVersion, bytesLeft, unlimitedCommitBytes); if (data->tenantMap.getLatestVersion() < newOldestVersion) { data->tenantMap.createNewVersion(newOldestVersion); - data->tenantPrefixIndex->createNewVersion(newOldestVersion); + data->tenantPrefixIndex.createNewVersion(newOldestVersion); } // We want to forget things from these data structures atomically with changing oldestVersion (and "before", // since oldestVersion.set() may trigger waiting actors) forgetVersionsBeforeAsync visibly forgets @@ -9356,7 +9355,7 @@ ACTOR Future updateStorage(StorageServer* data) { Future finishedForgetting = data->mutableData().forgetVersionsBeforeAsync(newOldestVersion, TaskPriority::UpdateStorage) && data->tenantMap.forgetVersionsBeforeAsync(newOldestVersion, 
TaskPriority::UpdateStorage) && - data->tenantPrefixIndex->forgetVersionsBeforeAsync(newOldestVersion, TaskPriority::UpdateStorage); + data->tenantPrefixIndex.forgetVersionsBeforeAsync(newOldestVersion, TaskPriority::UpdateStorage); data->oldestVersion.set(newOldestVersion); wait(finishedForgetting); wait(yield(TaskPriority::UpdateStorage)); @@ -9468,7 +9467,7 @@ ACTOR Future updateStorage(StorageServer* data) { durableDelay = delay(SERVER_KNOBS->STORAGE_COMMIT_INTERVAL, TaskPriority::UpdateStorage); } - wait(ioTimeoutError(durable, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME)); + wait(ioTimeoutError(durable, SERVER_KNOBS->MAX_STORAGE_COMMIT_TIME, "StorageCommit")); data->storageCommitLatencyHistogram->sampleSeconds(now() - beforeStorageCommit); debug_advanceMinCommittedVersion(data->thisServerID, data->storageMinRecoverVersion); @@ -10165,7 +10164,7 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor data->tenantMap.insert(tenantName, tenantEntry); - auto view = data->tenantPrefixIndex->at(version); + auto view = data->tenantPrefixIndex.at(version); auto itr = view.find(tenantEntry.prefix); TenantNameUniqueSet nameSet; if (itr != view.end()) { @@ -10173,7 +10172,7 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor } nameSet.insert(tenantName); - data->tenantPrefixIndex->insert(tenantEntry.prefix, nameSet); + data->tenantPrefixIndex.insert(tenantEntry.prefix, nameSet); TraceEvent("RestoringTenant", data->thisServerID) .detail("Key", tenantMap[tenantMapLoc].key) @@ -11275,7 +11274,6 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, self.tag = seedTag; } - self.encryptionKeyProvider->setTenantPrefixIndex(self.tenantPrefixIndex); self.storage.makeNewStorageServerDurable(self.shardAware); wait(self.storage.commit()); ++self.counters.kvCommits; @@ -11358,13 +11356,6 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, recovered.send(Void()); return Void(); } - // Pass a reference of tenantPrefixIndex to 
the storage engine to support per-tenant data encryption, - // after the tenant map is recovered in restoreDurableState. In case of a storage server reboot, - // it is possible that the storage engine is still holding a pre-reboot tenantPrefixIndex, and use that - // for its own recovery, before we set the tenantPrefixIndex here. - if (self.encryptionKeyProvider.isValid()) { - self.encryptionKeyProvider->setTenantPrefixIndex(self.tenantPrefixIndex); - } TraceEvent("SSTimeRestoreDurableState", self.thisServerID).detail("TimeTaken", now() - start); // if this is a tss storage file, use that as source of truth for this server being a tss instead of the diff --git a/fdbserver/workloads/AuthzSecurity.actor.cpp b/fdbserver/workloads/AuthzSecurity.actor.cpp new file mode 100644 index 0000000000..2443ae309f --- /dev/null +++ b/fdbserver/workloads/AuthzSecurity.actor.cpp @@ -0,0 +1,419 @@ +/* + * AuthzSecurity.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include "flow/Arena.h" +#include "flow/IRandom.h" +#include "flow/Trace.h" +#include "flow/serialize.h" +#include "fdbrpc/simulator.h" +#include "fdbclient/CommitTransaction.h" +#include "fdbclient/FDBOptions.g.h" +#include "fdbserver/LogSystemConfig.h" +#include "fdbclient/NativeAPI.actor.h" +#include "fdbserver/TesterInterface.actor.h" +#include "fdbserver/TLogInterface.h" +#include "fdbserver/workloads/workloads.actor.h" + +#include "flow/actorcompiler.h" // This must be the last #include. + +FDB_BOOLEAN_PARAM(PositiveTestcase); + +struct AuthzSecurityWorkload : TestWorkload { + static constexpr auto NAME = "AuthzSecurity"; + int actorCount; + double testDuration, transactionsPerSecond; + + std::vector> clients; + Arena arena; + TenantName tenant; + TenantName anotherTenant; + Standalone signedToken; + Standalone signedTokenAnotherTenant; + Standalone tLogConfigKey; + PerfIntCounter crossTenantGetPositive, crossTenantGetNegative, crossTenantCommitPositive, crossTenantCommitNegative, + publicNonTenantRequestPositive, tLogReadNegative; + std::vector(Database cx)>> testFunctions; + + AuthzSecurityWorkload(WorkloadContext const& wcx) + : TestWorkload(wcx), crossTenantGetPositive("CrossTenantGetPositive"), + crossTenantGetNegative("CrossTenantGetNegative"), crossTenantCommitPositive("CrossTenantCommitPositive"), + crossTenantCommitNegative("CrossTenantCommitNegative"), + publicNonTenantRequestPositive("PublicNonTenantRequestPositive"), tLogReadNegative("TLogReadNegative") { + testDuration = getOption(options, "testDuration"_sr, 10.0); + transactionsPerSecond = getOption(options, "transactionsPerSecond"_sr, 500.0) / clientCount; + actorCount = getOption(options, "actorsPerClient"_sr, transactionsPerSecond / 5); + tenant = getOption(options, "tenantA"_sr, "authzSecurityTestTenant"_sr); + anotherTenant = getOption(options, "tenantB"_sr, "authzSecurityTestTenant"_sr); + tLogConfigKey = getOption(options, "tLogConfigKey"_sr, 
"TLogInterface"_sr); + ASSERT(g_network->isSimulated()); + // make it comfortably longer than the timeout of the workload + signedToken = g_simulator->makeToken( + tenant, uint64_t(std::lround(getCheckTimeout())) + uint64_t(std::lround(testDuration)) + 100); + signedTokenAnotherTenant = g_simulator->makeToken( + anotherTenant, uint64_t(std::lround(getCheckTimeout())) + uint64_t(std::lround(testDuration)) + 100); + testFunctions.push_back( + [this](Database cx) { return testCrossTenantGetDisallowed(this, cx, PositiveTestcase::True); }); + testFunctions.push_back( + [this](Database cx) { return testCrossTenantGetDisallowed(this, cx, PositiveTestcase::False); }); + testFunctions.push_back( + [this](Database cx) { return testCrossTenantCommitDisallowed(this, cx, PositiveTestcase::True); }); + testFunctions.push_back( + [this](Database cx) { return testCrossTenantCommitDisallowed(this, cx, PositiveTestcase::False); }); + testFunctions.push_back( + [this](Database cx) { return testPublicNonTenantRequestsAllowedWithoutTokens(this, cx); }); + testFunctions.push_back([this](Database cx) { return testTLogReadDisallowed(this, cx); }); + } + + Future setup(Database const& cx) override { return Void(); } + + Future start(Database const& cx) override { + for (int c = 0; c < actorCount; c++) + clients.push_back(timeout(runTestClient(this, cx->clone()), testDuration, Void())); + return waitForAll(clients); + } + + Future check(Database const& cx) override { + int errors = 0; + for (int c = 0; c < clients.size(); c++) + errors += clients[c].isError(); + if (errors) + TraceEvent(SevError, "TestFailure").detail("Reason", "There were client errors."); + clients.clear(); + return errors == 0 && crossTenantGetPositive.getValue() > 0 && crossTenantGetNegative.getValue() > 0 && + crossTenantCommitPositive.getValue() > 0 && crossTenantCommitNegative.getValue() > 0 && + publicNonTenantRequestPositive.getValue() > 0 && tLogReadNegative.getValue() > 0; + } + + void getMetrics(std::vector& m) 
override { + m.push_back(crossTenantGetPositive.getMetric()); + m.push_back(crossTenantGetNegative.getMetric()); + m.push_back(crossTenantCommitPositive.getMetric()); + m.push_back(crossTenantCommitNegative.getMetric()); + m.push_back(publicNonTenantRequestPositive.getMetric()); + m.push_back(tLogReadNegative.getMetric()); + } + + void setAuthToken(Transaction& tr, Standalone token) { + tr.setOption(FDBTransactionOptions::AUTHORIZATION_TOKEN, token); + } + + ACTOR static Future setAndCommitKeyValueAndGetVersion(AuthzSecurityWorkload* self, + Database cx, + TenantName tenant, + Standalone token, + StringRef key, + StringRef value) { + state Transaction tr(cx, tenant); + self->setAuthToken(tr, token); + loop { + try { + tr.set(key, value); + wait(tr.commit()); + return tr.getCommittedVersion(); + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + + ACTOR static Future refreshAndGetCachedLocation(AuthzSecurityWorkload* self, + Database cx, + TenantName tenant, + Standalone token, + StringRef key) { + state Transaction tr(cx, tenant); + self->setAuthToken(tr, token); + loop { + try { + // trigger GetKeyServerLocationsRequest and subsequent cache update + Optional value = wait(tr.get(key)); + (void)value; + auto loc = cx->getCachedLocation(tenant, key); + if (loc.present()) { + return loc.get(); + } else { + wait(delay(0.1)); + } + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + + Standalone randomString() { + auto const minLength = tLogConfigKey.size() + 1; + return StringRef( + deterministicRandom()->randomAlphaNumeric(deterministicRandom()->randomInt(minLength, minLength + 100))); + } + + ACTOR static Future> tryGetValue(AuthzSecurityWorkload* self, + TenantName tenant, + Version committedVersion, + Standalone key, + Optional> expectedValue, + Standalone token, + Database cx, + KeyRangeLocationInfo loc) { + loop { + GetValueRequest req; + req.key = key; + req.version = committedVersion; + req.tenantInfo.tenantId = loc.tenantEntry.id; + 
req.tenantInfo.name = tenant; + req.tenantInfo.token = token; + try { + GetValueReply reply = wait(loadBalance(loc.locations->locations(), + &StorageServerInterface::getValue, + req, + TaskPriority::DefaultPromiseEndpoint, + AtMostOnce::False, + nullptr)); + // test may fail before here, but if it does, the value should match + if (reply.value != expectedValue) { + TraceEvent(SevError, "AuthzSecurityUnmatchedValue") + .detail("Expected", expectedValue) + .detail("Actual", reply.value) + .log(); + } + break; + } catch (Error& e) { + CODE_PROBE(e.code() == error_code_permission_denied, "Cross tenant get meets permission_denied"); + return e; + } + } + return Optional(); + } + + ACTOR static Future testCrossTenantGetDisallowed(AuthzSecurityWorkload* self, + Database cx, + PositiveTestcase positive) { + state Key key = self->randomString(); + state Value value = self->randomString(); + state Version committedVersion = + wait(setAndCommitKeyValueAndGetVersion(self, cx, self->tenant, self->signedToken, key, value)); + // refresh key location cache via get() + KeyRangeLocationInfo loc = wait(refreshAndGetCachedLocation(self, cx, self->tenant, self->signedToken, key)); + if (positive) { + // Supposed to succeed. 
Expected to occasionally fail because of buggify, faultInjection, or data + // distribution, but should not return permission_denied + Optional outcome = wait(tryGetValue(self, + self->tenant, + committedVersion, + key, + value, + self->signedToken /* passing correct token */, + cx, + loc)); + if (!outcome.present()) { + ++self->crossTenantGetPositive; + } else if (outcome.get().code() == error_code_permission_denied) { + TraceEvent(SevError, "AuthzSecurityError") + .detail("Case", "CrossTenantGetDisallowed") + .detail("Subcase", "Positive") + .log(); + } + } else { + Optional outcome = + wait(tryGetValue(self, + self->tenant, + committedVersion, + key, + value, + self->signedTokenAnotherTenant /* deliberately passing bad token */, + cx, + loc)); + // Should always fail. Expected to return permission_denied, but expected to occasionally fail with + // different errors + if (!outcome.present()) { + TraceEvent(SevError, "AuthzSecurityError") + .detail("Case", "CrossTenantGetDisallowed") + .detail("Subcase", "Negative") + .log(); + } else if (outcome.get().code() == error_code_permission_denied) { + ++self->crossTenantGetNegative; + } + } + return Void(); + } + + ACTOR static Future> tryCommit(AuthzSecurityWorkload* self, + TenantName tenant, + Standalone token, + Key key, + Value newValue, + Version readVersion, + Database cx, + KeyRangeLocationInfo loc) { + loop { + auto const& tenantEntry = loc.tenantEntry; + ASSERT(!tenantEntry.prefix.empty()); + state Key prefixedKey = key.withPrefix(tenantEntry.prefix); + CommitTransactionRequest req; + req.transaction.mutations.push_back(req.arena, MutationRef(MutationRef::SetValue, prefixedKey, newValue)); + req.transaction.read_snapshot = readVersion; + req.tenantInfo.name = tenant; + req.tenantInfo.token = token; + req.tenantInfo.tenantId = tenantEntry.id; + try { + CommitID reply = wait(basicLoadBalance(cx->getCommitProxies(UseProvisionalProxies::False), + &CommitProxyInterface::commit, + req, + 
TaskPriority::DefaultPromiseEndpoint, + AtMostOnce::False)); + return Optional(); + } catch (Error& e) { + CODE_PROBE(e.code() == error_code_permission_denied, "Cross tenant commit meets permission_denied"); + return e; + } + } + } + + ACTOR static Future testCrossTenantCommitDisallowed(AuthzSecurityWorkload* self, + Database cx, + PositiveTestcase positive) { + state Key key = self->randomString(); + state Value value = self->randomString(); + state Value newValue = self->randomString(); + state Version committedVersion = + wait(setAndCommitKeyValueAndGetVersion(self, cx, self->tenant, self->signedToken, key, value)); + // refresh key location cache to extract tenant prefix + KeyRangeLocationInfo loc = wait(refreshAndGetCachedLocation(self, cx, self->tenant, self->signedToken, key)); + if (positive) { + // Expected to succeed, may occasionally fail + Optional outcome = + wait(tryCommit(self, self->tenant, self->signedToken, key, newValue, committedVersion, cx, loc)); + if (!outcome.present()) { + ++self->crossTenantCommitPositive; + } else if (outcome.get().code() == error_code_permission_denied) { + TraceEvent(SevError, "AuthzSecurityError") + .detail("Case", "CrossTenantCommitDisallowed") + .detail("Subcase", "Positive") + .log(); + } + } else { + Optional outcome = wait(tryCommit( + self, self->tenant, self->signedTokenAnotherTenant, key, newValue, committedVersion, cx, loc)); + if (!outcome.present()) { + TraceEvent(SevError, "AuthzSecurityError") + .detail("Case", "CrossTenantCommitDisallowed") + .detail("Subcase", "Negative") + .log(); + } else if (outcome.get().code() == error_code_permission_denied) { + ++self->crossTenantCommitNegative; + } + } + return Void(); + } + + ACTOR static Future testPublicNonTenantRequestsAllowedWithoutTokens(AuthzSecurityWorkload* self, + Database cx) { + state Transaction tr(cx, self->tenant); + loop { + try { + Version version = wait(tr.getReadVersion()); + (void)version; + ++self->publicNonTenantRequestPositive; + return Void(); + 
} catch (Error& e) { + wait(tr.onError(e)); + } + } + } + + ACTOR static Future testTLogReadDisallowed(AuthzSecurityWorkload* self, Database cx) { + state Key key = self->randomString(); + state Value value = self->randomString(); + state Version committedVersion = + wait(setAndCommitKeyValueAndGetVersion(self, cx, self->tenant, self->signedToken, key, value)); + state Transaction tr(cx, self->tenant); + self->setAuthToken(tr, self->signedToken); + state Optional tLogConfigString; + loop { + try { + Optional value = wait(tr.get(self->tLogConfigKey)); + ASSERT(value.present()); + tLogConfigString = value; + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + ASSERT(tLogConfigString.present()); + state LogSystemConfig logSystemConfig = + ObjectReader::fromStringRef(tLogConfigString.get(), IncludeVersion()); + state std::vector logs = logSystemConfig.allPresentLogs(); + state std::vector>> replies; + for (const auto& log : logs) { + replies.push_back(log.peekMessages.tryGetReply( + TLogPeekRequest(committedVersion, Tag(0, committedVersion), false, false))); + } + wait(waitForAllReady(replies)); + for (auto i = 0u; i < logs.size(); i++) { + const auto& reply = replies[i]; + ASSERT(reply.isValid()); + if (reply.canGet()) { + ErrorOr r = reply.getValue(); + if (!r.isError()) { + const TLogPeekReply& rpcReply = r.get(); + TraceEvent(SevError, "AuthzExpectedErrorNotFound") + .detail("TLogIndex", i) + .detail("Messages", rpcReply.messages.toString()) + .detail("End", rpcReply.end) + .detail("Popped", rpcReply.popped) + .detail("MaxKnownVersion", rpcReply.maxKnownVersion) + .detail("MinKnownCommitVersion", rpcReply.minKnownCommittedVersion) + .detail("Begin", rpcReply.begin) + .detail("OnlySpilled", rpcReply.onlySpilled) + .log(); + } else { + Error e = r.getError(); + if (e.code() == error_code_unauthorized_attempt) { + ++self->tLogReadNegative; + } else if (e.code() != error_code_actor_cancelled && + e.code() != error_code_request_maybe_delivered) { + 
TraceEvent(SevError, "AuthzSecurityUnexpectedError").detail("Error", e.name()).log(); + } + } + } else { + TraceEvent(SevError, "AuthzSecurityUnexpectedError").detail("Error", reply.getError().name()).log(); + } + } + return Void(); + } + + ACTOR static Future runTestClient(AuthzSecurityWorkload* self, Database cx) { + state double lastTime = now(); + state double delay = self->actorCount / self->transactionsPerSecond; + try { + loop { + wait(poisson(&lastTime, delay)); + wait(deterministicRandom()->randomChoice(self->testFunctions)(cx)); + } + } catch (Error& e) { + TraceEvent(SevError, "AuthzSecurityClient").error(e); + throw; + } + } +}; + +WorkloadFactory AuthzSecurityWorkloadFactory(UntrustedMode::True); diff --git a/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp b/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp index 7bb58b77b2..67e5e6558e 100644 --- a/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp +++ b/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp @@ -651,13 +651,43 @@ struct BlobGranuleRangesWorkload : TestWorkload { return Void(); } + ACTOR Future adjacentPurge(Database cx, BlobGranuleRangesWorkload* self, KeyRange range) { + // Create 2 adjacent blobbified regions. 
+ Key midKey = range.begin.withSuffix("mid"_sr); + state KeyRange range1(KeyRangeRef(range.begin, midKey)); + state KeyRange range2(KeyRangeRef(midKey, range.end)); + + state bool setSuccess = false; + wait(store(setSuccess, cx->blobbifyRange(range1, self->tenantName))); + ASSERT(setSuccess); + wait(self->checkRange(cx, self, range1, true)); + wait(store(setSuccess, cx->blobbifyRange(range2, self->tenantName))); + ASSERT(setSuccess); + wait(self->checkRange(cx, self, range2, true)); + + // force purge range + state Key purgeKey; + wait(store(purgeKey, self->versionedForcePurge(cx, range1, self->tenantName))); + wait(cx->waitPurgeGranulesComplete(purgeKey)); + wait(store(purgeKey, self->versionedForcePurge(cx, range2, self->tenantName))); + wait(cx->waitPurgeGranulesComplete(purgeKey)); + + bool unsetSuccess = wait(cx->unblobbifyRange(range, self->tenantName)); + ASSERT(unsetSuccess); + + wait(self->tearDownRangeAfterUnit(cx, self, range)); + + return Void(); + } + enum UnitTestTypes { VERIFY_RANGE_UNIT, VERIFY_RANGE_GAP_UNIT, RANGES_MISALIGNED, BLOBBIFY_IDEMPOTENT, RE_BLOBBIFY, - OP_COUNT = 5 /* keep this last */ + ADJACENT_PURGE, + OP_COUNT = 6 /* keep this last */ }; ACTOR Future blobGranuleRangesUnitTests(Database cx, BlobGranuleRangesWorkload* self) { @@ -699,6 +729,8 @@ struct BlobGranuleRangesWorkload : TestWorkload { wait(self->blobbifyIdempotentUnit(cx, self, range)); } else if (op == RE_BLOBBIFY) { wait(self->reBlobbifyUnit(cx, self, range)); + } else if (op == ADJACENT_PURGE) { + wait(self->adjacentPurge(cx, self, range)); } else { ASSERT(false); } diff --git a/fdbserver/workloads/ClientTransactionProfileCorrectness.actor.cpp b/fdbserver/workloads/ClientTransactionProfileCorrectness.actor.cpp index 21a93436a8..fbee454a9d 100644 --- a/fdbserver/workloads/ClientTransactionProfileCorrectness.actor.cpp +++ b/fdbserver/workloads/ClientTransactionProfileCorrectness.actor.cpp @@ -291,6 +291,7 @@ struct ClientTransactionProfileCorrectnessWorkload : TestWorkload { 
wait(runRYWTransaction(cx, [=](Reference tr) -> Future { tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::RAW_ACCESS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); Tuple rate = Tuple::makeTuple(sampleProbability); Tuple size = Tuple::makeTuple(sizeLimit); diff --git a/fdbserver/workloads/ClientWorkload.actor.cpp b/fdbserver/workloads/ClientWorkload.actor.cpp index 41ce0d745f..e5474734f8 100644 --- a/fdbserver/workloads/ClientWorkload.actor.cpp +++ b/fdbserver/workloads/ClientWorkload.actor.cpp @@ -168,7 +168,7 @@ struct WorkloadProcess { WorkloadProcess(ClientWorkload::CreateWorkload const& childCreator, WorkloadContext const& wcx) : processState(WorkloadProcessState::instance(wcx.clientId)) { - TraceEvent("StartingClinetWorkload", id).detail("OnClientProcess", processState->id); + TraceEvent("StartingClientWorkload", id).detail("OnClientProcess", processState->id); childWorkloadContext.clientCount = wcx.clientCount; childWorkloadContext.clientId = wcx.clientId; childWorkloadContext.ccr = wcx.ccr; diff --git a/fdbserver/workloads/DataLossRecovery.actor.cpp b/fdbserver/workloads/DataLossRecovery.actor.cpp index 7f0073ab72..7166269780 100644 --- a/fdbserver/workloads/DataLossRecovery.actor.cpp +++ b/fdbserver/workloads/DataLossRecovery.actor.cpp @@ -25,6 +25,7 @@ #include "fdbclient/ManagementAPI.actor.h" #include "fdbserver/MoveKeys.actor.h" #include "fdbserver/QuietDatabase.h" +#include "fdbserver/Knobs.h" #include "fdbrpc/simulator.h" #include "fdbserver/workloads/workloads.actor.h" #include "flow/Error.h" @@ -215,19 +216,35 @@ struct DataLossRecoveryWorkload : TestWorkload { moveKeysLock.myOwner = owner; TraceEvent("DataLossRecovery").detail("Phase", "StartMoveKeys"); - wait(moveKeys(cx, - MoveKeysParams{ deterministicRandom()->randomUniqueID(), - keys, - dest, - dest, - moveKeysLock, - Promise(), - &self->startMoveKeysParallelismLock, - &self->finishMoveKeysParallelismLock, - false, - UID(), // 
for logging only - &ddEnabledState, - CancelConflictingDataMoves::True })); + std::unique_ptr params; + if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { + params = std::make_unique(deterministicRandom()->randomUniqueID(), + std::vector{ keys }, + dest, + dest, + moveKeysLock, + Promise(), + &self->startMoveKeysParallelismLock, + &self->finishMoveKeysParallelismLock, + false, + UID(), // for logging only + &ddEnabledState, + CancelConflictingDataMoves::True); + } else { + params = std::make_unique(deterministicRandom()->randomUniqueID(), + keys, + dest, + dest, + moveKeysLock, + Promise(), + &self->startMoveKeysParallelismLock, + &self->finishMoveKeysParallelismLock, + false, + UID(), // for logging only + &ddEnabledState, + CancelConflictingDataMoves::True); + } + wait(moveKeys(cx, *params)); break; } catch (Error& e) { TraceEvent("DataLossRecovery").error(e).detail("Phase", "MoveRangeError"); diff --git a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp index 80248b09b4..96d233f098 100644 --- a/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp +++ b/fdbserver/workloads/IDDTxnProcessorApiCorrectness.actor.cpp @@ -25,6 +25,7 @@ #include "fdbserver/MoveKeys.actor.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/Knobs.h" #include "fdbclient/VersionedMap.h" #include "flow/actorcompiler.h" // This must be the last #include. @@ -262,9 +263,20 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { } void verifyServerKeyDest(MoveKeysParams& params) const { + KeyRangeRef keys; + if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { + ASSERT(params.ranges.present()); + // TODO: make startMoveShards work with multiple ranges. 
+ ASSERT(params.ranges.get().size() == 1); + keys = params.ranges.get().at(0); + } else { + ASSERT(params.keys.present()); + keys = params.keys.get(); + } + // check destination servers for (auto& id : params.destinationTeam) { - ASSERT(mgs->serverIsDestForShard(id, params.keys)); + ASSERT(mgs->serverIsDestForShard(id, keys)); } } ACTOR static Future testRawMovementApi(IDDTxnProcessorApiWorkload* self) { @@ -332,18 +344,33 @@ struct IDDTxnProcessorApiWorkload : TestWorkload { KeyRange keys = self->getRandomKeys(); std::vector destTeam = self->getRandomTeam(); std::sort(destTeam.begin(), destTeam.end()); - return MoveKeysParams{ deterministicRandom()->randomUniqueID(), - keys, - destTeam, - destTeam, - lock, - Promise(), - nullptr, - nullptr, - false, - UID(), - self->ddContext.ddEnabledState.get(), - CancelConflictingDataMoves::True }; + if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { + return MoveKeysParams(deterministicRandom()->randomUniqueID(), + std::vector{ keys }, + destTeam, + destTeam, + lock, + Promise(), + nullptr, + nullptr, + false, + UID(), + self->ddContext.ddEnabledState.get(), + CancelConflictingDataMoves::True); + } else { + return MoveKeysParams(deterministicRandom()->randomUniqueID(), + keys, + destTeam, + destTeam, + lock, + Promise(), + nullptr, + nullptr, + false, + UID(), + self->ddContext.ddEnabledState.get(), + CancelConflictingDataMoves::True); + } } ACTOR static Future testMoveKeys(IDDTxnProcessorApiWorkload* self) { diff --git a/fdbserver/workloads/LeakTLogInterface.actor.cpp b/fdbserver/workloads/LeakTLogInterface.actor.cpp new file mode 100644 index 0000000000..aa4f0a0782 --- /dev/null +++ b/fdbserver/workloads/LeakTLogInterface.actor.cpp @@ -0,0 +1,72 @@ +/* + * LeakTLogInterface.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "fdbclient/TenantManagement.actor.h" +#include "fdbserver/ServerDBInfo.actor.h" +#include "fdbserver/workloads/workloads.actor.h" + +#include "flow/actorcompiler.h" // This must be the last #include. + +struct LeakTLogInterfaceWorkload : TestWorkload { + static constexpr auto NAME = "LeakTLogInterface"; + TenantName tenant; + Standalone fieldName; + double testDuration; + + LeakTLogInterfaceWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + tenant = getOption(options, "tenant"_sr, "DefaultTenant"_sr); + fieldName = getOption(options, "key"_sr, "TLogInterface"_sr); + testDuration = getOption(options, "testDuration"_sr, 10.0); + } + + Future setup(Database const& cx) override { return persistSerializedTLogInterface(this, cx); } + + Future start(Database const& cx) override { return timeout(updateLoop(this, cx), testDuration, Void()); } + Future check(Database const& cx) override { return true; } + virtual void getMetrics(std::vector& m) override {} + + ACTOR static Future persistSerializedTLogInterface(LeakTLogInterfaceWorkload* self, Database cx) { + state Transaction tr(cx, self->tenant); + loop { + ObjectWriter writer(IncludeVersion()); + writer.serialize(self->dbInfo->get().logSystemConfig); + state Standalone logSystemString = writer.toString(); + try { + tr.set(self->fieldName, logSystemString); + wait(tr.commit()); + 
TraceEvent("LeakTLogInterface").detail("BytesWritten", logSystemString.size()).log(); + return Void(); + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + + ACTOR static Future updateLoop(LeakTLogInterfaceWorkload* self, Database cx) { + loop { + wait(self->dbInfo->onChange()); + wait(persistSerializedTLogInterface(self, cx)); + } + } +}; + +WorkloadFactory LeakTLogInterfaceWorkload; diff --git a/fdbserver/workloads/PhysicalShardMove.actor.cpp b/fdbserver/workloads/PhysicalShardMove.actor.cpp index cf78d89923..2b7009ddd9 100644 --- a/fdbserver/workloads/PhysicalShardMove.actor.cpp +++ b/fdbserver/workloads/PhysicalShardMove.actor.cpp @@ -344,17 +344,18 @@ struct PhysicalShardMoveWorkLoad : TestWorkload { TraceEvent("TestMoveShardStartMoveKeys").detail("DataMove", dataMoveId); wait(moveKeys(cx, - MoveKeysParams{ dataMoveId, - keys, - dests, - dests, - moveKeysLock, - Promise(), - &self->startMoveKeysParallelismLock, - &self->finishMoveKeysParallelismLock, - false, - deterministicRandom()->randomUniqueID(), // for logging only - &ddEnabledState })); + MoveKeysParams(dataMoveId, + std::vector{ keys }, + dests, + dests, + moveKeysLock, + Promise(), + &self->startMoveKeysParallelismLock, + &self->finishMoveKeysParallelismLock, + false, + deterministicRandom()->randomUniqueID(), // for logging only + &ddEnabledState, + CancelConflictingDataMoves::False))); break; } catch (Error& e) { if (e.code() == error_code_movekeys_conflict) { diff --git a/fdbserver/workloads/RandomMoveKeys.actor.cpp b/fdbserver/workloads/RandomMoveKeys.actor.cpp index 2ee6c2369f..aaf9450490 100644 --- a/fdbserver/workloads/RandomMoveKeys.actor.cpp +++ b/fdbserver/workloads/RandomMoveKeys.actor.cpp @@ -25,6 +25,7 @@ #include "fdbserver/MoveKeys.actor.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/Knobs.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/QuietDatabase.h" #include "flow/DeterministicRandom.h" @@ -155,19 
+156,35 @@ struct MoveKeysWorkload : FailureInjectionWorkload { try { state Promise signal; state DDEnabledState ddEnabledState; - wait(moveKeys(cx, - MoveKeysParams{ deterministicRandom()->randomUniqueID(), - keys, - destinationTeamIDs, - destinationTeamIDs, - lock, - signal, - &fl1, - &fl2, - false, - relocateShardInterval.pairID, - &ddEnabledState, - CancelConflictingDataMoves::True })); + std::unique_ptr params; + if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) { + params = std::make_unique(deterministicRandom()->randomUniqueID(), + std::vector{ keys }, + destinationTeamIDs, + destinationTeamIDs, + lock, + signal, + &fl1, + &fl2, + false, + relocateShardInterval.pairID, + &ddEnabledState, + CancelConflictingDataMoves::True); + } else { + params = std::make_unique(deterministicRandom()->randomUniqueID(), + keys, + destinationTeamIDs, + destinationTeamIDs, + lock, + signal, + &fl1, + &fl2, + false, + relocateShardInterval.pairID, + &ddEnabledState, + CancelConflictingDataMoves::True); + } + wait(moveKeys(cx, *params)); TraceEvent(relocateShardInterval.end()).detail("Result", "Success"); return Void(); } catch (Error& e) { diff --git a/fdbserver/workloads/SnapTest.actor.cpp b/fdbserver/workloads/SnapTest.actor.cpp index f1256e6f5f..52ea9d9a0c 100644 --- a/fdbserver/workloads/SnapTest.actor.cpp +++ b/fdbserver/workloads/SnapTest.actor.cpp @@ -105,8 +105,9 @@ public: // variables UID snapUID; // UID used for snap name std::string restartInfoLocation; // file location to store the snap restore info int maxRetryCntToRetrieveMessage; // number of retires to do trackLatest - bool skipCheck; // disable check if the exec fails + bool skipCheck = false; // disable check if the exec fails int retryLimit; // -1 if no limit + bool snapSucceeded = false; // When taking snapshot, tracks snapshot success public: // ctor & dtor SnapTestWorkload(WorkloadContext const& wcx) @@ -119,7 +120,6 @@ public: // ctor & dtor maxSnapDelay = getOption(options, "maxSnapDelay"_sr, 25.0); 
testID = getOption(options, "testID"_sr, 0); restartInfoLocation = getOption(options, "restartInfoLocation"_sr, "simfdb/restartInfo.ini"_sr).toString(); - skipCheck = false; retryLimit = getOption(options, "retryLimit"_sr, 5); g_simulator->allowLogSetKills = false; } @@ -137,41 +137,15 @@ public: // workload functions return Void(); } - ACTOR Future _check(Database cx, SnapTestWorkload* self) { - if (self->skipCheck) { - TraceEvent(SevWarnAlways, "SnapCheckIgnored").log(); - return true; - } - state Transaction tr(cx); - // read the key SnapFailedTLog.$UID - loop { - try { - Standalone keyStr = - "\xff/SnapTestFailStatus/"_sr.withSuffix(StringRef(self->snapUID.toString())); - TraceEvent("TestKeyStr").detail("Value", keyStr); - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - Optional val = wait(tr.get(keyStr)); - if (val.present()) { - break; - } - // wait for the key to be written out by TLogs - wait(delay(0.1)); - } catch (Error& e) { - wait(tr.onError(e)); - } - } - return true; - } - Future check(Database const& cx) override { - TraceEvent("SnapTestWorkloadCheck").detail("ClientID", clientId); + TraceEvent("SnapTestWorkloadCheck").detail("ClientID", clientId).detail("TestID", testID); if (clientId != 0) { return true; } - if (this->testID != 5 && this->testID != 6) { - return true; + if (testID == 1) { + return snapSucceeded; } - return _check(cx, this); + return true; } void getMetrics(std::vector& m) override { TraceEvent("SnapTestWorkloadGetMetrics"); } @@ -236,10 +210,6 @@ public: // workload functions wait(status); break; } catch (Error& e) { - if (e.code() == error_code_snap_log_anti_quorum_unsupported) { - snapFailed = true; - break; - } TraceEvent("SnapCreateError").error(e); ++retry; // snap v2 can fail for many reasons, so retry for 5 times and then fail it @@ -258,7 +228,9 @@ public: // workload functions ini.SetValue("RESTORE", "BackupFailed", format("%d", snapFailed).c_str()); ini.SaveFile(self->restartInfoLocation.c_str()); // write the 
snapUID to a file - TraceEvent("SnapshotCreateStatus").detail("Status", !snapFailed ? "Success" : "Failure"); + auto const severity = snapFailed ? SevError : SevInfo; + TraceEvent(severity, "SnapshotCreateStatus").detail("Status", !snapFailed ? "Success" : "Failure"); + self->snapSucceeded = !snapFailed; } else if (self->testID == 2) { // create odd keys after the snapshot wait(self->_create_keys(cx, "snapKey", false /*even*/)); @@ -325,8 +297,7 @@ public: // workload functions wait(status); break; } catch (Error& e) { - if (e.code() == error_code_snap_not_fully_recovered_unsupported || - e.code() == error_code_snap_log_anti_quorum_unsupported) { + if (e.code() == error_code_snap_not_fully_recovered_unsupported) { snapFailed = true; break; } diff --git a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp index a576bf7360..dc5af8396a 100644 --- a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp +++ b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp @@ -70,14 +70,12 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { double getCheckTimeout() const override { return std::numeric_limits::max(); } void disableFailureInjectionWorkloads(std::set& out) const override { - out.insert("RandomMoveKeys"); - - // Rollback interferes with the - // \xff\xff/worker_interfaces test, since it can - // trigger a cluster recvoery, causing the worker - // interface for a machine to be updated in the middle - // of the test. - out.insert("RollbackWorkload"); + // Failure injection workloads like Rollback, Attrition and so on are interfering with the test. + // In particular, the test aims to test special keys' functions on monitoring and managing the cluster. + // It expects the FDB cluster is healthy and not doing unexpected configuration changes. + // All changes should come from special keys' operations' outcome. 
+ // Consequently, we disable all failure injection workloads in background for this test + out.insert("all"); } Future _setup(Database cx, SpecialKeySpaceCorrectnessWorkload* self) { diff --git a/flow/Arena.cpp b/flow/Arena.cpp index 16908dbefd..884f792227 100644 --- a/flow/Arena.cpp +++ b/flow/Arena.cpp @@ -297,12 +297,13 @@ void* ArenaBlock::make4kAlignedBuffer(uint32_t size) { } void ArenaBlock::dependOn(Reference& self, ArenaBlock* other) { - ASSERT(self->getData() != other->getData()); other->addref(); - if (!self || self->isTiny() || self->unused() < sizeof(ArenaBlockRef)) + if (!self || self->isTiny() || self->unused() < sizeof(ArenaBlockRef)) { create(SMALL, self)->makeReference(other); - else + } else { + ASSERT(self->getData() != other->getData()); self->makeReference(other); + } } void* ArenaBlock::dependOn4kAlignedBuffer(Reference& self, uint32_t size) { diff --git a/flow/Histogram.cpp b/flow/Histogram.cpp index 19a4946a23..41cbeaf81b 100644 --- a/flow/Histogram.cpp +++ b/flow/Histogram.cpp @@ -85,7 +85,7 @@ void HistogramRegistry::clear() { #pragma region Histogram -const char* const Histogram::UnitToStringMapper[] = { "microseconds", "bytes", "bytes_per_second", +const char* const Histogram::UnitToStringMapper[] = { "milliseconds", "bytes", "bytes_per_second", "percentage", "count", "none" }; void Histogram::writeToLog(double elapsed) { @@ -111,7 +111,8 @@ void Histogram::writeToLog(double elapsed) { if (buckets[i]) { totalCount += buckets[i]; switch (unit) { - case Unit::microseconds: + case Unit::milliseconds: + // value stored in microseconds, so divide by 1000 before writing e.detail(format("LessThan%u.%03u", int(value / 1000), int(value % 1000)), buckets[i]); break; case Unit::bytes: @@ -227,7 +228,7 @@ TEST_CASE("/flow/histogram/smoke_test") { h = Histogram::getHistogram("smoke_test"_sr, "counts"_sr, Histogram::Unit::bytes); ASSERT(h->buckets[0] == 0); - h = Histogram::getHistogram("smoke_test"_sr, "times"_sr, Histogram::Unit::microseconds); + h 
= Histogram::getHistogram("smoke_test"_sr, "times"_sr, Histogram::Unit::milliseconds); h->sampleSeconds(0.000000); h->sampleSeconds(0.0000019); diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index 7d6132bc6c..ba391f8191 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -86,8 +86,8 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) { init( WRITE_TRACING_ENABLED, true ); if( randomize && BUGGIFY ) WRITE_TRACING_ENABLED = false; init( TRACING_SPAN_ATTRIBUTES_ENABLED, false ); // Additional K/V and tenant data added to Span Attributes - init( TRACING_SAMPLE_RATE, 0.0); // Fraction of distributed traces (not spans) to sample (0 means ignore all traces) - init( TRACING_UDP_LISTENER_ADDR, "127.0.0.1"); // Only applicable if TracerType is set to a network option + init( TRACING_SAMPLE_RATE, 0.0 ); if (randomize && BUGGIFY) TRACING_SAMPLE_RATE = 0.01; // Fraction of distributed traces (not spans) to sample (0 means ignore all traces) + init( TRACING_UDP_LISTENER_ADDR, "127.0.0.1" ); // Only applicable if TracerType is set to a network option init( TRACING_UDP_LISTENER_PORT, 8889 ); // Only applicable if TracerType is set to a network option //connectionMonitor diff --git a/flow/include/flow/Histogram.h b/flow/include/flow/Histogram.h index b4bea7ab66..705e491ecf 100644 --- a/flow/include/flow/Histogram.h +++ b/flow/include/flow/Histogram.h @@ -58,7 +58,7 @@ HistogramRegistry& GetHistogramRegistry(); */ class Histogram final : public ReferenceCounted { public: - enum class Unit { microseconds = 0, bytes, bytes_per_second, percentageLinear, countLinear, MAXHISTOGRAMUNIT }; + enum class Unit { milliseconds = 0, bytes, bytes_per_second, percentageLinear, countLinear, MAXHISTOGRAMUNIT }; static const char* const UnitToStringMapper[]; Histogram(Reference regis, diff --git a/flow/include/flow/Net2Packet.h b/flow/include/flow/Net2Packet.h index d9c10a3bcb..67ae8a82d5 100644 --- a/flow/include/flow/Net2Packet.h +++ b/flow/include/flow/Net2Packet.h @@ -45,7 
+45,7 @@ public: UnsentPacketQueue() : unsent_first(0), unsent_last(0), sendQueueLatencyHistogram( - Histogram::getHistogram("UnsentPacketQueue"_sr, "QueueWait"_sr, Histogram::Unit::microseconds)) {} + Histogram::getHistogram("UnsentPacketQueue"_sr, "QueueWait"_sr, Histogram::Unit::milliseconds)) {} ~UnsentPacketQueue() { discardAll(); diff --git a/flow/include/flow/flat_buffers.h b/flow/include/flow/flat_buffers.h index 895c7a2969..e64982ecbf 100644 --- a/flow/include/flow/flat_buffers.h +++ b/flow/include/flow/flat_buffers.h @@ -186,12 +186,28 @@ struct vector_like_traits> : std::true_type { } }; +// std::map and std::set have overloads of insert that take a hint parameter. +// If you start with an empty set, insert sorted data, and use end() as the hint, then inserting n items is O(n) instead +// of the O(n log n) you would get if you used std::inserter. +template +struct InsertHintIterator { + Container* set; + void operator=(const typename Container::value_type& t) { set->insert(set->end(), t); } + InsertHintIterator& operator*() { return *this; } + void operator++() {} +}; + +template +auto insert_hint_iterator(Container& set) { + return InsertHintIterator{ &set }; +} + template struct vector_like_traits> : std::true_type { using Vec = std::map; using value_type = std::pair; using iterator = typename Vec::const_iterator; - using insert_iterator = std::insert_iterator; + using insert_iterator = InsertHintIterator; template static size_t num_entries(const Vec& v, Context&) { @@ -204,7 +220,7 @@ struct vector_like_traits> : std::true_type template static insert_iterator insert(Vec& v, size_t s, Context&) { v.clear(); - return std::inserter(v, v.end()); + return insert_hint_iterator(v); } template @@ -273,7 +289,7 @@ struct vector_like_traits> : std::true_type { using Vec = std::set; using value_type = Key; using iterator = typename Vec::const_iterator; - using insert_iterator = std::insert_iterator; + using insert_iterator = InsertHintIterator; template static 
size_t num_entries(const Vec& v, Context&) { @@ -286,7 +302,7 @@ struct vector_like_traits> : std::true_type { template static insert_iterator insert(Vec& v, size_t size, Context&) { v.clear(); - return std::inserter(v, v.end()); + return insert_hint_iterator(v); } template diff --git a/flow/include/flow/serialize.h b/flow/include/flow/serialize.h index 4c0deddcc7..e49ccb4cff 100644 --- a/flow/include/flow/serialize.h +++ b/flow/include/flow/serialize.h @@ -256,7 +256,7 @@ inline void load(Archive& ar, std::set& value) { T currentValue; for (int i = 0; i < s; i++) { ar >> currentValue; - value.insert(currentValue); + value.insert(value.end(), currentValue); } ASSERT(ar.protocolVersion().isValid()); } @@ -277,7 +277,7 @@ inline void load(Archive& ar, std::map& value) { for (int i = 0; i < s; ++i) { std::pair p; ar >> p.first >> p.second; - value.emplace(p); + value.emplace_hint(value.end(), p); } ASSERT(ar.protocolVersion().isValid()); } diff --git a/flowbench/BenchSamples.cpp b/flowbench/BenchSamples.cpp index 1ac9e034d2..f2af81117d 100644 --- a/flowbench/BenchSamples.cpp +++ b/flowbench/BenchSamples.cpp @@ -143,7 +143,7 @@ static void bench_histogramPct(benchmark::State& state) { BENCHMARK(bench_histogramPct)->ReportAggregatesOnly(true); static void bench_histogramTime(benchmark::State& state) { - Reference h = Histogram::getHistogram("histogramTest"_sr, "latency"_sr, Histogram::Unit::microseconds); + Reference h = Histogram::getHistogram("histogramTest"_sr, "latency"_sr, Histogram::Unit::milliseconds); InputGenerator data(1e6, []() { return deterministicRandom()->random01() * 5; }); for (auto _ : state) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index bbfa9dfd89..66c92408fe 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,7 +10,7 @@ set(TEST_AGGREGATE_TRACES "NONE" CACHE STRING "Create aggregated trace files (NO set(TEST_LOG_FORMAT "xml" CACHE STRING "Format for test trace files (xml, json)") set(TEST_INCLUDE ".*" CACHE STRING 
"Include only tests that match the given regex") set(TEST_EXCLUDE ".^" CACHE STRING "Exclude all tests matching the given regex") -set(SANITIZER_OPTIONS "UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1;TSAN_OPTIONS=suppressions=${CMAKE_SOURCE_DIR}/contrib/tsan.suppressions" CACHE STRING "Environment variables setting sanitizer options") +set(SANITIZER_OPTIONS "UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1;TSAN_OPTIONS=suppressions=${CMAKE_SOURCE_DIR}/contrib/tsan.suppressions;LSAN_OPTIONS=suppressions=${CMAKE_SOURCE_DIR}/contrib/lsan.suppressions" CACHE STRING "Environment variables setting sanitizer options") # for the restart test we optimally want to use the last stable fdbserver # to test upgrades @@ -34,6 +34,8 @@ if(WITH_PYTHON) Or provide a path to another fdbserver") endif() + configure_file(${PROJECT_SOURCE_DIR}/tests/TestRunner/fdb_version.py.cmake ${PROJECT_BINARY_DIR}/tests/TestRunner/fdb_version.py) + set(TestRunner "${PROJECT_SOURCE_DIR}/tests/TestRunner/TestRunner.py") configure_file(${PROJECT_SOURCE_DIR}/tests/CTestCustom.ctest.cmake ${PROJECT_BINARY_DIR}/CTestCustom.ctest @ONLY) @@ -124,6 +126,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/AtomicBackupToDBCorrectness.toml) add_fdb_test(TEST_FILES fast/AtomicOps.toml) add_fdb_test(TEST_FILES fast/AtomicOpsApiCorrectness.toml) + add_fdb_test(TEST_FILES fast/AuthzSecurity.toml) add_fdb_test(TEST_FILES fast/AutomaticIdempotency.toml) add_fdb_test(TEST_FILES fast/BackupAzureBlobCorrectness.toml IGNORE) add_fdb_test(TEST_FILES fast/BackupS3BlobCorrectness.toml IGNORE) diff --git a/tests/TestRunner/binary_download.py b/tests/TestRunner/binary_download.py index 6f14969a79..750b85a350 100644 --- a/tests/TestRunner/binary_download.py +++ b/tests/TestRunner/binary_download.py @@ -7,12 +7,10 @@ import shutil import stat from urllib import request import hashlib +from fdb_version import CURRENT_VERSION, FUTURE_VERSION from local_cluster import random_secret_string -CURRENT_VERSION = "7.3.0" 
-FUTURE_VERSION = "7.4.0" - SUPPORTED_PLATFORMS = ["x86_64", "aarch64"] FDB_DOWNLOAD_ROOT = "https://github.com/apple/foundationdb/releases/download/" LOCAL_OLD_BINARY_REPO = "/opt/foundationdb/old/" @@ -110,7 +108,7 @@ class FdbBinaryDownloader: assert local_file_tmp.exists(), "{} does not exist".format(local_file_tmp) assert local_sha256.exists(), "{} does not exist".format(local_sha256) - expected_checksum = read_to_str(local_sha256) + expected_checksum = read_to_str(local_sha256)[0:64] actual_checkum = compute_sha256(local_file_tmp) if expected_checksum == actual_checkum: print("Checksum OK") diff --git a/tests/TestRunner/fdb_version.py.cmake b/tests/TestRunner/fdb_version.py.cmake new file mode 100644 index 0000000000..04883cd797 --- /dev/null +++ b/tests/TestRunner/fdb_version.py.cmake @@ -0,0 +1,5 @@ +CURRENT_VERSION = "${FDB_CURRENT_VERSION}" +FUTURE_VERSION = "${FDB_FUTURE_VERSION}" +PREV_RELEASE_VERSION = "${FDB_PREV_RELEASE_VERSION}" +PREV2_RELEASE_VERSION = "${FDB_PREV2_RELEASE_VERSION}" +PREV3_RELEASE_VERSION = "${FDB_PREV3_RELEASE_VERSION}" diff --git a/tests/TestRunner/upgrade_test.py b/tests/TestRunner/upgrade_test.py index 83d9cc3fc5..dab62aed4d 100755 --- a/tests/TestRunner/upgrade_test.py +++ b/tests/TestRunner/upgrade_test.py @@ -11,7 +11,8 @@ import sys from threading import Thread, Event import traceback import time -from binary_download import FdbBinaryDownloader, CURRENT_VERSION, FUTURE_VERSION +from binary_download import FdbBinaryDownloader +from fdb_version import CURRENT_VERSION, FUTURE_VERSION from local_cluster import LocalCluster, random_secret_string TENANT_API_VERSION = 720 diff --git a/tests/fast/AuthzSecurity.toml b/tests/fast/AuthzSecurity.toml new file mode 100644 index 0000000000..9556751a73 --- /dev/null +++ b/tests/fast/AuthzSecurity.toml @@ -0,0 +1,31 @@ +[configuration] +allowDefaultTenant = false +tenantModes = ['optional', 'required'] + +[[test]] +testTitle = 'TenantCreation' + + [[test.workload]] + testName = 
'CreateTenant' + name = 'AuthzSecurityTenant' + + [[test.workload]] + testName = 'CreateTenant' + name = 'AnotherAuthzSecurityTenant' + +[[test]] +testTitle = 'AuthzSecurityCheck' +clearAfterTest = false + + [[test.workload]] + testName = 'LeakTLogInterface' + tenant = 'AuthzSecurityTenant' + key = 'TLogInterface' + testDuration = 10.0 + + [[test.workload]] + testName = 'AuthzSecurity' + tenantA = 'AuthzSecurityTenant' + tenantB = 'AnotherAuthzSecurityTenant' + tLogConfigKey = 'TLogInterface' + testDuration = 10.0 diff --git a/tests/fast/EncryptedBackupCorrectness.toml b/tests/fast/EncryptedBackupCorrectness.toml index 9470eb4460..df5f9839b8 100644 --- a/tests/fast/EncryptedBackupCorrectness.toml +++ b/tests/fast/EncryptedBackupCorrectness.toml @@ -1,5 +1,6 @@ [configuration] tenantModes = ['required'] +encryptModes = ['domain_aware'] [[knobs]] enable_encryption = true diff --git a/tests/restarting/from_7.1.0/SnapCycleRestart-1.txt b/tests/restarting/from_7.1.0/SnapCycleRestart-1.txt index 585d975bed..c98df2ffa2 100644 --- a/tests/restarting/from_7.1.0/SnapCycleRestart-1.txt +++ b/tests/restarting/from_7.1.0/SnapCycleRestart-1.txt @@ -1,4 +1,5 @@ storageEngineExcludeTypes=4,5 +logAntiQuorum=0 ;Take snap and do cycle test testTitle=SnapCyclePre diff --git a/tests/restarting/from_7.1.0/SnapTestAttrition-1.txt b/tests/restarting/from_7.1.0/SnapTestAttrition-1.txt index fa9028e6be..709d62c505 100644 --- a/tests/restarting/from_7.1.0/SnapTestAttrition-1.txt +++ b/tests/restarting/from_7.1.0/SnapTestAttrition-1.txt @@ -1,4 +1,5 @@ storageEngineExcludeTypes=4,5 +logAntiQuorum=0 ;write 1000 Keys ending with even numbers testTitle=SnapTestPre diff --git a/tests/restarting/from_7.1.0/SnapTestRestart-1.txt b/tests/restarting/from_7.1.0/SnapTestRestart-1.txt index 2b90b703dc..3013a0fa0a 100644 --- a/tests/restarting/from_7.1.0/SnapTestRestart-1.txt +++ b/tests/restarting/from_7.1.0/SnapTestRestart-1.txt @@ -1,4 +1,5 @@ storageEngineExcludeTypes=4,5 +logAntiQuorum=0 ;write 1000 
Keys ending with even numbers testTitle=SnapTestPre diff --git a/tests/restarting/from_7.1.0/SnapTestSimpleRestart-1.txt b/tests/restarting/from_7.1.0/SnapTestSimpleRestart-1.txt index b322742418..2068ee98ca 100644 --- a/tests/restarting/from_7.1.0/SnapTestSimpleRestart-1.txt +++ b/tests/restarting/from_7.1.0/SnapTestSimpleRestart-1.txt @@ -1,4 +1,5 @@ storageEngineExcludeTypes=4,5 +logAntiQuorum=0 ;write 1000 Keys ending with even number testTitle=SnapSimplePre diff --git a/tests/slow/DifferentClustersSameRV.toml b/tests/slow/DifferentClustersSameRV.toml index 31121b5ccc..373eb36396 100644 --- a/tests/slow/DifferentClustersSameRV.toml +++ b/tests/slow/DifferentClustersSameRV.toml @@ -1,5 +1,7 @@ [configuration] extraDatabaseMode = 'Single' +# Temporarily disable default tenants in this test pending tenant implementation changes +allowDefaultTenant = false [[test]] testTitle = 'DifferentClustersSameRV' diff --git a/tests/slow/DiskFailureCycle.toml b/tests/slow/DiskFailureCycle.toml index 74e1b3a613..bbbb088b44 100644 --- a/tests/slow/DiskFailureCycle.toml +++ b/tests/slow/DiskFailureCycle.toml @@ -3,7 +3,7 @@ buggify = false minimumReplication = 3 minimumRegions = 3 logAntiQuorum = 0 -storageEngineExcludeTypes = [4, 5] +storageEngineExcludeTypes = [1, 2, 4, 5] disableRemoteKVS = true [[test]]