diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000000..c72f249028 --- /dev/null +++ b/.flake8 @@ -0,0 +1,5 @@ +[flake8] +ignore = E203, E266, E501, W503, F403, F401, E711 +max-line-length = 79 +max-complexity = 18 +select = B,C,E,F,W,T4,B9 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 57f424d17b..0fe8a7c92a 100644 --- a/.gitignore +++ b/.gitignore @@ -64,6 +64,7 @@ packaging/msi/obj simfdb tests/oldBinaries trace.*.xml +trace.*.json .venv # Editor files diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..a843a71aac --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,9 @@ +repos: +- repo: https://github.com/psf/black + rev: 2018e667a6a36ee3fbfa8041cd36512f92f60d49 # frozen: 22.8.0 + hooks: + - id: black +- repo: https://github.com/pycqa/flake8 + rev: f8e1b317742036ff11ff86356fd2b68147e169f7 # frozen: 5.0.4 + hooks: + - id: flake8 \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 5666b1f202..4b631b5df4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,11 @@ else() cmake_minimum_required(VERSION 3.13) endif() +# silence deprecation warnings in newer versions of cmake +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + project(foundationdb VERSION 7.2.0 DESCRIPTION "FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions." diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 35845dbb08..f142b3dfae 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -38,6 +38,9 @@ We love pull requests! For minor changes, feel free to open up a PR directly. Fo CI will be run automatically for core committers, and for community PRs it will be initiated by the request of a core committer. Tests can also be run locally via `ctest`, and core committers can run additional validation on pull requests prior to merging them. +### Python pre-commit +We use a pre-commit pipeline with black and flake8 to enforce Python coding best practices. Install pre-commit with `pip install pre-commit`, then install it in your FoundationDB directory with `pre-commit install`. + ### Reporting issues Please refer to the section below on [using GitHub issues and the community forums](#using-github-issues-and-community-forums) for more info. 
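(A minimal sketch of how the hooks configured in .pre-commit-config.yaml above are typically exercised locally; it assumes pre-commit has already been installed via pip, and the file path in the third command is only an example.)

```sh
# Register the git hook once inside the FoundationDB checkout
pre-commit install

# Run black and flake8 over all tracked files, e.g. for a first cleanup pass
pre-commit run --all-files

# Or check specific files before committing (path shown is illustrative)
pre-commit run --files bindings/bindingtester/bindingtester.py

# After installation, `git commit` runs the configured hooks on staged files automatically
```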
diff --git a/FDBLibTLS/FDBLibTLSPolicy.cpp b/FDBLibTLS/FDBLibTLSPolicy.cpp index 6f81f91335..d97932659b 100644 --- a/FDBLibTLS/FDBLibTLSPolicy.cpp +++ b/FDBLibTLS/FDBLibTLSPolicy.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/FDBLibTLS/FDBLibTLSVerify.cpp b/FDBLibTLS/FDBLibTLSVerify.cpp index 216966f4c0..4aeea07c15 100644 --- a/FDBLibTLS/FDBLibTLSVerify.cpp +++ b/FDBLibTLS/FDBLibTLSVerify.cpp @@ -28,6 +28,7 @@ #include #include #include +#include static int hexValue(char c) { static char const digits[] = "0123456789ABCDEF"; diff --git a/bindings/bindingtester/bindingtester.py b/bindings/bindingtester/bindingtester.py index 508ede8998..a5de827fe9 100755 --- a/bindings/bindingtester/bindingtester.py +++ b/bindings/bindingtester/bindingtester.py @@ -49,6 +49,17 @@ from bindingtester.known_testers import Tester import fdb import fdb.tuple + +API_VERSIONS = [ + 13, 14, 16, 21, 22, 23, + 100, 200, 300, + 400, 410, 420, 430, 440, 450, 460, + 500, 510, 520, + 600, 610, 620, 630, + 700, 710, 720, +] + + fdb.api_version(FDB_API_VERSION) @@ -156,8 +167,7 @@ def choose_api_version(selected_api_version, tester_min_version, tester_max_vers elif random.random() < 0.7: api_version = min_version elif random.random() < 0.9: - api_version = random.choice([v for v in [13, 14, 16, 21, 22, 23, 100, 200, 300, 400, 410, 420, 430, - 440, 450, 460, 500, 510, 520, 600, 610, 620, 630, 700, 710, 720] if v >= min_version and v <= max_version]) + api_version = random.choice([v for v in API_VERSIONS if v >= min_version and v <= max_version]) else: api_version = random.randint(min_version, max_version) diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index da8f0b0bac..e0a1fc31bb 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -30,13 +30,13 @@ endif() add_custom_command(OUTPUT ${asm_file} ${CMAKE_CURRENT_BINARY_DIR}/fdb_c_function_pointers.g.h COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/generate_asm.py ${os} ${cpu} - ${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.cpp - ${asm_file} - ${CMAKE_CURRENT_BINARY_DIR}/fdb_c_function_pointers.g.h + ${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.cpp + ${asm_file} + ${CMAKE_CURRENT_BINARY_DIR}/fdb_c_function_pointers.g.h DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/generate_asm.py ${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.cpp COMMENT "Generate C bindings") add_custom_target(fdb_c_generated DEPENDS ${asm_file} - ${CMAKE_CURRENT_BINARY_DIR}/fdb_c_function_pointers.g.h) + ${CMAKE_CURRENT_BINARY_DIR}/fdb_c_function_pointers.g.h) vexillographer_compile(TARGET fdb_c_options LANG c OUT ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/fdb_c_options.g.h OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/fdb_c_options.g.h) @@ -66,9 +66,9 @@ if(APPLE) set(symbols ${CMAKE_CURRENT_BINARY_DIR}/fdb_c.symbols) add_custom_command(OUTPUT ${symbols} COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/symbolify.py - ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c.h - ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c_internal.h - ${symbols} + ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c.h + ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c_internal.h + ${symbols} DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/symbolify.py ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c.h ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c_internal.h COMMENT "Generate exported_symbols_list") add_custom_target(exported_symbols_list DEPENDS ${symbols}) @@ -76,7 +76,7 @@ if(APPLE) target_link_options(fdb_c PRIVATE "LINKER:-no_weak_exports,-exported_symbols_list,${symbols}") elseif(WIN32) else() - if (NOT USE_UBSAN) + if(NOT 
USE_UBSAN) # For ubsan we need to export type information for the vptr check to work. # Otherwise we only want to export fdb symbols in the fdb c api. target_link_options(fdb_c PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.map") @@ -127,9 +127,9 @@ if(NOT WIN32) test/unit/fdb_api.hpp) add_library(fdb_cpp INTERFACE test/fdb_api.hpp) - target_sources(fdb_cpp INTERFACE ) + target_sources(fdb_cpp INTERFACE) target_include_directories(fdb_cpp INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/test) - target_link_libraries(fdb_cpp INTERFACE fmt::fmt) + target_link_libraries(fdb_cpp INTERFACE fdb_c fmt::fmt) set(API_TESTER_SRCS test/apitester/fdb_c_api_tester.cpp @@ -139,6 +139,9 @@ if(NOT WIN32) test/apitester/TesterTestSpec.cpp test/apitester/TesterTestSpec.h test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp + test/apitester/TesterBlobGranuleErrorsWorkload.cpp + test/apitester/TesterBlobGranuleUtil.cpp + test/apitester/TesterBlobGranuleUtil.h test/apitester/TesterCancelTransactionWorkload.cpp test/apitester/TesterCorrectnessWorkload.cpp test/apitester/TesterExampleWorkload.cpp @@ -154,7 +157,7 @@ if(NOT WIN32) test/apitester/TesterWatchAndWaitWorkload.cpp test/apitester/TesterWorkload.cpp test/apitester/TesterWorkload.h - ) + ) add_library(fdb_c_unit_tests_impl OBJECT ${UNIT_TEST_SRCS}) add_library(fdb_c_api_tester_impl OBJECT ${API_TESTER_SRCS}) @@ -196,6 +199,9 @@ if(NOT WIN32) target_include_directories(fdb_c_api_tester_impl PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include ${CMAKE_BINARY_DIR}/flow/include) target_link_libraries(fdb_c_api_tester_impl PRIVATE fdb_cpp toml11_target Threads::Threads fmt::fmt boost_target) + if(NOT APPLE) + target_link_libraries(fdb_c_api_tester_impl PRIVATE stdc++fs) + endif() target_link_libraries(fdb_c_api_tester_impl PRIVATE SimpleOpt) target_include_directories(fdb_c_unit_tests_impl PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/) @@ -222,208 +228,222 @@ if(NOT WIN32) set(FDB_C_TARGET $) else() set(FDB_C_TARGET $) - endif() - add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - COMMAND ${CMAKE_COMMAND} -E copy ${FDB_C_TARGET} ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - DEPENDS fdb_c - COMMENT "Copy libfdb_c to use as external client for test") - add_custom_target(external_client DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so) - add_dependencies(fdb_c_unit_tests_impl external_client) - add_dependencies(disconnected_timeout_unit_tests external_client) - add_dependencies(fdb_c_api_tester_impl external_client) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + COMMAND ${CMAKE_COMMAND} -E copy ${FDB_C_TARGET} ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + DEPENDS fdb_c + COMMENT "Copy libfdb_c to use as external client for test") + add_custom_target(external_client DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so) + add_dependencies(fdb_c_unit_tests_impl external_client) + add_dependencies(disconnected_timeout_unit_tests external_client) + add_dependencies(fdb_c_api_tester_impl external_client) - add_fdbclient_test( - NAME fdb_c_setup_tests - COMMAND $) - add_fdbclient_test( - NAME fdb_c_unit_tests - COMMAND $ - @CLUSTER_FILE@ - fdb) - add_fdbclient_test( - NAME fdb_c_unit_tests_version_510 - COMMAND $ - @CLUSTER_FILE@ - fdb) - add_fdbclient_test( - NAME trace_partial_file_suffix_test - COMMAND $ - @CLUSTER_FILE@ - fdb) - add_fdbclient_test( - NAME 
fdb_c_external_client_unit_tests - COMMAND $ - @CLUSTER_FILE@ - fdb - ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - ) - add_unavailable_fdbclient_test( - NAME disconnected_timeout_unit_tests - COMMAND $ - @CLUSTER_FILE@ - ) - add_unavailable_fdbclient_test( - NAME disconnected_timeout_external_client_unit_tests - COMMAND $ - @CLUSTER_FILE@ - ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - ) - add_fdbclient_test( - NAME fdb_c_api_tests - DISABLE_LOG_DUMP - COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py - --cluster-file - @CLUSTER_FILE@ - --tester-binary - $ - --external-client-library - ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - --test-dir - ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests - --tmp-dir - @TMP_DIR@ - --log-dir - @LOG_DIR@ - ) + add_fdbclient_test( + NAME fdb_c_setup_tests + COMMAND $) + add_fdbclient_test( + NAME fdb_c_unit_tests + COMMAND $ + @CLUSTER_FILE@ + fdb) + add_fdbclient_test( + NAME fdb_c_unit_tests_version_510 + COMMAND $ + @CLUSTER_FILE@ + fdb) + add_fdbclient_test( + NAME trace_partial_file_suffix_test + COMMAND $ + @CLUSTER_FILE@ + fdb) + add_fdbclient_test( + NAME fdb_c_external_client_unit_tests + COMMAND $ + @CLUSTER_FILE@ + fdb + ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + ) + add_unavailable_fdbclient_test( + NAME disconnected_timeout_unit_tests + COMMAND $ + @CLUSTER_FILE@ + ) + add_unavailable_fdbclient_test( + NAME disconnected_timeout_external_client_unit_tests + COMMAND $ + @CLUSTER_FILE@ + ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + ) + add_fdbclient_test( + NAME fdb_c_api_tests + DISABLE_LOG_DUMP + COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py + --cluster-file + @CLUSTER_FILE@ + --tester-binary + $ + --external-client-library + ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + --test-dir + ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests + --tmp-dir + @TMP_DIR@ + --log-dir + @LOG_DIR@ + ) - add_fdbclient_test( - NAME fdb_c_api_tests_blob_granule - DISABLE_LOG_DUMP - API_TEST_BLOB_GRANULES_ENABLED - COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py - --cluster-file - @CLUSTER_FILE@ - --tester-binary - $ - --external-client-library - ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - --test-dir - ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/blobgranuletests - --blob-granule-local-file-path - @DATA_DIR@/fdbblob/ - --tmp-dir - @TMP_DIR@ - --log-dir - @LOG_DIR@ - ) + add_fdbclient_test( + NAME fdb_c_api_tests_local_only + DISABLE_LOG_DUMP + COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py + --cluster-file + @CLUSTER_FILE@ + --tester-binary + $ + --test-dir + ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/local_tests + --tmp-dir + @TMP_DIR@ + --log-dir + @LOG_DIR@ + ) - add_fdbclient_test( - NAME fdb_c_api_tests_with_tls - DISABLE_LOG_DUMP - TLS_ENABLED - COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py - --cluster-file - @CLUSTER_FILE@ - --tester-binary - $ - --external-client-library - ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - --test-dir - ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests - --tmp-dir - @TMP_DIR@ - --log-dir - @LOG_DIR@ - --tls-cert-file - @CLIENT_CERT_FILE@ - --tls-key-file - @CLIENT_KEY_FILE@ - --tls-ca-file - @SERVER_CA_FILE@ - ) + add_fdbclient_test( + NAME fdb_c_api_tests_blob_granule + DISABLE_LOG_DUMP + API_TEST_BLOB_GRANULES_ENABLED + COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py + --cluster-file + @CLUSTER_FILE@ + --tester-binary + $ 
+ --external-client-library + ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + --test-dir + ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/blobgranuletests + --blob-granule-local-file-path + @DATA_DIR@/fdbblob/ + --tmp-dir + @TMP_DIR@ + --log-dir + @LOG_DIR@ + ) - add_test(NAME fdb_c_upgrade_to_future_version - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + add_fdbclient_test( + NAME fdb_c_api_tests_with_tls + DISABLE_LOG_DUMP + TLS_ENABLED + COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py + --cluster-file + @CLUSTER_FILE@ + --tester-binary + $ + --external-client-library + ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + --test-dir + ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests + --tmp-dir + @TMP_DIR@ + --log-dir + @LOG_DIR@ + --tls-cert-file + @CLIENT_CERT_FILE@ + --tls-key-file + @CLIENT_KEY_FILE@ + --tls-ca-file + @SERVER_CA_FILE@ + ) + + add_test(NAME fdb_c_upgrade_to_future_version + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.2.0" "7.3.0" "7.2.0" + --process-number 3 + ) + set_tests_properties("fdb_c_upgrade_to_future_version" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") + + add_test(NAME fdb_c_upgrade_to_future_version_blob_granules + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml + --upgrade-path "7.2.0" "7.3.0" "7.2.0" + --blob-granules-enabled + --process-number 3 + ) + + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT USE_SANITIZER) + add_test(NAME fdb_c_upgrade_single_threaded_630api + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml + --upgrade-path "6.3.23" "7.0.0" "7.1.9" "7.2.0" + --process-number 1 + ) + + add_test(NAME fdb_c_upgrade_single_threaded_700api + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml + --upgrade-path "7.0.0" "7.1.9" "7.2.0" + --process-number 1 + ) + + add_test(NAME fdb_c_upgrade_multi_threaded_630api + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py --build-dir ${CMAKE_BINARY_DIR} --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.2.0" "7.3.0" "7.2.0" + --upgrade-path "6.3.23" "7.0.0" "7.1.9" "7.2.0" "7.1.9" --process-number 3 - ) - set_tests_properties("fdb_c_upgrade_to_future_version" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") + ) -if (0) # reenable after stabilizing the test - add_test(NAME fdb_c_upgrade_to_future_version_blob_granules - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + add_test(NAME fdb_c_upgrade_multi_threaded_700api + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml - --upgrade-path "7.2.0" "7.3.0" "7.2.0" - --blob-granules-enabled + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.0.0" "7.1.9" "7.2.0" "7.1.9" --process-number 3 - ) 
-endif() + ) - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT USE_SANITIZER) - add_test(NAME fdb_c_upgrade_single_threaded_630api - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml - --upgrade-path "6.3.23" "7.0.0" "7.1.9" "7.2.0" - --process-number 1 - ) - - add_test(NAME fdb_c_upgrade_single_threaded_700api - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml - --upgrade-path "7.0.0" "7.1.9" "7.2.0" - --process-number 1 - ) - - add_test(NAME fdb_c_upgrade_multi_threaded_630api + add_test(NAME fdb_c_upgrade_multi_threaded_710api COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "6.3.23" "7.0.0" "7.1.9" "7.2.0" "7.1.9" - --process-number 3 - ) + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.1.9" "7.2.0" "7.1.9" + --process-number 3 + ) - add_test(NAME fdb_c_upgrade_multi_threaded_700api + add_test(NAME fdb_c_cluster_wiggle COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.0.0" "7.1.9" "7.2.0" "7.1.9" - --process-number 3 - ) + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.2.0" "wiggle" + --disable-log-dump + --process-number 3 + --redundancy double + ) - add_test(NAME fdb_c_upgrade_multi_threaded_710api + add_test(NAME fdb_c_wiggle_and_upgrade_latest COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.1.9" "7.2.0" "7.1.9" - --process-number 3 - ) - - add_test(NAME fdb_c_cluster_wiggle - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.2.0" "wiggle" - --disable-log-dump - --process-number 3 - --redundancy double - ) - - add_test(NAME fdb_c_wiggle_and_upgrade_latest - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.1.9" "wiggle" "7.2.0" - --disable-log-dump - --process-number 3 - --redundancy double + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.1.9" "wiggle" "7.2.0" + --disable-log-dump + --process-number 3 + --redundancy double ) add_test(NAME fdb_c_wiggle_and_upgrade_63 COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "6.3.24" "wiggle" "7.0.0" - --disable-log-dump - 
--process-number 3 - --redundancy double - ) + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "6.3.24" "wiggle" "7.0.0" + --disable-log-dump + --process-number 3 + --redundancy double + ) + endif() endif() endif() @@ -442,12 +462,12 @@ set_target_properties(c_workloads PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/share/foundationdb") target_link_libraries(c_workloads PUBLIC fdb_c) -if (NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) +if(NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) target_link_options(c_workloads PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/external_workload.map,-z,nodelete") endif() # Generate shim library in Linux builds -if (OPEN_FOR_IDE) +if(OPEN_FOR_IDE) add_library(fdb_c_shim OBJECT foundationdb/fdb_c_shim.h fdb_c_shim.cpp) target_link_libraries(fdb_c_shim PUBLIC dl) @@ -479,14 +499,14 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-santizer add_custom_command(OUTPUT ${SHIM_LIB_GEN_SRC} COMMAND $ ${IMPLIBSO_SRC_DIR}/implib-gen.py - --target ${CMAKE_SYSTEM_PROCESSOR} - --outdir ${SHIM_LIB_OUTPUT_DIR} - --dlopen-callback=fdb_shim_dlopen_callback - $ - DEPENDS ${IMPLIBSO_SRC} + --target ${CMAKE_SYSTEM_PROCESSOR} + --outdir ${SHIM_LIB_OUTPUT_DIR} + --dlopen-callback=fdb_shim_dlopen_callback + $ + DEPENDS ${IMPLIBSO_SRC} fdb_c COMMENT "Generating source code for C shim library") - add_library(fdb_c_shim SHARED ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp) + add_library(fdb_c_shim STATIC ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp) target_link_options(fdb_c_shim PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.map,-z,nodelete,-z,noexecstack") target_link_libraries(fdb_c_shim PUBLIC dl) target_include_directories(fdb_c_shim PUBLIC @@ -506,12 +526,12 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-santizer add_test(NAME fdb_c_shim_library_tests COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/test/fdb_c_shim_tests.py - --build-dir ${CMAKE_BINARY_DIR} - --unit-tests-bin $ - --api-tester-bin $ - --shim-lib-tester-bin $ - --api-test-dir ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests - ) + --build-dir ${CMAKE_BINARY_DIR} + --unit-tests-bin $ + --api-tester-bin $ + --shim-lib-tester-bin $ + --api-test-dir ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests + ) endif() # End Linux only, non-sanitizer only @@ -560,16 +580,16 @@ fdb_install( if(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-sanitizer only -fdb_install( - FILES foundationdb/fdb_c_shim.h - DESTINATION include - DESTINATION_SUFFIX /foundationdb - COMPONENT clients) + fdb_install( + FILES foundationdb/fdb_c_shim.h + DESTINATION include + DESTINATION_SUFFIX /foundationdb + COMPONENT clients) -fdb_install( - TARGETS fdb_c_shim - EXPORT ${targets_export_name} - DESTINATION lib - COMPONENT clients) + fdb_install( + TARGETS fdb_c_shim + EXPORT ${targets_export_name} + DESTINATION lib + COMPONENT clients) endif() # End Linux only, non-ubsan only diff --git a/bindings/c/fdb_c.cpp b/bindings/c/fdb_c.cpp index fdba399204..4b225ddd80 100644 --- a/bindings/c/fdb_c.cpp +++ b/bindings/c/fdb_c.cpp @@ -324,6 +324,15 @@ extern "C" DLLEXPORT fdb_error_t fdb_future_get_key_array(FDBFuture* f, FDBKey c *out_count = na.size();); } +extern "C" DLLEXPORT fdb_error_t fdb_future_get_granule_summary_array(FDBFuture* f, + FDBGranuleSummary const** out_ranges, + int* out_count) { + 
CATCH_AND_RETURN(Standalone> na = + TSAV(Standalone>, f)->get(); + *out_ranges = (FDBGranuleSummary*)na.begin(); + *out_count = na.size();); +} + extern "C" DLLEXPORT void fdb_result_destroy(FDBResult* r) { CATCH_AND_DIE(TSAVB(r)->cancel();); } @@ -539,10 +548,14 @@ extern "C" DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_verify_blob_rang uint8_t const* end_key_name, int end_key_name_length, int64_t version) { + Optional rv; + if (version != latestVersion) { + rv = version; + } return (FDBFuture*)(DB(db) ->verifyBlobRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length), StringRef(end_key_name, end_key_name_length)), - version) + rv) .extractPtr()); } @@ -943,6 +956,74 @@ extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules(FDBTransactio return (FDBResult*)(TXN(tr)->readBlobGranules(range, beginVersion, rv, context).extractPtr());); } +extern "C" DLLEXPORT FDBFuture* fdb_transaction_read_blob_granules_start(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + int64_t* readVersionOut) { + Optional rv; + if (readVersion != latestVersion) { + rv = readVersion; + } + return (FDBFuture*)(TXN(tr) + ->readBlobGranulesStart(KeyRangeRef(KeyRef(begin_key_name, begin_key_name_length), + KeyRef(end_key_name, end_key_name_length)), + beginVersion, + rv, + readVersionOut) + .extractPtr()); +} + +extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules_finish(FDBTransaction* tr, + FDBFuture* f, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + FDBReadBlobGranuleContext* granule_context) { + // FIXME: better way to convert? 
+ ReadBlobGranuleContext context; + context.userContext = granule_context->userContext; + context.start_load_f = granule_context->start_load_f; + context.get_load_f = granule_context->get_load_f; + context.free_load_f = granule_context->free_load_f; + context.debugNoMaterialize = granule_context->debugNoMaterialize; + context.granuleParallelism = granule_context->granuleParallelism; + ThreadFuture>> startFuture( + TSAV(Standalone>, f)); + + return (FDBResult*)(TXN(tr) + ->readBlobGranulesFinish(startFuture, + KeyRangeRef(KeyRef(begin_key_name, begin_key_name_length), + KeyRef(end_key_name, end_key_name_length)), + beginVersion, + readVersion, + context) + .extractPtr()); +} + +extern "C" DLLEXPORT FDBFuture* fdb_transaction_summarize_blob_granules(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t summaryVersion, + int rangeLimit) { + RETURN_FUTURE_ON_ERROR( + Standalone>, + KeyRangeRef range(KeyRef(begin_key_name, begin_key_name_length), KeyRef(end_key_name, end_key_name_length)); + + Optional sv; + if (summaryVersion != latestVersion) { sv = summaryVersion; } + + return (FDBFuture*)(TXN(tr)->summarizeBlobGranules(range, sv, rangeLimit).extractPtr());); +} + #include "fdb_c_function_pointers.g.h" #define FDB_API_CHANGED(func, ver) \ diff --git a/bindings/c/foundationdb/fdb_c.h b/bindings/c/foundationdb/fdb_c.h index 10534a94dc..2e4e977d76 100644 --- a/bindings/c/foundationdb/fdb_c.h +++ b/bindings/c/foundationdb/fdb_c.h @@ -84,12 +84,12 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_network_set_option(FDBNetworkOption int value_length); #if FDB_API_VERSION >= 14 -DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_setup_network(); +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_setup_network(void); #endif -DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_run_network(); +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_run_network(void); -DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_stop_network(); +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_stop_network(void); DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_add_network_thread_completion_hook(void (*hook)(void*), void* hook_parameter); @@ -179,6 +179,14 @@ typedef struct keyrange { const uint8_t* end_key; int end_key_length; } FDBKeyRange; + +typedef struct granulesummary { + FDBKeyRange key_range; + int64_t snapshot_version; + int64_t snapshot_size; + int64_t delta_version; + int64_t delta_size; +} FDBGranuleSummary; #pragma pack(pop) typedef struct readgranulecontext { @@ -264,6 +272,10 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_keyrange_array(FDBFuture FDBKeyRange const** out_ranges, int* out_count); +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_granule_summary_array(FDBFuture* f, + FDBGranuleSummary const** out_summaries, + int* out_count); + /* FDBResult is a synchronous computation result, as opposed to a future that is asynchronous. 
*/ DLLEXPORT void fdb_result_destroy(FDBResult* r); @@ -521,6 +533,14 @@ DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_transaction_read_blob_granules(FDBTr int64_t readVersion, FDBReadBlobGranuleContext granuleContext); +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_summarize_blob_granules(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t summaryVersion, + int rangeLimit); + #define FDB_KEYSEL_LAST_LESS_THAN(k, l) k, l, 0, 0 #define FDB_KEYSEL_LAST_LESS_OR_EQUAL(k, l) k, l, 1, 0 #define FDB_KEYSEL_FIRST_GREATER_THAN(k, l) k, l, 1, 1 @@ -528,8 +548,8 @@ DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_transaction_read_blob_granules(FDBTr DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_select_api_version_impl(int runtime_version, int header_version); -DLLEXPORT int fdb_get_max_api_version(); -DLLEXPORT const char* fdb_get_client_version(); +DLLEXPORT int fdb_get_max_api_version(void); +DLLEXPORT const char* fdb_get_client_version(void); /* LEGACY API VERSIONS */ diff --git a/bindings/c/foundationdb/fdb_c_internal.h b/bindings/c/foundationdb/fdb_c_internal.h index b7dcd3aa37..62b77f354e 100644 --- a/bindings/c/foundationdb/fdb_c_internal.h +++ b/bindings/c/foundationdb/fdb_c_internal.h @@ -51,6 +51,27 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_create_database_from_connection_str DLLEXPORT void fdb_use_future_protocol_version(); +// the logical read_blob_granules is broken out (at different points depending on the client type) into the asynchronous +// start() that happens on the fdb network thread, and synchronous finish() that happens off it +DLLEXPORT FDBFuture* fdb_transaction_read_blob_granules_start(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + int64_t* readVersionOut); + +DLLEXPORT FDBResult* fdb_transaction_read_blob_granules_finish(FDBTransaction* tr, + FDBFuture* f, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + FDBReadBlobGranuleContext* granuleContext); + #ifdef __cplusplus } #endif diff --git a/bindings/c/test/apitester/TesterApiWorkload.cpp b/bindings/c/test/apitester/TesterApiWorkload.cpp index c1499adb0c..a51b7dc03a 100644 --- a/bindings/c/test/apitester/TesterApiWorkload.cpp +++ b/bindings/c/test/apitester/TesterApiWorkload.cpp @@ -41,6 +41,10 @@ ApiWorkload::ApiWorkload(const WorkloadConfig& config) : WorkloadBase(config) { stopReceived = false; checkingProgress = false; apiVersion = config.apiVersion; + + for (int i = 0; i < config.numTenants; ++i) { + tenants.push_back(fdb::ByteString(fdb::toBytesRef("tenant" + std::to_string(i)))); + } } IWorkloadControlIfc* ApiWorkload::getControlIfc() { @@ -107,49 +111,57 @@ void ApiWorkload::randomOperation(TTaskFct cont) { } fdb::Key ApiWorkload::randomKeyName() { - return keyPrefix + Random::get().randomStringLowerCase(minKeyLength, maxKeyLength); + return keyPrefix + Random::get().randomByteStringLowerCase(minKeyLength, maxKeyLength); } fdb::Value ApiWorkload::randomValue() { - return Random::get().randomStringLowerCase(minValueLength, maxValueLength); + return Random::get().randomByteStringLowerCase(minValueLength, maxValueLength); } -fdb::Key ApiWorkload::randomNotExistingKey() { +fdb::Key ApiWorkload::randomNotExistingKey(std::optional tenantId) { while (true) { 
fdb::Key key = randomKeyName(); - if (!store.exists(key)) { + if (!stores[tenantId].exists(key)) { return key; } } } -fdb::Key ApiWorkload::randomExistingKey() { +fdb::Key ApiWorkload::randomExistingKey(std::optional tenantId) { fdb::Key genKey = randomKeyName(); - fdb::Key key = store.getKey(genKey, true, 1); - if (key != store.endKey()) { + fdb::Key key = stores[tenantId].getKey(genKey, true, 1); + if (key != stores[tenantId].endKey()) { return key; } - key = store.getKey(genKey, true, 0); - if (key != store.startKey()) { + key = stores[tenantId].getKey(genKey, true, 0); + if (key != stores[tenantId].startKey()) { return key; } info("No existing key found, using a new random key."); return genKey; } -fdb::Key ApiWorkload::randomKey(double existingKeyRatio) { +fdb::Key ApiWorkload::randomKey(double existingKeyRatio, std::optional tenantId) { if (Random::get().randomBool(existingKeyRatio)) { - return randomExistingKey(); + return randomExistingKey(tenantId); } else { - return randomNotExistingKey(); + return randomNotExistingKey(tenantId); } } -void ApiWorkload::populateDataTx(TTaskFct cont) { +std::optional ApiWorkload::randomTenant() { + if (tenants.size() > 0) { + return Random::get().randomInt(0, tenants.size() - 1); + } else { + return {}; + } +} + +void ApiWorkload::populateDataTx(TTaskFct cont, std::optional tenantId) { int numKeys = maxKeysPerTransaction; auto kvPairs = std::make_shared>(); for (int i = 0; i < numKeys; i++) { - kvPairs->push_back(fdb::KeyValue{ randomNotExistingKey(), randomValue() }); + kvPairs->push_back(fdb::KeyValue{ randomNotExistingKey(tenantId), randomValue() }); } execTransaction( [kvPairs](auto ctx) { @@ -158,37 +170,89 @@ void ApiWorkload::populateDataTx(TTaskFct cont) { } ctx->commit(); }, - [this, kvPairs, cont]() { + [this, tenantId, kvPairs, cont]() { for (const fdb::KeyValue& kv : *kvPairs) { - store.set(kv.key, kv.value); + stores[tenantId].set(kv.key, kv.value); } schedule(cont); - }); + }, + getTenant(tenantId)); +} + +void ApiWorkload::clearTenantData(TTaskFct cont, std::optional tenantId) { + execTransaction( + [this](auto ctx) { + ctx->tx().clearRange(keyPrefix, keyPrefix + fdb::Key(1, '\xff')); + ctx->commit(); + }, + [this, tenantId, cont]() { + if (tenantId && tenantId.value() < tenants.size() - 1) { + clearTenantData(cont, tenantId.value() + 1); + } else { + schedule(cont); + } + }, + getTenant(tenantId)); } void ApiWorkload::clearData(TTaskFct cont) { execTransaction( [this](auto ctx) { + // Make this self-conflicting, so that if we're retrying on timeouts + // once we get a successful commit all previous attempts are no + // longer in-flight. 
+ ctx->tx().addReadConflictRange(keyPrefix, keyPrefix + fdb::Key(1, '\xff')); ctx->tx().clearRange(keyPrefix, keyPrefix + fdb::Key(1, '\xff')); ctx->commit(); }, [this, cont]() { schedule(cont); }); } -void ApiWorkload::populateData(TTaskFct cont) { - if (store.size() < initialSize) { - populateDataTx([this, cont]() { populateData(cont); }); - } else { +void ApiWorkload::populateTenantData(TTaskFct cont, std::optional tenantId) { + while (stores[tenantId].size() >= initialSize && tenantId && tenantId.value() < tenants.size()) { + ++tenantId.value(); + } + + if (tenantId >= tenants.size() || stores[tenantId].size() >= initialSize) { info("Data population completed"); schedule(cont); + } else { + populateDataTx([this, cont, tenantId]() { populateTenantData(cont, tenantId); }, tenantId); } } -void ApiWorkload::randomInsertOp(TTaskFct cont) { +void ApiWorkload::createTenants(TTaskFct cont) { + execTransaction( + [this](auto ctx) { + auto futures = std::make_shared>(); + for (auto tenant : tenants) { + futures->push_back(fdb::Tenant::getTenant(ctx->tx(), tenant)); + } + ctx->continueAfterAll(*futures, [this, ctx, futures]() { + for (int i = 0; i < futures->size(); ++i) { + if (!(*futures)[i].get()) { + fdb::Tenant::createTenant(ctx->tx(), tenants[i]); + } + } + ctx->commit(); + }); + }, + [this, cont]() { schedule(cont); }); +} + +void ApiWorkload::populateData(TTaskFct cont) { + if (tenants.size() > 0) { + createTenants([this, cont]() { populateTenantData(cont, std::make_optional(0)); }); + } else { + populateTenantData(cont, {}); + } +} + +void ApiWorkload::randomInsertOp(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto kvPairs = std::make_shared>(); for (int i = 0; i < numKeys; i++) { - kvPairs->push_back(fdb::KeyValue{ randomNotExistingKey(), randomValue() }); + kvPairs->push_back(fdb::KeyValue{ randomNotExistingKey(tenantId), randomValue() }); } execTransaction( [kvPairs](auto ctx) { @@ -197,19 +261,20 @@ void ApiWorkload::randomInsertOp(TTaskFct cont) { } ctx->commit(); }, - [this, kvPairs, cont]() { + [this, kvPairs, cont, tenantId]() { for (const fdb::KeyValue& kv : *kvPairs) { - store.set(kv.key, kv.value); + stores[tenantId].set(kv.key, kv.value); } schedule(cont); - }); + }, + getTenant(tenantId)); } -void ApiWorkload::randomClearOp(TTaskFct cont) { +void ApiWorkload::randomClearOp(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto keys = std::make_shared>(); for (int i = 0; i < numKeys; i++) { - keys->push_back(randomExistingKey()); + keys->push_back(randomExistingKey(tenantId)); } execTransaction( [keys](auto ctx) { @@ -218,15 +283,16 @@ void ApiWorkload::randomClearOp(TTaskFct cont) { } ctx->commit(); }, - [this, keys, cont]() { + [this, keys, cont, tenantId]() { for (const auto& key : *keys) { - store.clear(key); + stores[tenantId].clear(key); } schedule(cont); - }); + }, + getTenant(tenantId)); } -void ApiWorkload::randomClearRangeOp(TTaskFct cont) { +void ApiWorkload::randomClearRangeOp(TTaskFct cont, std::optional tenantId) { fdb::Key begin = randomKeyName(); fdb::Key end = randomKeyName(); if (begin > end) { @@ -237,10 +303,19 @@ void ApiWorkload::randomClearRangeOp(TTaskFct cont) { ctx->tx().clearRange(begin, end); ctx->commit(); }, - [this, begin, end, cont]() { - store.clear(begin, end); + [this, begin, end, cont, tenantId]() { + stores[tenantId].clear(begin, end); schedule(cont); - }); + }, + getTenant(tenantId)); +} + +std::optional 
ApiWorkload::getTenant(std::optional tenantId) { + if (tenantId) { + return tenants[*tenantId]; + } else { + return {}; + } } } // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterApiWorkload.h b/bindings/c/test/apitester/TesterApiWorkload.h index fd3630ceee..a3a13e964d 100644 --- a/bindings/c/test/apitester/TesterApiWorkload.h +++ b/bindings/c/test/apitester/TesterApiWorkload.h @@ -96,17 +96,23 @@ protected: // Key prefix fdb::Key keyPrefix; + // The number of tenants to configure in the cluster + std::vector tenants; + // In-memory store maintaining expected database state - KeyValueStore store; + std::unordered_map, KeyValueStore> stores; ApiWorkload(const WorkloadConfig& config); // Methods for generating random keys and values fdb::Key randomKeyName(); fdb::Value randomValue(); - fdb::Key randomNotExistingKey(); - fdb::Key randomExistingKey(); - fdb::Key randomKey(double existingKeyRatio); + fdb::Key randomNotExistingKey(std::optional tenantId); + fdb::Key randomExistingKey(std::optional tenantId); + fdb::Key randomKey(double existingKeyRatio, std::optional tenantId); + + // Chooses a random tenant from the available tenants (or an empty optional if tenants aren't used in the test) + std::optional randomTenant(); // Generate initial random data for the workload void populateData(TTaskFct cont); @@ -115,12 +121,18 @@ protected: void clearData(TTaskFct cont); // common operations - void randomInsertOp(TTaskFct cont); - void randomClearOp(TTaskFct cont); - void randomClearRangeOp(TTaskFct cont); + void randomInsertOp(TTaskFct cont, std::optional tenantId); + void randomClearOp(TTaskFct cont, std::optional tenantId); + void randomClearRangeOp(TTaskFct cont, std::optional tenantId); + + std::optional getTenant(std::optional tenantId); private: - void populateDataTx(TTaskFct cont); + void populateDataTx(TTaskFct cont, std::optional tenantId); + void populateTenantData(TTaskFct cont, std::optional tenantId); + void createTenants(TTaskFct cont); + + void clearTenantData(TTaskFct cont, std::optional tenantId); void randomOperations(); }; diff --git a/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp index 97e4e5bf01..f6164296da 100644 --- a/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp @@ -18,61 +18,13 @@ * limitations under the License. 
*/ #include "TesterApiWorkload.h" +#include "TesterBlobGranuleUtil.h" #include "TesterUtil.h" #include #include namespace FdbApiTester { -class TesterGranuleContext { -public: - std::unordered_map loadsInProgress; - int64_t nextId = 0; - std::string basePath; - - ~TesterGranuleContext() { - // if there was an error or not all loads finished, delete data - for (auto& it : loadsInProgress) { - uint8_t* dataToFree = it.second; - delete[] dataToFree; - } - } -}; - -static int64_t granule_start_load(const char* filename, - int filenameLength, - int64_t offset, - int64_t length, - int64_t fullFileLength, - void* context) { - - TesterGranuleContext* ctx = (TesterGranuleContext*)context; - int64_t loadId = ctx->nextId++; - - uint8_t* buffer = new uint8_t[length]; - std::ifstream fin(ctx->basePath + std::string(filename, filenameLength), std::ios::in | std::ios::binary); - fin.seekg(offset); - fin.read((char*)buffer, length); - - ctx->loadsInProgress.insert({ loadId, buffer }); - - return loadId; -} - -static uint8_t* granule_get_load(int64_t loadId, void* context) { - TesterGranuleContext* ctx = (TesterGranuleContext*)context; - return ctx->loadsInProgress.at(loadId); -} - -static void granule_free_load(int64_t loadId, void* context) { - TesterGranuleContext* ctx = (TesterGranuleContext*)context; - auto it = ctx->loadsInProgress.find(loadId); - uint8_t* dataToFree = it->second; - delete[] dataToFree; - - ctx->loadsInProgress.erase(it); -} - class ApiBlobGranuleCorrectnessWorkload : public ApiWorkload { public: ApiBlobGranuleCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) { @@ -83,34 +35,39 @@ public: } private: - enum OpType { OP_INSERT, OP_CLEAR, OP_CLEAR_RANGE, OP_READ, OP_GET_RANGES, OP_LAST = OP_GET_RANGES }; + // FIXME: use other new blob granule apis! 
+ enum OpType { + OP_INSERT, + OP_CLEAR, + OP_CLEAR_RANGE, + OP_READ, + OP_GET_GRANULES, + OP_SUMMARIZE, + OP_GET_BLOB_RANGES, + OP_VERIFY, + OP_LAST = OP_VERIFY + }; std::vector excludedOpTypes; // Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet // FIXME: should still guarantee a read succeeds eventually somehow bool seenReadSuccess = false; - void randomReadOp(TTaskFct cont) { + void randomReadOp(TTaskFct cont, std::optional tenantId) { fdb::Key begin = randomKeyName(); fdb::Key end = randomKeyName(); - auto results = std::make_shared>(); - auto tooOld = std::make_shared(false); if (begin > end) { std::swap(begin, end); } + + auto results = std::make_shared>(); + auto tooOld = std::make_shared(false); + execTransaction( [this, begin, end, results, tooOld](auto ctx) { ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE); - TesterGranuleContext testerContext; - testerContext.basePath = ctx->getBGBasePath(); - - fdb::native::FDBReadBlobGranuleContext granuleContext; - granuleContext.userContext = &testerContext; - granuleContext.debugNoMaterialize = false; - granuleContext.granuleParallelism = 1; - granuleContext.start_load_f = &granule_start_load; - granuleContext.get_load_f = &granule_get_load; - granuleContext.free_load_f = &granule_free_load; + TesterGranuleContext testerContext(ctx->getBGBasePath()); + fdb::native::FDBReadBlobGranuleContext granuleContext = createGranuleContext(&testerContext); fdb::Result res = ctx->tx().readBlobGranules( begin, end, 0 /* beginVersion */, -2 /* latest read version */, granuleContext); @@ -135,9 +92,10 @@ private: ctx->done(); } }, - [this, begin, end, results, tooOld, cont]() { + [this, begin, end, results, tooOld, cont, tenantId]() { if (!*tooOld) { - std::vector expected = store.getRange(begin, end, store.size(), false); + std::vector expected = + stores[tenantId].getRange(begin, end, stores[tenantId].size(), false); if (results->size() != expected.size()) { error(fmt::format("randomReadOp result size mismatch. 
expected: {} actual: {}", expected.size(), @@ -168,16 +126,18 @@ private: } } schedule(cont); - }); + }, + getTenant(tenantId)); } - void randomGetRangesOp(TTaskFct cont) { + void randomGetGranulesOp(TTaskFct cont, std::optional tenantId) { fdb::Key begin = randomKeyName(); fdb::Key end = randomKeyName(); - auto results = std::make_shared>(); if (begin > end) { std::swap(begin, end); } + auto results = std::make_shared>(); + execTransaction( [begin, end, results](auto ctx) { fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType(); @@ -190,46 +150,180 @@ private: true); }, [this, begin, end, results, cont]() { - if (seenReadSuccess) { - ASSERT(results->size() > 0); - ASSERT(results->front().beginKey <= begin); - ASSERT(results->back().endKey >= end); - } + this->validateRanges(results, begin, end, seenReadSuccess); + schedule(cont); + }, + getTenant(tenantId)); + } + + void randomSummarizeOp(TTaskFct cont, std::optional tenantId) { + if (!seenReadSuccess) { + // tester can't handle this throwing bg_txn_too_old, so just don't call it unless we have already seen a + // read success + schedule(cont); + return; + } + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + auto results = std::make_shared>(); + execTransaction( + [begin, end, results](auto ctx) { + fdb::Future f = ctx->tx().summarizeBlobGranules(begin, end, -2 /*latest version*/, 1000).eraseType(); + ctx->continueAfter( + f, + [ctx, f, results]() { + *results = copyGranuleSummaryArray(f.get()); + ctx->done(); + }, + true); + }, + [this, begin, end, results, cont]() { + ASSERT(results->size() > 0); + ASSERT(results->front().keyRange.beginKey <= begin); + ASSERT(results->back().keyRange.endKey >= end); for (int i = 0; i < results->size(); i++) { - // no empty or inverted ranges - ASSERT((*results)[i].beginKey < (*results)[i].endKey); + // TODO: could do validation of subsequent calls and ensure snapshot version never decreases + ASSERT((*results)[i].keyRange.beginKey < (*results)[i].keyRange.endKey); + ASSERT((*results)[i].snapshotVersion <= (*results)[i].deltaVersion); + ASSERT((*results)[i].snapshotSize > 0); + ASSERT((*results)[i].deltaSize >= 0); } for (int i = 1; i < results->size(); i++) { // ranges contain entire requested key range - ASSERT((*results)[i].beginKey == (*results)[i - 1].endKey); + ASSERT((*results)[i].keyRange.beginKey == (*results)[i - 1].keyRange.endKey); } schedule(cont); - }); + }, + getTenant(tenantId)); + } + + void validateRanges(std::shared_ptr> results, + fdb::Key begin, + fdb::Key end, + bool shouldBeRanges) { + if (shouldBeRanges) { + ASSERT(results->size() > 0); + ASSERT(results->front().beginKey <= begin); + ASSERT(results->back().endKey >= end); + } + for (int i = 0; i < results->size(); i++) { + // no empty or inverted ranges + if ((*results)[i].beginKey >= (*results)[i].endKey) { + error(fmt::format("Empty/inverted range [{0} - {1}) for getBlobGranuleRanges({2} - {3})", + fdb::toCharsRef((*results)[i].beginKey), + fdb::toCharsRef((*results)[i].endKey), + fdb::toCharsRef(begin), + fdb::toCharsRef(end))); + } + ASSERT((*results)[i].beginKey < (*results)[i].endKey); + } + + for (int i = 1; i < results->size(); i++) { + // ranges contain entire requested key range + if ((*results)[i].beginKey != (*results)[i].endKey) { + error(fmt::format("Non-contiguous range [{0} - {1}) for getBlobGranuleRanges({2} - {3})", + fdb::toCharsRef((*results)[i].beginKey), + fdb::toCharsRef((*results)[i].endKey), + 
fdb::toCharsRef(begin), + fdb::toCharsRef(end))); + } + ASSERT((*results)[i].beginKey == (*results)[i - 1].endKey); + } + } + + void randomGetBlobRangesOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + auto results = std::make_shared>(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end, results](auto ctx) { + fdb::Future f = ctx->db().listBlobbifiedRanges(begin, end, 1000).eraseType(); + ctx->continueAfter(f, [ctx, f, results]() { + *results = copyKeyRangeArray(f.get()); + ctx->done(); + }); + }, + [this, begin, end, results, cont]() { + this->validateRanges(results, begin, end, seenReadSuccess); + schedule(cont); + }, + /* failOnError = */ false); + } + + void randomVerifyOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + + auto verifyVersion = std::make_shared(false); + // info("Verify op starting"); + + execOperation( + [begin, end, verifyVersion](auto ctx) { + fdb::Future f = ctx->db().verifyBlobRange(begin, end, -2 /* latest version*/).eraseType(); + ctx->continueAfter(f, [ctx, verifyVersion, f]() { + *verifyVersion = f.get(); + ctx->done(); + }); + }, + [this, begin, end, verifyVersion, cont]() { + if (*verifyVersion == -1) { + ASSERT(!seenReadSuccess); + } else { + if (!seenReadSuccess) { + info("BlobGranuleCorrectness::randomVerifyOp first success"); + } + seenReadSuccess = true; + } + // info(fmt::format("verify op done @ {}", *verifyVersion)); + schedule(cont); + }, + /* failOnError = */ false); } void randomOperation(TTaskFct cont) { - OpType txType = (store.size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST); + std::optional tenantId = randomTenant(); + + OpType txType = (stores[tenantId].size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST); while (std::count(excludedOpTypes.begin(), excludedOpTypes.end(), txType)) { txType = (OpType)Random::get().randomInt(0, OP_LAST); } + switch (txType) { case OP_INSERT: - randomInsertOp(cont); + randomInsertOp(cont, tenantId); break; case OP_CLEAR: - randomClearOp(cont); + randomClearOp(cont, tenantId); break; case OP_CLEAR_RANGE: - randomClearRangeOp(cont); + randomClearRangeOp(cont, tenantId); break; case OP_READ: - randomReadOp(cont); + randomReadOp(cont, tenantId); break; - case OP_GET_RANGES: - randomGetRangesOp(cont); + case OP_GET_GRANULES: + randomGetGranulesOp(cont, tenantId); + break; + case OP_SUMMARIZE: + randomSummarizeOp(cont, tenantId); + break; + case OP_GET_BLOB_RANGES: + randomGetBlobRangesOp(cont); + break; + case OP_VERIFY: + randomVerifyOp(cont); break; } } diff --git a/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp b/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp new file mode 100644 index 0000000000..b4bcaacdc6 --- /dev/null +++ b/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp @@ -0,0 +1,316 @@ +/* + * TesterBlobGranuleErrorsWorkload.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "TesterApiWorkload.h" +#include "TesterBlobGranuleUtil.h" +#include "TesterUtil.h" +#include +#include + +namespace FdbApiTester { + +class BlobGranuleErrorsWorkload : public ApiWorkload { +public: + BlobGranuleErrorsWorkload(const WorkloadConfig& config) : ApiWorkload(config) {} + +private: + enum OpType { + OP_READ_NO_MATERIALIZE, + OP_READ_FILE_LOAD_ERROR, + OP_READ_TOO_OLD, + OP_PURGE_UNALIGNED, + OP_BLOBBIFY_UNALIGNED, + OP_UNBLOBBIFY_UNALIGNED, + OP_CANCEL_GET_GRANULES, + OP_CANCEL_GET_RANGES, + OP_CANCEL_VERIFY, + OP_CANCEL_SUMMARIZE, + OP_CANCEL_BLOBBIFY, + OP_CANCEL_UNBLOBBIFY, + OP_CANCEL_PURGE, + OP_LAST = OP_CANCEL_PURGE + }; + + // could add summarize too old and verify too old as ops if desired but those are lower value + + // Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet + // FIXME: should still guarantee a read succeeds eventually somehow + bool seenReadSuccess = false; + + void doErrorOp(TTaskFct cont, + std::string basePathAddition, + bool doMaterialize, + int64_t readVersion, + fdb::native::fdb_error_t expectedError) { + fdb::Key begin = randomKeyName(); + fdb::Key end = begin; + // [K - K) empty range will succeed read because there is trivially nothing to do, so don't do it + while (end == begin) { + end = randomKeyName(); + } + if (begin > end) { + std::swap(begin, end); + } + + execTransaction( + [this, begin, end, basePathAddition, doMaterialize, readVersion, expectedError](auto ctx) { + ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE); + + TesterGranuleContext testerContext(ctx->getBGBasePath() + basePathAddition); + fdb::native::FDBReadBlobGranuleContext granuleContext = createGranuleContext(&testerContext); + granuleContext.debugNoMaterialize = !doMaterialize; + + fdb::Result res = + ctx->tx().readBlobGranules(begin, end, 0 /* beginVersion */, readVersion, granuleContext); + auto out = fdb::Result::KeyValueRefArray{}; + fdb::Error err = res.getKeyValueArrayNothrow(out); + + if (err.code() == error_code_success) { + error(fmt::format("Operation succeeded in error test!")); + } + ASSERT(err.code() != error_code_success); + if (err.code() != expectedError) { + info(fmt::format("incorrect error. 
Expected {}, Got {}", expectedError, err.code())); + if (err.code() == error_code_blob_granule_transaction_too_old) { + ASSERT(!seenReadSuccess); + ctx->done(); + } else { + ctx->onError(err); + } + } else { + if (err.code() != error_code_blob_granule_transaction_too_old) { + seenReadSuccess = true; + } + ctx->done(); + } + }, + [this, cont]() { schedule(cont); }); + } + + void randomOpReadNoMaterialize(TTaskFct cont) { + // ensure setting noMaterialize flag produces blob_granule_not_materialized + doErrorOp(cont, "", false, -2 /*latest read version */, error_code_blob_granule_not_materialized); + } + + void randomOpReadFileLoadError(TTaskFct cont) { + // point to a file path that doesn't exist by adding an extra suffix + doErrorOp(cont, "extrapath/", true, -2 /*latest read version */, error_code_blob_granule_file_load_error); + } + + void randomOpReadTooOld(TTaskFct cont) { + // read at a version (1) that should predate granule data + doErrorOp(cont, "", true, 1, error_code_blob_granule_transaction_too_old); + } + + void randomPurgeUnalignedOp(TTaskFct cont) { + // blobbify/unblobbify need to be aligned to blob range boundaries, so this should always fail + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [this, begin, end](auto ctx) { + fdb::Future f = ctx->db().purgeBlobGranules(begin, end, -2, false).eraseType(); + ctx->continueAfter( + f, + [this, ctx, f]() { + info(fmt::format("unaligned purge got {}", f.error().code())); + ASSERT(f.error().code() == error_code_unsupported_operation); + ctx->done(); + }, + true); + }, + [this, cont]() { schedule(cont); }); + } + + void randomBlobbifyUnalignedOp(bool blobbify, TTaskFct cont) { + // blobbify/unblobbify need to be aligned to blob range boundaries, so this should always return false + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + auto success = std::make_shared(false); + execOperation( + [begin, end, blobbify, success](auto ctx) { + fdb::Future f = blobbify ? 
ctx->db().blobbifyRange(begin, end).eraseType() + : ctx->db().unblobbifyRange(begin, end).eraseType(); + ctx->continueAfter( + f, + [ctx, f, success]() { + *success = f.get(); + ctx->done(); + }, + true); + }, + [this, cont, success]() { + ASSERT(!(*success)); + schedule(cont); + }); + } + + void randomCancelGetGranulesOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execTransaction( + [begin, end](auto ctx) { + fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelGetRangesOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().listBlobbifiedRanges(begin, end, 1000).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelVerifyOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().verifyBlobRange(begin, end, -2 /* latest version*/).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelSummarizeOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execTransaction( + [begin, end](auto ctx) { + fdb::Future f = ctx->tx().summarizeBlobGranules(begin, end, -2, 1000).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelBlobbifyOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().blobbifyRange(begin, end).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelUnblobbifyOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().unblobbifyRange(begin, end).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelPurgeOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().purgeBlobGranules(begin, end, -2, false).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomOperation(TTaskFct cont) override { + OpType txType = (OpType)Random::get().randomInt(0, OP_LAST); + switch (txType) { + case OP_READ_NO_MATERIALIZE: + randomOpReadNoMaterialize(cont); + break; + case OP_READ_FILE_LOAD_ERROR: + randomOpReadFileLoadError(cont); + break; + case OP_READ_TOO_OLD: + randomOpReadTooOld(cont); + break; + case OP_PURGE_UNALIGNED: + // gets the correct error but it doesn't propagate properly in the test + // randomPurgeUnalignedOp(cont); + break; + case OP_BLOBBIFY_UNALIGNED: + randomBlobbifyUnalignedOp(true, cont); + break; + case OP_UNBLOBBIFY_UNALIGNED: + randomBlobbifyUnalignedOp(false, cont); + break; + case OP_CANCEL_GET_GRANULES: + randomCancelGetGranulesOp(cont); + break; + case OP_CANCEL_GET_RANGES: + 
randomCancelGetRangesOp(cont); + break; + case OP_CANCEL_VERIFY: + randomCancelVerifyOp(cont); + break; + case OP_CANCEL_SUMMARIZE: + randomCancelSummarizeOp(cont); + break; + case OP_CANCEL_BLOBBIFY: + randomCancelBlobbifyOp(cont); + break; + case OP_CANCEL_UNBLOBBIFY: + randomCancelUnblobbifyOp(cont); + break; + case OP_CANCEL_PURGE: + randomCancelPurgeOp(cont); + break; + } + } +}; + +WorkloadFactory BlobGranuleErrorsWorkloadFactory("BlobGranuleErrors"); + +} // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterBlobGranuleUtil.cpp b/bindings/c/test/apitester/TesterBlobGranuleUtil.cpp new file mode 100644 index 0000000000..a908a9c0bf --- /dev/null +++ b/bindings/c/test/apitester/TesterBlobGranuleUtil.cpp @@ -0,0 +1,80 @@ +/* + * TesterBlobGranuleUtil.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TesterBlobGranuleUtil.h" +#include "TesterUtil.h" +#include + +namespace FdbApiTester { + +// FIXME: avoid duplicating this between files! +static int64_t granule_start_load(const char* filename, + int filenameLength, + int64_t offset, + int64_t length, + int64_t fullFileLength, + void* context) { + + TesterGranuleContext* ctx = (TesterGranuleContext*)context; + int64_t loadId = ctx->nextId++; + + uint8_t* buffer = new uint8_t[length]; + std::ifstream fin(ctx->basePath + std::string(filename, filenameLength), std::ios::in | std::ios::binary); + if (fin.fail()) { + delete[] buffer; + buffer = nullptr; + } else { + fin.seekg(offset); + fin.read((char*)buffer, length); + } + + ctx->loadsInProgress.insert({ loadId, buffer }); + + return loadId; +} + +static uint8_t* granule_get_load(int64_t loadId, void* context) { + TesterGranuleContext* ctx = (TesterGranuleContext*)context; + return ctx->loadsInProgress.at(loadId); +} + +static void granule_free_load(int64_t loadId, void* context) { + TesterGranuleContext* ctx = (TesterGranuleContext*)context; + auto it = ctx->loadsInProgress.find(loadId); + uint8_t* dataToFree = it->second; + delete[] dataToFree; + + ctx->loadsInProgress.erase(it); +} + +fdb::native::FDBReadBlobGranuleContext createGranuleContext(const TesterGranuleContext* testerContext) { + fdb::native::FDBReadBlobGranuleContext granuleContext; + + granuleContext.userContext = (void*)testerContext; + granuleContext.debugNoMaterialize = false; + granuleContext.granuleParallelism = 1 + Random::get().randomInt(0, 3); + granuleContext.start_load_f = &granule_start_load; + granuleContext.get_load_f = &granule_get_load; + granuleContext.free_load_f = &granule_free_load; + + return granuleContext; +} + +} // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterBlobGranuleUtil.h b/bindings/c/test/apitester/TesterBlobGranuleUtil.h new file mode 100644 index 0000000000..7b4b0dba81 --- /dev/null +++ b/bindings/c/test/apitester/TesterBlobGranuleUtil.h @@ -0,0 +1,49 @@ 
+/* + * TesterBlobGranuleUtil.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef APITESTER_BLOBGRANULE_UTIL_H +#define APITESTER_BLOBGRANULE_UTIL_H +#include "TesterUtil.h" +#include "test/fdb_api.hpp" +#include + +namespace FdbApiTester { + +class TesterGranuleContext { +public: + std::unordered_map loadsInProgress; + std::string basePath; + int64_t nextId; + + TesterGranuleContext(const std::string& basePath) : basePath(basePath), nextId(0) {} + + ~TesterGranuleContext() { + // this should now never happen with proper memory management + ASSERT(loadsInProgress.empty()); + } +}; + +fdb::native::FDBReadBlobGranuleContext createGranuleContext(const TesterGranuleContext* testerContext); + +} // namespace FdbApiTester + +#endif diff --git a/bindings/c/test/apitester/TesterCancelTransactionWorkload.cpp b/bindings/c/test/apitester/TesterCancelTransactionWorkload.cpp index b569cdb35f..b4cd205143 100644 --- a/bindings/c/test/apitester/TesterCancelTransactionWorkload.cpp +++ b/bindings/c/test/apitester/TesterCancelTransactionWorkload.cpp @@ -31,11 +31,11 @@ private: enum OpType { OP_CANCEL_GET, OP_CANCEL_AFTER_FIRST_GET, OP_LAST = OP_CANCEL_AFTER_FIRST_GET }; // Start multiple concurrent gets and cancel the transaction - void randomCancelGetTx(TTaskFct cont) { + void randomCancelGetTx(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto keys = std::make_shared>(); for (int i = 0; i < numKeys; i++) { - keys->push_back(randomKey(readExistingKeysRatio)); + keys->push_back(randomKey(readExistingKeysRatio, tenantId)); } execTransaction( [keys](auto ctx) { @@ -45,25 +45,26 @@ private: } ctx->done(); }, - [this, cont]() { schedule(cont); }); + [this, cont]() { schedule(cont); }, + getTenant(tenantId)); } // Start multiple concurrent gets and cancel the transaction after the first get returns - void randomCancelAfterFirstResTx(TTaskFct cont) { + void randomCancelAfterFirstResTx(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto keys = std::make_shared>(); for (int i = 0; i < numKeys; i++) { - keys->push_back(randomKey(readExistingKeysRatio)); + keys->push_back(randomKey(readExistingKeysRatio, tenantId)); } execTransaction( - [this, keys](auto ctx) { + [this, keys, tenantId](auto ctx) { std::vector futures; for (const auto& key : *keys) { futures.push_back(ctx->tx().get(key, false).eraseType()); } for (int i = 0; i < keys->size(); i++) { fdb::Future f = futures[i]; - auto expectedVal = store.get((*keys)[i]); + auto expectedVal = stores[tenantId].get((*keys)[i]); ctx->continueAfter(f, [expectedVal, f, this, ctx]() { auto val = f.get(); if (expectedVal != val) { @@ -75,17 +76,20 @@ private: }); } }, - [this, cont]() { schedule(cont); }); + [this, cont]() { schedule(cont); }, + getTenant(tenantId)); } void 
randomOperation(TTaskFct cont) override { + std::optional tenantId = randomTenant(); OpType txType = (OpType)Random::get().randomInt(0, OP_LAST); + switch (txType) { case OP_CANCEL_GET: - randomCancelGetTx(cont); + randomCancelGetTx(cont, tenantId); break; case OP_CANCEL_AFTER_FIRST_GET: - randomCancelAfterFirstResTx(cont); + randomCancelAfterFirstResTx(cont, tenantId); break; } } diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index 9219bb7056..4486abdf97 100644 --- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -41,11 +41,11 @@ private: OP_LAST = OP_COMMIT_READ }; - void randomCommitReadOp(TTaskFct cont) { + void randomCommitReadOp(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto kvPairs = std::make_shared>(); for (int i = 0; i < numKeys; i++) { - kvPairs->push_back(fdb::KeyValue{ randomKey(readExistingKeysRatio), randomValue() }); + kvPairs->push_back(fdb::KeyValue{ randomKey(readExistingKeysRatio, tenantId), randomValue() }); } execTransaction( [kvPairs](auto ctx) { @@ -54,9 +54,9 @@ private: } ctx->commit(); }, - [this, kvPairs, cont]() { + [this, kvPairs, cont, tenantId]() { for (const fdb::KeyValue& kv : *kvPairs) { - store.set(kv.key, kv.value); + stores[tenantId].set(kv.key, kv.value); } auto results = std::make_shared>>(); execTransaction( @@ -78,10 +78,10 @@ private: ctx->done(); }); }, - [this, kvPairs, results, cont]() { + [this, kvPairs, results, cont, tenantId]() { ASSERT(results->size() == kvPairs->size()); for (int i = 0; i < kvPairs->size(); i++) { - auto expected = store.get((*kvPairs)[i].key); + auto expected = stores[tenantId].get((*kvPairs)[i].key); auto actual = (*results)[i]; if (actual != expected) { error( @@ -93,16 +93,18 @@ private: } } schedule(cont); - }); - }); + }, + getTenant(tenantId)); + }, + getTenant(tenantId)); } - void randomGetOp(TTaskFct cont) { + void randomGetOp(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto keys = std::make_shared>(); auto results = std::make_shared>>(); for (int i = 0; i < numKeys; i++) { - keys->push_back(randomKey(readExistingKeysRatio)); + keys->push_back(randomKey(readExistingKeysRatio, tenantId)); } execTransaction( [keys, results](auto ctx) { @@ -119,10 +121,10 @@ private: ctx->done(); }); }, - [this, keys, results, cont]() { + [this, keys, results, cont, tenantId]() { ASSERT(results->size() == keys->size()); for (int i = 0; i < keys->size(); i++) { - auto expected = store.get((*keys)[i]); + auto expected = stores[tenantId].get((*keys)[i]); if ((*results)[i] != expected) { error(fmt::format("randomGetOp mismatch. 
key: {} expected: {:.80} actual: {:.80}", fdb::toCharsRef((*keys)[i]), @@ -131,16 +133,17 @@ private: } } schedule(cont); - }); + }, + getTenant(tenantId)); } - void randomGetKeyOp(TTaskFct cont) { + void randomGetKeyOp(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto keysWithSelectors = std::make_shared>>(); auto results = std::make_shared>(); keysWithSelectors->reserve(numKeys); for (int i = 0; i < numKeys; i++) { - auto key = randomKey(readExistingKeysRatio); + auto key = randomKey(readExistingKeysRatio, tenantId); fdb::KeySelector selector; selector.keyLength = key.size(); selector.orEqual = Random::get().randomBool(0.5); @@ -169,20 +172,20 @@ private: ctx->done(); }); }, - [this, keysWithSelectors, results, cont]() { + [this, keysWithSelectors, results, cont, tenantId]() { ASSERT(results->size() == keysWithSelectors->size()); for (int i = 0; i < keysWithSelectors->size(); i++) { auto const& key = (*keysWithSelectors)[i].first; auto const& selector = (*keysWithSelectors)[i].second; - auto expected = store.getKey(key, selector.orEqual, selector.offset); + auto expected = stores[tenantId].getKey(key, selector.orEqual, selector.offset); auto actual = (*results)[i]; // Local store only contains data for the current client, while fdb contains data from multiple // clients. If getKey returned a key outside of the range for the current client, adjust the result // to match what would be expected in the local store. if (actual.substr(0, keyPrefix.size()) < keyPrefix) { - actual = store.startKey(); + actual = stores[tenantId].startKey(); } else if ((*results)[i].substr(0, keyPrefix.size()) > keyPrefix) { - actual = store.endKey(); + actual = stores[tenantId].endKey(); } if (actual != expected) { error(fmt::format("randomGetKeyOp mismatch. key: {}, orEqual: {}, offset: {}, expected: {} " @@ -195,37 +198,38 @@ private: } } schedule(cont); - }); + }, + getTenant(tenantId)); } void getRangeLoop(std::shared_ptr ctx, fdb::KeySelector begin, - fdb::KeySelector end, + fdb::Key endKey, std::shared_ptr> results) { auto f = ctx->tx().getRange(begin, - end, + fdb::key_select::firstGreaterOrEqual(endKey), 0 /*limit*/, 0 /*target_bytes*/, FDB_STREAMING_MODE_WANT_ALL, 0 /*iteration*/, false /*snapshot*/, false /*reverse*/); - ctx->continueAfter(f, [this, ctx, f, end, results]() { + ctx->continueAfter(f, [this, ctx, f, endKey, results]() { auto out = copyKeyValueArray(f.get()); results->insert(results->end(), out.first.begin(), out.first.end()); const bool more = out.second; if (more) { // Fetch the remaining results. - getRangeLoop(ctx, fdb::key_select::firstGreaterThan(results->back().key), end, results); + getRangeLoop(ctx, fdb::key_select::firstGreaterThan(results->back().key), endKey, results); } else { ctx->done(); } }); } - void randomGetRangeOp(TTaskFct cont) { - auto begin = randomKey(readExistingKeysRatio); - auto end = randomKey(readExistingKeysRatio); + void randomGetRangeOp(TTaskFct cont, std::optional tenantId) { + auto begin = randomKey(readExistingKeysRatio, tenantId); + auto end = randomKey(readExistingKeysRatio, tenantId); auto results = std::make_shared>(); execTransaction( @@ -233,13 +237,10 @@ private: // Clear the results vector, in case the transaction is retried. 
results->clear(); - getRangeLoop(ctx, - fdb::key_select::firstGreaterOrEqual(begin), - fdb::key_select::firstGreaterOrEqual(end), - results); + getRangeLoop(ctx, fdb::key_select::firstGreaterOrEqual(begin), end, results); }, - [this, begin, end, results, cont]() { - auto expected = store.getRange(begin, end, results->size() + 10, false); + [this, begin, end, results, cont, tenantId]() { + auto expected = stores[tenantId].getRange(begin, end, results->size() + 10, false); if (results->size() != expected.size()) { error(fmt::format("randomGetRangeOp mismatch. expected {} keys, actual {} keys", expected.size(), @@ -260,32 +261,35 @@ private: } } schedule(cont); - }); + }, + getTenant(tenantId)); } void randomOperation(TTaskFct cont) { - OpType txType = (store.size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST); + std::optional tenantId = randomTenant(); + OpType txType = (stores[tenantId].size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST); + switch (txType) { case OP_INSERT: - randomInsertOp(cont); + randomInsertOp(cont, tenantId); break; case OP_GET: - randomGetOp(cont); + randomGetOp(cont, tenantId); break; case OP_GET_KEY: - randomGetKeyOp(cont); + randomGetKeyOp(cont, tenantId); break; case OP_CLEAR: - randomClearOp(cont); + randomClearOp(cont, tenantId); break; case OP_GET_RANGE: - randomGetRangeOp(cont); + randomGetRangeOp(cont, tenantId); break; case OP_CLEAR_RANGE: - randomClearRangeOp(cont); + randomClearRangeOp(cont, tenantId); break; case OP_COMMIT_READ: - randomCommitReadOp(cont); + randomCommitReadOp(cont, tenantId); break; } } diff --git a/bindings/c/test/apitester/TesterExampleWorkload.cpp b/bindings/c/test/apitester/TesterExampleWorkload.cpp index 3765dc50fb..882fdc62e4 100644 --- a/bindings/c/test/apitester/TesterExampleWorkload.cpp +++ b/bindings/c/test/apitester/TesterExampleWorkload.cpp @@ -35,8 +35,8 @@ public: void start() override { setAndGet(NO_OP_TASK); } void setAndGet(TTaskFct cont) { - fdb::Key key = keyPrefix + random.randomStringLowerCase(10, 100); - fdb::Value value = random.randomStringLowerCase(10, 1000); + fdb::Key key = keyPrefix + random.randomByteStringLowerCase(10, 100); + fdb::Value value = random.randomByteStringLowerCase(10, 1000); execTransaction( [key, value](auto ctx) { ctx->tx().set(key, value); diff --git a/bindings/c/test/apitester/TesterOptions.h b/bindings/c/test/apitester/TesterOptions.h index 1160b696b0..7c7d0fc948 100644 --- a/bindings/c/test/apitester/TesterOptions.h +++ b/bindings/c/test/apitester/TesterOptions.h @@ -49,6 +49,7 @@ public: int numClientThreads; int numDatabases; int numClients; + int numTenants = -1; int statsIntervalMs = 0; std::vector> knobs; TestSpec testSpec; diff --git a/bindings/c/test/apitester/TesterTestSpec.cpp b/bindings/c/test/apitester/TesterTestSpec.cpp index 86a89c9116..1048aab493 100644 --- a/bindings/c/test/apitester/TesterTestSpec.cpp +++ b/bindings/c/test/apitester/TesterTestSpec.cpp @@ -65,6 +65,10 @@ std::unordered_mapdatabasePerTransaction = (value == "true"); } }, + { "tamperClusterFile", + [](const std::string& value, TestSpec* spec) { // + spec->tamperClusterFile = (value == "true"); + } }, { "minFdbThreads", [](const std::string& value, TestSpec* spec) { // processIntOption(value, "minFdbThreads", spec->minFdbThreads, 1, 1000); @@ -96,6 +100,18 @@ std::unordered_mapmaxClients, 1, 1000); + } }, + { "disableClientBypass", + [](const std::string& value, TestSpec* spec) { // + spec->disableClientBypass = (value == "true"); + } }, + { "minTenants", + [](const 
std::string& value, TestSpec* spec) { // + processIntOption(value, "minTenants", spec->minTenants, 1, 1000); + } }, + { "maxTenants", + [](const std::string& value, TestSpec* spec) { // + processIntOption(value, "maxTenants", spec->maxTenants, 1, 1000); } } }; diff --git a/bindings/c/test/apitester/TesterTestSpec.h b/bindings/c/test/apitester/TesterTestSpec.h index be7a573033..c0e9c0caf1 100644 --- a/bindings/c/test/apitester/TesterTestSpec.h +++ b/bindings/c/test/apitester/TesterTestSpec.h @@ -58,6 +58,9 @@ struct TestSpec { // Execute each transaction in a separate database instance bool databasePerTransaction = false; + // Test tampering the cluster file + bool tamperClusterFile = false; + // Size of the FDB client thread pool (a random number in the [min,max] range) int minFdbThreads = 1; int maxFdbThreads = 1; @@ -75,6 +78,13 @@ struct TestSpec { int minClients = 1; int maxClients = 10; + // Disable the ability to bypass the MVC API, for + // cases when there are no external clients + bool disableClientBypass = false; + // Number of tenants (a random number in the [min,max] range) + int minTenants = 0; + int maxTenants = 0; + // List of workloads with their options std::vector workloads; }; diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index 221774854d..547f6b4965 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -23,25 +23,23 @@ #include "foundationdb/fdb_c_types.h" #include "test/apitester/TesterScheduler.h" #include "test/fdb_api.hpp" +#include #include #include +#include #include #include #include #include #include #include +#include namespace FdbApiTester { constexpr int LONG_WAIT_TIME_US = 2000000; constexpr int LARGE_NUMBER_OF_RETRIES = 10; -void TransactionActorBase::complete(fdb::Error err) { - error = err; - context = {}; -} - void ITransactionContext::continueAfterAll(std::vector futures, TTaskFct cont) { auto counter = std::make_shared>(futures.size()); auto errorCode = std::make_shared>(fdb::Error::success()); @@ -72,20 +70,44 @@ void ITransactionContext::continueAfterAll(std::vector futures, TTa */ class TransactionContextBase : public ITransactionContext { public: - TransactionContextBase(fdb::Transaction tx, - std::shared_ptr txActor, - TTaskFct cont, + TransactionContextBase(ITransactionExecutor* executor, + TOpStartFct startFct, + TOpContFct cont, IScheduler* scheduler, int retryLimit, - std::string bgBasePath) - : fdbTx(tx), txActor(txActor), contAfterDone(cont), scheduler(scheduler), retryLimit(retryLimit), - txState(TxState::IN_PROGRESS), commitCalled(false), bgBasePath(bgBasePath) {} + std::string bgBasePath, + std::optional tenantName, + bool transactional) + : executor(executor), startFct(startFct), contAfterDone(cont), scheduler(scheduler), retryLimit(retryLimit), + txState(TxState::IN_PROGRESS), commitCalled(false), bgBasePath(bgBasePath), tenantName(tenantName), + transactional(transactional) { + databaseCreateErrorInjected = executor->getOptions().injectDatabaseCreateErrors && + Random::get().randomBool(executor->getOptions().databaseCreateErrorRatio); + if (databaseCreateErrorInjected) { + fdbDb = fdb::Database(executor->getClusterFileForErrorInjection()); + } else { + fdbDb = executor->selectDatabase(); + } + + if (transactional) { + if (tenantName) { + fdb::Tenant tenant = fdbDb.openTenant(*tenantName); + fdbTx = tenant.createTransaction(); + } else { + fdbTx = fdbDb.createTransaction(); + } 
+ } + } + + virtual ~TransactionContextBase() { ASSERT(txState == TxState::DONE); } // A state machine: // IN_PROGRESS -> (ON_ERROR -> IN_PROGRESS)* [-> ON_ERROR] -> DONE enum class TxState { IN_PROGRESS, ON_ERROR, DONE }; - fdb::Transaction tx() override { return fdbTx; } + fdb::Database db() override { return fdbDb.atomic_load(); } + + fdb::Transaction tx() override { return fdbTx.atomic_load(); } // Set a continuation to be executed when a future gets ready void continueAfter(fdb::Future f, TTaskFct cont, bool retryOnError) override { @@ -94,6 +116,7 @@ public: // Complete the transaction with a commit void commit() override { + ASSERT(transactional); std::unique_lock lock(mutex); if (txState != TxState::IN_PROGRESS) { return; @@ -114,31 +137,79 @@ } txState = TxState::DONE; lock.unlock(); + + // No need for lock from here on, because only one thread + // can enter DONE state and handle it + if (retriedErrors.size() >= LARGE_NUMBER_OF_RETRIES) { fmt::print("Transaction succeeded after {} retries on errors: {}\n", retriedErrors.size(), fmt::join(retriedErrorCodes(), ", ")); } - // cancel transaction so that any pending operations on it - // fail gracefully - fdbTx.cancel(); - txActor->complete(fdb::Error::success()); + + if (transactional) { + // cancel transaction so that any pending operations on it + // fail gracefully + fdbTx.cancel(); + } cleanUp(); - contAfterDone(); + ASSERT(txState == TxState::DONE); + contAfterDone(fdb::Error::success()); } std::string getBGBasePath() override { return bgBasePath; } + virtual void onError(fdb::Error err) override { + std::unique_lock lock(mutex); + if (txState != TxState::IN_PROGRESS) { + // Ignore further errors if the transaction is in the error handling mode or completed + return; + } + txState = TxState::ON_ERROR; + lock.unlock(); + + // No need to hold the lock from here on, because ON_ERROR state is handled sequentially, and + // other callbacks are simply ignored while it stays in this state + + if (!canRetry(err)) { + return; + } + + ASSERT(!onErrorFuture); + + if (databaseCreateErrorInjected && canBeInjectedDatabaseCreateError(err.code())) { + // Failed to create a database because of failure injection + // Restart by recreating the transaction in a valid database + recreateAndRestartTransaction(); + } else if (transactional) { + onErrorArg = err; + onErrorFuture = tx().onError(err); + handleOnErrorFuture(); + } else if (err.retryable()) { + restartTransaction(); + } else { + transactionFailed(err); + } + } + protected: virtual void doContinueAfter(fdb::Future f, TTaskFct cont, bool retryOnError) = 0; + virtual void handleOnErrorFuture() = 0; + // Clean up transaction state after completing the transaction // Note that the object may live longer, because it is referenced // by not yet triggered callbacks - virtual void cleanUp() { + void cleanUp() { ASSERT(txState == TxState::DONE); ASSERT(!onErrorFuture); - txActor = {}; + cancelPendingFutures(); + } + + virtual void cancelPendingFutures() {} + + bool canBeInjectedDatabaseCreateError(fdb::Error::CodeType errCode) { + return errCode == error_code_no_cluster_file_found || errCode == error_code_connection_string_invalid; } // Complete the transaction with an (unretriable) error @@ -150,9 +221,12 @@ } txState = TxState::DONE; lock.unlock(); - txActor->complete(err); + + // No need for lock from here on, because only one thread + // can enter DONE state and handle it + cleanUp(); - contAfterDone(); + contAfterDone(err); } // Handle result of a transaction onError call 
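+ // On success, handleOnErrorResult() restarts the transaction via restartTransaction(); + // if the onError future itself returned an error, the transaction is completed as failed via transactionFailed().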
@@ -163,14 +237,37 @@ protected: if (err) { transactionFailed(err); } else { - std::unique_lock lock(mutex); - txState = TxState::IN_PROGRESS; - commitCalled = false; - lock.unlock(); - txActor->start(); + restartTransaction(); } } + void restartTransaction() { + ASSERT(txState == TxState::ON_ERROR); + cancelPendingFutures(); + std::unique_lock lock(mutex); + txState = TxState::IN_PROGRESS; + commitCalled = false; + lock.unlock(); + startFct(shared_from_this()); + } + + void recreateAndRestartTransaction() { + auto thisRef = std::static_pointer_cast(shared_from_this()); + scheduler->schedule([thisRef]() { + fdb::Database db = thisRef->executor->selectDatabase(); + thisRef->fdbDb.atomic_store(db); + if (thisRef->transactional) { + if (thisRef->tenantName) { + fdb::Tenant tenant = db.openTenant(*thisRef->tenantName); + thisRef->fdbTx.atomic_store(tenant.createTransaction()); + } else { + thisRef->fdbTx.atomic_store(db.createTransaction()); + } + } + thisRef->restartTransaction(); + }); + } + // Checks if a transaction can be retried. Fails the transaction if the check fails bool canRetry(fdb::Error lastErr) { ASSERT(txState == TxState::ON_ERROR); @@ -196,44 +293,77 @@ protected: return retriedErrorCodes; } + // Pointer to the transaction executor interface + // Set in contructor, stays immutable + ITransactionExecutor* const executor; + + // FDB database + // Provides a thread safe interface by itself (no need for mutex) + fdb::Database fdbDb; + // FDB transaction + // Provides a thread safe interface by itself (no need for mutex) fdb::Transaction fdbTx; - // Actor implementing the transaction worklflow - std::shared_ptr txActor; + // The function implementing the starting point of the transaction + // Set in constructor and reset on cleanup (no need for mutex) + TOpStartFct startFct; // Mutex protecting access to shared mutable state + // Only the state that is accessible unter IN_PROGRESS state + // must be protected by mutex std::mutex mutex; // Continuation to be called after completion of the transaction - TTaskFct contAfterDone; + // Set in contructor, stays immutable + const TOpContFct contAfterDone; // Reference to the scheduler - IScheduler* scheduler; + // Set in contructor, stays immutable + // Cannot be accessed in DONE state, workloads can be completed and the scheduler deleted + IScheduler* const scheduler; // Retry limit - int retryLimit; + // Set in contructor, stays immutable + const int retryLimit; // Transaction execution state + // Must be accessed under mutex TxState txState; - // onError future used in ON_ERROR state + // onError future + // used only in ON_ERROR state (no need for mutex) fdb::Future onErrorFuture; // The error code on which onError was called + // used only in ON_ERROR state (no need for mutex) fdb::Error onErrorArg; // The time point of calling onError + // used only in ON_ERROR state (no need for mutex) TimePoint onErrorCallTimePoint; // Transaction is committed or being committed + // Must be accessed under mutex bool commitCalled; // A history of errors on which the transaction was retried + // used only in ON_ERROR and DONE states (no need for mutex) std::vector retriedErrors; // blob granule base path - std::string bgBasePath; + // Set in contructor, stays immutable + const std::string bgBasePath; + + // Indicates if the database error was injected + // Accessed on initialization and in ON_ERROR state only (no need for mutex) + bool databaseCreateErrorInjected; + + // The tenant that we will run this transaction in + const std::optional 
tenantName; + + // Specifies whether the operation is transactional + const bool transactional; }; /** @@ -241,13 +371,16 @@ protected: */ class BlockingTransactionContext : public TransactionContextBase { public: - BlockingTransactionContext(fdb::Transaction tx, - std::shared_ptr txActor, - TTaskFct cont, + BlockingTransactionContext(ITransactionExecutor* executor, + TOpStartFct startFct, + TOpContFct cont, IScheduler* scheduler, int retryLimit, - std::string bgBasePath) - : TransactionContextBase(tx, txActor, cont, scheduler, retryLimit, bgBasePath) {} + std::string bgBasePath, + std::optional tenantName, + bool transactional) + : TransactionContextBase(executor, startFct, cont, scheduler, retryLimit, bgBasePath, tenantName, transactional) { + } protected: void doContinueAfter(fdb::Future f, TTaskFct cont, bool retryOnError) override { @@ -288,22 +421,8 @@ protected: onError(err); } - virtual void onError(fdb::Error err) override { - std::unique_lock lock(mutex); - if (txState != TxState::IN_PROGRESS) { - // Ignore further errors, if the transaction is in the error handing mode or completed - return; - } - txState = TxState::ON_ERROR; - lock.unlock(); - - if (!canRetry(err)) { - return; - } - - ASSERT(!onErrorFuture); - onErrorFuture = fdbTx.onError(err); - onErrorArg = err; + virtual void handleOnErrorFuture() override { + ASSERT(txState == TxState::ON_ERROR); auto start = timeNow(); fdb::Error err2 = onErrorFuture.blockUntilReady(); @@ -330,13 +449,16 @@ protected: */ class AsyncTransactionContext : public TransactionContextBase { public: - AsyncTransactionContext(fdb::Transaction tx, - std::shared_ptr txActor, - TTaskFct cont, + AsyncTransactionContext(ITransactionExecutor* executor, + TOpStartFct startFct, + TOpContFct cont, IScheduler* scheduler, int retryLimit, - std::string bgBasePath) - : TransactionContextBase(tx, txActor, cont, scheduler, retryLimit, bgBasePath) {} + std::string bgBasePath, + std::optional tenantName, + bool transactional) + : TransactionContextBase(executor, startFct, cont, scheduler, retryLimit, bgBasePath, tenantName, transactional) { + } protected: void doContinueAfter(fdb::Future f, TTaskFct cont, bool retryOnError) override { @@ -344,7 +466,7 @@ protected: if (txState != TxState::IN_PROGRESS) { return; } - callbackMap[f] = CallbackInfo{ f, cont, shared_from_this(), retryOnError, timeNow() }; + callbackMap[f] = CallbackInfo{ f, cont, shared_from_this(), retryOnError, timeNow(), false }; lock.unlock(); try { f.then([this](fdb::Future f) { futureReadyCallback(f, this); }); @@ -383,7 +505,6 @@ protected: if (txState != TxState::IN_PROGRESS) { return; } - lock.unlock(); fdb::Error err = f.error(); auto waitTimeUs = timeElapsedInUs(cbInfo.startTime, endTime); if (waitTimeUs > LONG_WAIT_TIME_US) { @@ -392,32 +513,23 @@ protected: err.code(), err.what()); } - if (err.code() == error_code_transaction_cancelled) { + if (err.code() == error_code_transaction_cancelled || cbInfo.cancelled) { return; } if (err.code() == error_code_success || !cbInfo.retryOnError) { scheduler->schedule(cbInfo.cont); return; } + // We keep lock until here to prevent transitions from the IN_PROGRESS state + // which could possibly lead to completion of the workload and destruction + // of the scheduler + lock.unlock(); onError(err); } - virtual void onError(fdb::Error err) override { - std::unique_lock lock(mutex); - if (txState != TxState::IN_PROGRESS) { - // Ignore further errors, if the transaction is in the error handing mode or completed - return; - } - txState = TxState::ON_ERROR; - 
lock.unlock(); + virtual void handleOnErrorFuture() override { + ASSERT(txState == TxState::ON_ERROR); - if (!canRetry(err)) { - return; - } - - ASSERT(!onErrorFuture); - onErrorArg = err; - onErrorFuture = tx().onError(err); onErrorCallTimePoint = timeNow(); onErrorThisRef = std::static_pointer_cast(shared_from_this()); try { @@ -457,17 +569,17 @@ protected: scheduler->schedule([thisRef]() { thisRef->handleOnErrorResult(); }); } - void cleanUp() override { - TransactionContextBase::cleanUp(); - + void cancelPendingFutures() override { // Cancel all pending operations // Note that the callbacks of the cancelled futures will still be called std::unique_lock lock(mutex); std::vector futures; for (auto& iter : callbackMap) { + iter.second.cancelled = true; futures.push_back(iter.second.future); } lock.unlock(); + for (auto& f : futures) { f.cancel(); } @@ -487,12 +599,16 @@ protected: std::shared_ptr thisRef; bool retryOnError; TimePoint startTime; + bool cancelled; }; // Map for keeping track of future waits and holding necessary object references + // It can be accessed at any time when callbacks are triggered, so it mus always + // be mutex protected std::unordered_map callbackMap; // Holding reference to this for onError future C callback + // Accessed only in ON_ERROR state (no need for mutex) std::shared_ptr onErrorThisRef; }; @@ -503,30 +619,98 @@ class TransactionExecutorBase : public ITransactionExecutor { public: TransactionExecutorBase(const TransactionExecutorOptions& options) : options(options), scheduler(nullptr) {} + ~TransactionExecutorBase() { + if (tamperClusterFileThread.joinable()) { + tamperClusterFileThread.join(); + } + } + void init(IScheduler* scheduler, const char* clusterFile, const std::string& bgBasePath) override { this->scheduler = scheduler; this->clusterFile = clusterFile; this->bgBasePath = bgBasePath; + + ASSERT(!options.tmpDir.empty()); + emptyClusterFile.create(options.tmpDir, "fdbempty.cluster"); + invalidClusterFile.create(options.tmpDir, "fdbinvalid.cluster"); + invalidClusterFile.write(Random().get().randomStringLowerCase(1, 100)); + + emptyListClusterFile.create(options.tmpDir, "fdbemptylist.cluster"); + emptyListClusterFile.write(fmt::format("{}:{}@", + Random().get().randomStringLowerCase(3, 8), + Random().get().randomStringLowerCase(1, 100))); + + if (options.tamperClusterFile) { + tamperedClusterFile.create(options.tmpDir, "fdb.cluster"); + originalClusterFile = clusterFile; + this->clusterFile = tamperedClusterFile.getFileName(); + + // begin with a valid cluster file, but with non existing address + tamperedClusterFile.write(fmt::format("{}:{}@192.168.{}.{}:{}", + Random().get().randomStringLowerCase(3, 8), + Random().get().randomStringLowerCase(1, 100), + Random().get().randomInt(1, 254), + Random().get().randomInt(1, 254), + Random().get().randomInt(2000, 10000))); + + tamperClusterFileThread = std::thread([this]() { + std::this_thread::sleep_for(std::chrono::seconds(2)); + // now write an invalid connection string + tamperedClusterFile.write(fmt::format("{}:{}@", + Random().get().randomStringLowerCase(3, 8), + Random().get().randomStringLowerCase(1, 100))); + std::this_thread::sleep_for(std::chrono::seconds(2)); + // finally use correct cluster file contents + std::filesystem::copy_file(std::filesystem::path(originalClusterFile), + std::filesystem::path(tamperedClusterFile.getFileName()), + std::filesystem::copy_options::overwrite_existing); + }); + } } -protected: - // Execute the transaction on the given database instance - void 
executeOnDatabase(fdb::Database db, std::shared_ptr txActor, TTaskFct cont) { + const TransactionExecutorOptions& getOptions() override { return options; } + + void execute(TOpStartFct startFct, + TOpContFct cont, + std::optional tenantName, + bool transactional) override { try { - fdb::Transaction tx = db.createTransaction(); std::shared_ptr ctx; if (options.blockOnFutures) { - ctx = std::make_shared( - tx, txActor, cont, scheduler, options.transactionRetryLimit, bgBasePath); + ctx = std::make_shared(this, + startFct, + cont, + scheduler, + options.transactionRetryLimit, + bgBasePath, + tenantName, + transactional); } else { - ctx = std::make_shared( - tx, txActor, cont, scheduler, options.transactionRetryLimit, bgBasePath); + ctx = std::make_shared(this, + startFct, + cont, + scheduler, + options.transactionRetryLimit, + bgBasePath, + tenantName, + transactional); } - txActor->init(ctx); - txActor->start(); + startFct(ctx); } catch (...) { - txActor->complete(fdb::Error(error_code_operation_failed)); - cont(); + cont(fdb::Error(error_code_operation_failed)); + } + } + + std::string getClusterFileForErrorInjection() override { + switch (Random::get().randomInt(0, 3)) { + case 0: + return fmt::format("{}{}", "not-existing-file", Random::get().randomStringLowerCase(0, 2)); + case 1: + return emptyClusterFile.getFileName(); + case 2: + return invalidClusterFile.getFileName(); + default: // case 3 + return emptyListClusterFile.getFileName(); } } @@ -535,6 +719,12 @@ protected: std::string bgBasePath; std::string clusterFile; IScheduler* scheduler; + TmpFile emptyClusterFile; + TmpFile invalidClusterFile; + TmpFile emptyListClusterFile; + TmpFile tamperedClusterFile; + std::thread tamperClusterFileThread; + std::string originalClusterFile; }; /** @@ -549,19 +739,19 @@ public: void init(IScheduler* scheduler, const char* clusterFile, const std::string& bgBasePath) override { TransactionExecutorBase::init(scheduler, clusterFile, bgBasePath); for (int i = 0; i < options.numDatabases; i++) { - fdb::Database db(clusterFile); + fdb::Database db(this->clusterFile); databases.push_back(db); } } - void execute(std::shared_ptr txActor, TTaskFct cont) override { + fdb::Database selectDatabase() override { int idx = Random::get().randomInt(0, options.numDatabases - 1); - executeOnDatabase(databases[idx], txActor, cont); + return databases[idx]; } +private: void release() { databases.clear(); } -private: std::vector databases; }; @@ -572,10 +762,7 @@ class DBPerTransactionExecutor : public TransactionExecutorBase { public: DBPerTransactionExecutor(const TransactionExecutorOptions& options) : TransactionExecutorBase(options) {} - void execute(std::shared_ptr txActor, TTaskFct cont) override { - fdb::Database db(clusterFile.c_str()); - executeOnDatabase(db, txActor, cont); - } + fdb::Database selectDatabase() override { return fdb::Database(clusterFile.c_str()); } }; std::unique_ptr createTransactionExecutor(const TransactionExecutorOptions& options) { diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.h b/bindings/c/test/apitester/TesterTransactionExecutor.h index 31f6f3bc84..b0e5268d14 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.h +++ b/bindings/c/test/apitester/TesterTransactionExecutor.h @@ -38,6 +38,9 @@ class ITransactionContext : public std::enable_shared_from_this futures, TTaskFct cont); }; -/** - * Interface of an actor object implementing a concrete transaction - */ -class ITransactionActor { -public: - virtual ~ITransactionActor() {} +// Type of the lambda 
functions implementing a database operation +using TOpStartFct = std::function)>; - // Initialize with the given transaction context - virtual void init(std::shared_ptr ctx) = 0; - - // Start execution of the transaction, also called on retries - virtual void start() = 0; - - // Transaction completion result (error_code_success in case of success) - virtual fdb::Error getError() = 0; - - // Notification about the completion of the transaction - virtual void complete(fdb::Error err) = 0; -}; - -/** - * A helper base class for transaction actors - */ -class TransactionActorBase : public ITransactionActor { -public: - void init(std::shared_ptr ctx) override { context = ctx; } - fdb::Error getError() override { return error; } - void complete(fdb::Error err) override; - -protected: - std::shared_ptr ctx() { return context; } - -private: - std::shared_ptr context; - fdb::Error error = fdb::Error::success(); -}; - -// Type of the lambda functions implementing a transaction -using TTxStartFct = std::function)>; - -/** - * A wrapper class for transactions implemented by lambda functions - */ -class TransactionFct : public TransactionActorBase { -public: - TransactionFct(TTxStartFct startFct) : startFct(startFct) {} - void start() override { startFct(this->ctx()); } - -private: - TTxStartFct startFct; -}; +// Type of the lambda functions implementing a database operation +using TOpContFct = std::function; /** * Configuration of transaction execution mode @@ -124,11 +81,27 @@ struct TransactionExecutorOptions { // Create each transaction in a separate database instance bool databasePerTransaction = false; + // Enable injection of database create errors + bool injectDatabaseCreateErrors = false; + + // Test tampering cluster file contents + bool tamperClusterFile = false; + + // The probability of injected database create errors + // Used if injectDatabaseCreateErrors = true + double databaseCreateErrorRatio = 0.1; + // The size of the database instance pool int numDatabases = 1; + // The number of tenants to create in the cluster. If 0, no tenants are used. 
+ int numTenants = 0; + // Maximum number of retries per transaction (0 - unlimited) int transactionRetryLimit = 0; + + // Temporary directory + std::string tmpDir; }; /** @@ -140,7 +113,13 @@ class ITransactionExecutor { public: virtual ~ITransactionExecutor() {} virtual void init(IScheduler* sched, const char* clusterFile, const std::string& bgBasePath) = 0; - virtual void execute(std::shared_ptr tx, TTaskFct cont) = 0; + virtual void execute(TOpStartFct start, + TOpContFct cont, + std::optional tenantName, + bool transactional) = 0; + virtual fdb::Database selectDatabase() = 0; + virtual std::string getClusterFileForErrorInjection() = 0; + virtual const TransactionExecutorOptions& getOptions() = 0; }; // Create a transaction executor for the given options diff --git a/bindings/c/test/apitester/TesterUtil.cpp b/bindings/c/test/apitester/TesterUtil.cpp index 0e19081180..6ec9f76f04 100644 --- a/bindings/c/test/apitester/TesterUtil.cpp +++ b/bindings/c/test/apitester/TesterUtil.cpp @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include namespace FdbApiTester { @@ -46,16 +49,6 @@ Random& Random::get() { return random; } -fdb::ByteString Random::randomStringLowerCase(int minLength, int maxLength) { - int length = randomInt(minLength, maxLength); - fdb::ByteString str; - str.reserve(length); - for (int i = 0; i < length; i++) { - str += (char)randomInt('a', 'z'); - } - return str; -} - bool Random::randomBool(double trueRatio) { return std::uniform_real_distribution(0.0, 1.0)(random) <= trueRatio; } @@ -106,4 +99,52 @@ KeyRangeArray copyKeyRangeArray(fdb::future_var::KeyRangeRefArray::Type array) { return out; }; +GranuleSummaryArray copyGranuleSummaryArray(fdb::future_var::GranuleSummaryRefArray::Type array) { + auto& [in_summaries, in_count] = array; + + GranuleSummaryArray out; + + for (int i = 0; i < in_count; ++i) { + fdb::native::FDBGranuleSummary nativeSummary = *in_summaries++; + fdb::GranuleSummary summary(nativeSummary); + out.push_back(summary); + } + return out; +}; + +TmpFile::~TmpFile() { + if (!filename.empty()) { + remove(); + } +} + +void TmpFile::create(std::string_view dir, std::string_view prefix) { + while (true) { + filename = fmt::format("{}/{}-{}", dir, prefix, Random::get().randomStringLowerCase(6, 6)); + if (!std::filesystem::exists(std::filesystem::path(filename))) { + break; + } + } + + // Create an empty tmp file + std::fstream tmpFile(filename, std::fstream::out); + if (!tmpFile.good()) { + throw TesterError(fmt::format("Failed to create temporary file {}\n", filename)); + } +} + +void TmpFile::write(std::string_view data) { + std::ofstream ofs(filename, std::fstream::out | std::fstream::binary); + if (!ofs.good()) { + throw TesterError(fmt::format("Failed to write to the temporary file {}\n", filename)); + } + ofs.write(data.data(), data.size()); +} + +void TmpFile::remove() { + if (!std::filesystem::remove(std::filesystem::path(filename))) { + fmt::print(stderr, "Failed to remove file {}\n", filename); + } +} + } // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterUtil.h b/bindings/c/test/apitester/TesterUtil.h index de5e5c8990..1ace2c9721 100644 --- a/bindings/c/test/apitester/TesterUtil.h +++ b/bindings/c/test/apitester/TesterUtil.h @@ -66,7 +66,20 @@ public: int randomInt(int min, int max); - fdb::ByteString randomStringLowerCase(int minLength, int maxLength); + template + StringType randomStringLowerCase(int minLength, int maxLength) { + int length = randomInt(minLength, maxLength); + 
StringType str; + str.reserve(length); + for (int i = 0; i < length; i++) { + str += (char)randomInt('a', 'z'); + } + return str; + } + + fdb::ByteString randomByteStringLowerCase(int minLength, int maxLength) { + return randomStringLowerCase(minLength, maxLength); + } bool randomBool(double trueRatio); @@ -120,6 +133,9 @@ KeyValueArray copyKeyValueArray(fdb::future_var::KeyValueRefArray::Type array); using KeyRangeArray = std::vector; KeyRangeArray copyKeyRangeArray(fdb::future_var::KeyRangeRefArray::Type array); +using GranuleSummaryArray = std::vector; +GranuleSummaryArray copyGranuleSummaryArray(fdb::future_var::GranuleSummaryRefArray::Type array); + static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Do not support non-little-endian systems"); // Converts a little-endian encoded number into an integral type. @@ -139,6 +155,19 @@ static fdb::ByteString toByteString(T value) { return output; } +// Creates a temporary file; file gets destroyed/deleted along with object destruction. +struct TmpFile { +public: + ~TmpFile(); + void create(std::string_view dir, std::string_view prefix); + void write(std::string_view data); + void remove(); + const std::string& getFileName() const { return filename; } + +private: + std::string filename; +}; + } // namespace FdbApiTester #endif diff --git a/bindings/c/test/apitester/TesterWorkload.cpp b/bindings/c/test/apitester/TesterWorkload.cpp index 6cdfacc423..8e7289f437 100644 --- a/bindings/c/test/apitester/TesterWorkload.cpp +++ b/bindings/c/test/apitester/TesterWorkload.cpp @@ -80,13 +80,14 @@ bool WorkloadConfig::getBoolOption(const std::string& name, bool defaultVal) con WorkloadBase::WorkloadBase(const WorkloadConfig& config) : manager(nullptr), tasksScheduled(0), numErrors(0), clientId(config.clientId), numClients(config.numClients), - failed(false), numTxCompleted(0) { + failed(false), numTxCompleted(0), numTxStarted(0), inProgress(false) { maxErrors = config.getIntOption("maxErrors", 10); workloadId = fmt::format("{}{}", config.name, clientId); } void WorkloadBase::init(WorkloadManager* manager) { this->manager = manager; + inProgress = true; } void WorkloadBase::printStats() { @@ -94,6 +95,7 @@ void WorkloadBase::printStats() { } void WorkloadBase::schedule(TTaskFct task) { + ASSERT(inProgress); if (failed) { return; } @@ -104,28 +106,49 @@ void WorkloadBase::schedule(TTaskFct task) { }); } -void WorkloadBase::execTransaction(std::shared_ptr tx, TTaskFct cont, bool failOnError) { +void WorkloadBase::execTransaction(TOpStartFct startFct, + TTaskFct cont, + std::optional tenant, + bool failOnError) { + doExecute(startFct, cont, tenant, failOnError, true); +} + +// Execute a non-transactional database operation within the workload +void WorkloadBase::execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError) { + doExecute(startFct, cont, {}, failOnError, false); +} + +void WorkloadBase::doExecute(TOpStartFct startFct, + TTaskFct cont, + std::optional tenant, + bool failOnError, + bool transactional) { + ASSERT(inProgress); if (failed) { return; } tasksScheduled++; - manager->txExecutor->execute(tx, [this, tx, cont, failOnError]() { - numTxCompleted++; - fdb::Error err = tx->getError(); - if (err.code() == error_code_success) { - cont(); - } else { - std::string msg = fmt::format("Transaction failed with error: {} ({})", err.code(), err.what()); - if (failOnError) { - error(msg); - failed = true; - } else { - info(msg); - cont(); - } - } - scheduledTaskDone(); - }); + numTxStarted++; + manager->txExecutor->execute( + startFct, + 
[this, startFct, cont, failOnError](fdb::Error err) { + numTxCompleted++; + if (err.code() == error_code_success) { + cont(); + } else { + std::string msg = fmt::format("Transaction failed with error: {} ({})", err.code(), err.what()); + if (failOnError) { + error(msg); + failed = true; + } else { + info(msg); + cont(); + } + } + scheduledTaskDone(); + }, + tenant, + transactional); } void WorkloadBase::info(const std::string& msg) { @@ -143,11 +166,13 @@ void WorkloadBase::scheduledTaskDone() { if (--tasksScheduled == 0) { + inProgress = false; if (numErrors > 0) { error(fmt::format("Workload failed with {} errors", numErrors.load())); } else { info("Workload successfully completed"); } + ASSERT(numTxStarted == numTxCompleted); manager->workloadDone(this, numErrors > 0); } } diff --git a/bindings/c/test/apitester/TesterWorkload.h b/bindings/c/test/apitester/TesterWorkload.h index beb3082c5c..ea1c6816f9 100644 --- a/bindings/c/test/apitester/TesterWorkload.h +++ b/bindings/c/test/apitester/TesterWorkload.h @@ -82,6 +82,9 @@ struct WorkloadConfig { // Total number of clients int numClients; + // Number of tenants + int numTenants; + // Selected FDB API version int apiVersion; @@ -116,12 +119,13 @@ protected: void schedule(TTaskFct task); // Execute a transaction within the workload - void execTransaction(std::shared_ptr tx, TTaskFct cont, bool failOnError = true); + void execTransaction(TOpStartFct startFct, + TTaskFct cont, + std::optional tenant = std::optional(), + bool failOnError = true); - // Execute a transaction within the workload, a convenience method for a tranasaction defined by a lambda function - void execTransaction(TTxStartFct start, TTaskFct cont, bool failOnError = true) { - execTransaction(std::make_shared(start), cont, failOnError); - } + // Execute a non-transactional database operation within the workload + void execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError = true); // Log an error message, increase error counter void error(const std::string& msg); @@ -135,6 +139,12 @@ private: WorkloadManager* manager; + void doExecute(TOpStartFct startFct, + TTaskFct cont, + std::optional tenant, + bool failOnError, + bool transactional); + // Decrease scheduled task counter, notify the workload manager // that the task is done if no more tasks schedule void scheduledTaskDone(); @@ -164,6 +174,12 @@ protected: // Number of completed transactions std::atomic numTxCompleted; + + // Number of started transactions + std::atomic numTxStarted; + + // Workload is in progress (initialized, but not completed) + std::atomic inProgress; }; // Workload manager diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml new file mode 100644 index 0000000000..788bd04d85 --- /dev/null +++ b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml @@ -0,0 +1,22 @@ +[[test]] +title = 'Blob Granule Errors Multi Threaded' +multiThreaded = true +buggify = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + + [[test.workload]] + name = 'BlobGranuleErrors' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 \ No newline at end of file diff --git 
a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsOnExternalThread.toml b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsOnExternalThread.toml new file mode 100644 index 0000000000..788bd04d85 --- /dev/null +++ b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsOnExternalThread.toml @@ -0,0 +1,22 @@ +[[test]] +title = 'Blob Granule Errors Multi Threaded' +multiThreaded = true +buggify = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + + [[test.workload]] + name = 'BlobGranuleErrors' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 \ No newline at end of file diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml new file mode 100644 index 0000000000..85e78975f6 --- /dev/null +++ b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml @@ -0,0 +1,15 @@ +[[test]] +title = 'Blob Granule Errors Single Threaded' +minClients = 1 +maxClients = 3 +multiThreaded = false + + [[test.workload]] + name = 'BlobGranuleErrors' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 \ No newline at end of file diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index 310ebd9b83..1d79dd754c 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -36,6 +36,8 @@ namespace FdbApiTester { namespace { +#define API_VERSION_CLIENT_TMP_DIR 720 + enum TesterOptionId { OPT_CONNFILE, OPT_HELP, @@ -285,7 +287,7 @@ void fdb_check(fdb::Error e) { } void applyNetworkOptions(TesterOptions& options) { - if (!options.tmpDir.empty() && options.apiVersion >= 720) { + if (!options.tmpDir.empty() && options.apiVersion >= API_VERSION_CLIENT_TMP_DIR) { fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_TMP_DIR, options.tmpDir); } if (!options.externalClientLibrary.empty()) { @@ -320,6 +322,10 @@ void applyNetworkOptions(TesterOptions& options) { fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_BUGGIFY_ENABLE); } + if (options.testSpec.disableClientBypass && options.apiVersion >= 720) { + fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_DISABLE_CLIENT_BYPASS); + } + if (options.trace) { fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, options.traceDir); fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_FORMAT, options.traceFormat); @@ -350,6 +356,12 @@ void randomizeOptions(TesterOptions& options) { options.numClientThreads = random.randomInt(options.testSpec.minClientThreads, options.testSpec.maxClientThreads); options.numDatabases = random.randomInt(options.testSpec.minDatabases, options.testSpec.maxDatabases); options.numClients = random.randomInt(options.testSpec.minClients, options.testSpec.maxClients); + + // Choose a random number of tenants. If a test is configured to allow 0 tenants, then use 0 tenants half the time. 
+ if (options.testSpec.maxTenants >= options.testSpec.minTenants && + (options.testSpec.minTenants > 0 || random.randomBool(0.5))) { + options.numTenants = random.randomInt(options.testSpec.minTenants, options.testSpec.maxTenants); + } } bool runWorkloads(TesterOptions& options) { @@ -358,7 +370,12 @@ bool runWorkloads(TesterOptions& options) { txExecOptions.blockOnFutures = options.testSpec.blockOnFutures; txExecOptions.numDatabases = options.numDatabases; txExecOptions.databasePerTransaction = options.testSpec.databasePerTransaction; + // 7.1 and older releases crash on database create errors + txExecOptions.injectDatabaseCreateErrors = options.testSpec.buggify && options.apiVersion > 710; txExecOptions.transactionRetryLimit = options.transactionRetryLimit; + txExecOptions.tmpDir = options.tmpDir.empty() ? std::string("/tmp") : options.tmpDir; + txExecOptions.tamperClusterFile = options.testSpec.tamperClusterFile; + txExecOptions.numTenants = options.numTenants; std::vector> workloads; workloads.reserve(options.testSpec.workloads.size() * options.numClients); @@ -370,6 +387,7 @@ bool runWorkloads(TesterOptions& options) { config.options = workloadSpec.options; config.clientId = i; config.numClients = options.numClients; + config.numTenants = options.numTenants; config.apiVersion = options.apiVersion; std::shared_ptr workload = IWorkloadFactory::create(workloadSpec.name, config); if (!workload) { diff --git a/bindings/c/test/apitester/local_tests/CApiCorrectnessSingleThr.toml b/bindings/c/test/apitester/local_tests/CApiCorrectnessSingleThr.toml new file mode 100644 index 0000000000..9e6fc350ea --- /dev/null +++ b/bindings/c/test/apitester/local_tests/CApiCorrectnessSingleThr.toml @@ -0,0 +1,29 @@ +[[test]] +title = 'API Correctness Single Threaded' +minClients = 1 +maxClients = 3 +minDatabases = 1 +maxDatabases = 3 +multiThreaded = false +disableClientBypass = true + + [[test.workload]] + name = 'ApiCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 + readExistingKeysRatio = 0.9 + + [[test.workload]] + name = 'AtomicOpsCorrectness' + initialSize = 0 + numRandomOperations = 100 + + [[test.workload]] + name = 'WatchAndWait' + initialSize = 0 + numRandomOperations = 10 diff --git a/bindings/c/test/apitester/tests/CApiMultiTenantCorrectnessMultiThr.toml b/bindings/c/test/apitester/tests/CApiMultiTenantCorrectnessMultiThr.toml new file mode 100644 index 0000000000..2a5a0d30e1 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiMultiTenantCorrectnessMultiThr.toml @@ -0,0 +1,21 @@ +[[test]] +title = 'Multi-tenant API Correctness Multi Threaded' +multiThreaded = true +buggify = true +minFdbThreads = 2 +maxFdbThreads = 8 +minClients = 2 +maxClients = 8 +minTenants = 2 +maxTenants = 5 + + [[test.workload]] + name = 'ApiCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 5 + initialSize = 100 + numRandomOperations = 200 + readExistingKeysRatio = 0.9 \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/CApiTamperClusterFile.toml b/bindings/c/test/apitester/tests/CApiTamperClusterFile.toml new file mode 100644 index 0000000000..60a9715bd8 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiTamperClusterFile.toml @@ -0,0 +1,24 @@ +[[test]] +title = 'Test tampering the cluster file' +multiThreaded = true +buggify = true +tamperClusterFile = true +minFdbThreads = 2 +maxFdbThreads = 4 
+minDatabases = 2 +maxDatabases = 4 +minClientThreads = 2 +maxClientThreads = 4 +minClients = 2 +maxClients = 4 + + [[test.workload]] + name = 'ApiCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 + readExistingKeysRatio = 0.9 \ No newline at end of file diff --git a/bindings/c/test/client_memory_test.cpp b/bindings/c/test/client_memory_test.cpp index c6cff85574..3ea2f74a8a 100644 --- a/bindings/c/test/client_memory_test.cpp +++ b/bindings/c/test/client_memory_test.cpp @@ -44,7 +44,7 @@ int main(int argc, char** argv) { if (argc != 2) { printf("Usage: %s ", argv[0]); } - fdb_check(fdb_select_api_version(720)); + fdb_check(fdb_select_api_version(FDB_API_VERSION)); fdb_check(fdb_setup_network()); std::thread network_thread{ &fdb_run_network }; diff --git a/bindings/c/test/fdb_api.hpp b/bindings/c/test/fdb_api.hpp index 6d0db008a2..7074c74b18 100644 --- a/bindings/c/test/fdb_api.hpp +++ b/bindings/c/test/fdb_api.hpp @@ -46,6 +46,8 @@ namespace native { #include } +#define TENANT_API_VERSION_GUARD 720 + using ByteString = std::basic_string; using BytesRef = std::basic_string_view; using CharsRef = std::string_view; @@ -62,6 +64,22 @@ struct KeyRange { Key beginKey; Key endKey; }; +struct GranuleSummary { + KeyRange keyRange; + int64_t snapshotVersion; + int64_t snapshotSize; + int64_t deltaVersion; + int64_t deltaSize; + + GranuleSummary(const native::FDBGranuleSummary& nativeSummary) { + keyRange.beginKey = fdb::Key(nativeSummary.key_range.begin_key, nativeSummary.key_range.begin_key_length); + keyRange.endKey = fdb::Key(nativeSummary.key_range.end_key, nativeSummary.key_range.end_key_length); + snapshotVersion = nativeSummary.snapshot_version; + snapshotSize = nativeSummary.snapshot_size; + deltaVersion = nativeSummary.delta_version; + deltaSize = nativeSummary.delta_size; + } +}; inline uint8_t const* toBytePtr(char const* ptr) noexcept { return reinterpret_cast(ptr); @@ -137,6 +155,13 @@ struct None { struct Type {}; static Error extract(native::FDBFuture*, Type&) noexcept { return Error(0); } }; +struct Bool { + using Type = native::fdb_bool_t; + static Error extract(native::FDBFuture* f, Type& out) noexcept { + auto err = native::fdb_future_get_bool(f, &out); + return Error(err); + } +}; struct Int64 { using Type = int64_t; static Error extract(native::FDBFuture* f, Type& out) noexcept { @@ -200,6 +225,27 @@ struct KeyRangeRefArray { } }; +struct GranuleSummaryRef : native::FDBGranuleSummary { + fdb::KeyRef beginKey() const noexcept { + return fdb::KeyRef(native::FDBGranuleSummary::key_range.begin_key, + native::FDBGranuleSummary::key_range.begin_key_length); + } + fdb::KeyRef endKey() const noexcept { + return fdb::KeyRef(native::FDBGranuleSummary::key_range.end_key, + native::FDBGranuleSummary::key_range.end_key_length); + } +}; + +struct GranuleSummaryRefArray { + using Type = std::tuple; + static Error extract(native::FDBFuture* f, Type& out) noexcept { + auto& [out_summaries, out_count] = out; + auto err = native::fdb_future_get_granule_summary_array( + f, reinterpret_cast(&out_summaries), &out_count); + return Error(err); + } +}; + } // namespace future_var [[noreturn]] inline void throwError(std::string_view preamble, Error err) { @@ -310,6 +356,7 @@ public: class Future { protected: friend class Transaction; + friend class Database; friend std::hash; std::shared_ptr f; @@ -468,6 +515,14 @@ public: Transaction(const Transaction&) noexcept = default; 
Transaction& operator=(const Transaction&) noexcept = default; + void atomic_store(Transaction other) { std::atomic_store(&tr, other.tr); } + + Transaction atomic_load() { + Transaction retVal; + retVal.tr = std::atomic_load(&tr); + return retVal; + } + bool valid() const noexcept { return tr != nullptr; } explicit operator bool() const noexcept { return valid(); } @@ -573,6 +628,14 @@ public: tr.get(), begin.data(), intSize(begin), end.data(), intSize(end), begin_version, read_version, context)); } + TypedFuture summarizeBlobGranules(KeyRef begin, + KeyRef end, + int64_t summaryVersion, + int rangeLimit) { + return native::fdb_transaction_summarize_blob_granules( + tr.get(), begin.data(), intSize(begin), end.data(), intSize(end), summaryVersion, rangeLimit); + } + TypedFuture watch(KeyRef key) { return native::fdb_transaction_watch(tr.get(), key.data(), intSize(key)); } @@ -599,6 +662,13 @@ public: void clearRange(KeyRef begin, KeyRef end) { native::fdb_transaction_clear_range(tr.get(), begin.data(), intSize(begin), end.data(), intSize(end)); } + + void addReadConflictRange(KeyRef begin, KeyRef end) { + if (auto err = Error(native::fdb_transaction_add_conflict_range( + tr.get(), begin.data(), intSize(begin), end.data(), intSize(end), FDB_CONFLICT_RANGE_TYPE_READ))) { + throwError("fdb_transaction_add_conflict_range returned error: ", err); + } + } }; class Tenant final { @@ -621,6 +691,7 @@ public: static void createTenant(Transaction tr, BytesRef name) { tr.setOption(FDBTransactionOption::FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, BytesRef()); tr.setOption(FDBTransactionOption::FDB_TR_OPTION_LOCK_AWARE, BytesRef()); + tr.setOption(FDBTransactionOption::FDB_TR_OPTION_RAW_ACCESS, BytesRef()); tr.set(toBytesRef(fmt::format("{}{}", tenantManagementMapPrefix, toCharsRef(name))), BytesRef()); } @@ -662,6 +733,14 @@ public: } Database() noexcept : db(nullptr) {} + void atomic_store(Database other) { std::atomic_store(&db, other.db); } + + Database atomic_load() { + Database retVal; + retVal.db = std::atomic_load(&db); + return retVal; + } + Error setOptionNothrow(FDBDatabaseOption option, int64_t value) noexcept { return Error(native::fdb_database_set_option( db.get(), option, reinterpret_cast(&value), static_cast(sizeof(value)))); @@ -707,10 +786,50 @@ public: throwError("Failed to create transaction: ", err); return Transaction(tx_native); } + + TypedFuture listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) { + if (!db) + throw std::runtime_error("listBlobbifiedRanges from null database"); + return native::fdb_database_list_blobbified_ranges( + db.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit); + } + + TypedFuture verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) { + if (!db) + throw std::runtime_error("verifyBlobRange from null database"); + return native::fdb_database_verify_blob_range( + db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version); + } + + TypedFuture blobbifyRange(KeyRef begin, KeyRef end) { + if (!db) + throw std::runtime_error("blobbifyRange from null database"); + return native::fdb_database_blobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end)); + } + + TypedFuture unblobbifyRange(KeyRef begin, KeyRef end) { + if (!db) + throw std::runtime_error("unblobbifyRange from null database"); + return native::fdb_database_unblobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end)); + } + + TypedFuture purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, 
bool force) { + if (!db) + throw std::runtime_error("purgeBlobGranules from null database"); + native::fdb_bool_t forceBool = force; + return native::fdb_database_purge_blob_granules( + db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version, forceBool); + } + + TypedFuture waitPurgeGranulesComplete(KeyRef purgeKey) { + if (!db) + throw std::runtime_error("waitPurgeGranulesComplete from null database"); + return native::fdb_database_wait_purge_granules_complete(db.get(), purgeKey.data(), intSize(purgeKey)); + } }; inline Error selectApiVersionNothrow(int version) { - if (version < 720) { + if (version < TENANT_API_VERSION_GUARD) { Tenant::tenantManagementMapPrefix = "\xff\xff/management/tenant_map/"; } return Error(native::fdb_select_api_version(version)); @@ -723,7 +842,7 @@ inline void selectApiVersion(int version) { } inline Error selectApiVersionCappedNothrow(int version) { - if (version < 720) { + if (version < TENANT_API_VERSION_GUARD) { Tenant::tenantManagementMapPrefix = "\xff\xff/management/tenant_map/"; } return Error( diff --git a/bindings/c/test/fdb_c90_test.c b/bindings/c/test/fdb_c90_test.c index e2011286ed..1bcdf63284 100644 --- a/bindings/c/test/fdb_c90_test.c +++ b/bindings/c/test/fdb_c90_test.c @@ -4,6 +4,6 @@ int main(int argc, char* argv[]) { (void)argc; (void)argv; - fdb_select_api_version(720); + fdb_select_api_version(FDB_API_VERSION); return 0; } diff --git a/bindings/c/test/mako/blob_granules.cpp b/bindings/c/test/mako/blob_granules.cpp index af805f2e56..1071737211 100644 --- a/bindings/c/test/mako/blob_granules.cpp +++ b/bindings/c/test/mako/blob_granules.cpp @@ -26,6 +26,9 @@ extern thread_local mako::Logger logr; +// FIXME: use the same implementation as the api tester! this implementation was from back when mako was written in C +// and is inferior.
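[Editorial sketch, not part of the patch] The new fdb::Database helpers introduced in bindings/c/test/fdb_api.hpp above (blobbifyRange, verifyBlobRange, purgeBlobGranules, waitPurgeGranulesComplete) are thin wrappers over the C client calls touched elsewhere in this change. A minimal illustration of the intended management flow at the C API level follows; the wait_and_check helper and the hard-coded -2 ("latest version") arguments are assumptions made for brevity, not code from this patch.

#include <foundationdb/fdb_c.h>

/* Hypothetical helper: block on a future and surface any error. */
static void wait_and_check(FDBFuture* f) {
	fdb_error_t err = fdb_future_block_until_ready(f);
	if (!err) err = fdb_future_get_error(f);
	if (err) { /* a real caller would retry via a transaction loop or abort */ }
}

static void manage_blob_range(FDBDatabase* db, const uint8_t* begin, int beginLen, const uint8_t* end, int endLen) {
	/* 1. Mark the key range as blobbified; the result is a boolean future. */
	FDBFuture* f = fdb_database_blobbify_range(db, begin, beginLen, end, endLen);
	wait_and_check(f);
	fdb_bool_t ok = 0;
	fdb_future_get_bool(f, &ok);
	fdb_future_destroy(f);

	/* 2. Verify the range is readable from blob storage (assuming -2 means "latest version"). */
	f = fdb_database_verify_blob_range(db, begin, beginLen, end, endLen, -2);
	wait_and_check(f);
	int64_t verifyVersion = 0;
	fdb_future_get_int64(f, &verifyVersion); /* version of the last blob granule */
	fdb_future_destroy(f);

	/* 3. Purge granule history, then wait on the returned key for the purge to complete. */
	f = fdb_database_purge_blob_granules(db, begin, beginLen, end, endLen, -2, /*force=*/0);
	wait_and_check(f);
	const uint8_t* purgeKey = NULL;
	int purgeKeyLen = 0;
	fdb_future_get_key(f, &purgeKey, &purgeKeyLen); /* the key to watch for purge complete */
	FDBFuture* done = fdb_database_wait_purge_granules_complete(db, purgeKey, purgeKeyLen);
	wait_and_check(done);
	fdb_future_destroy(done);
	fdb_future_destroy(f); /* destroyed last: it owns the purgeKey memory */
}

(End of editorial sketch.)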
+ namespace mako::blob_granules::local_file { int64_t startLoad(const char* filename, diff --git a/bindings/c/test/performance_test.c b/bindings/c/test/performance_test.c index 5cd9f64bc0..ab90395e05 100644 --- a/bindings/c/test/performance_test.c +++ b/bindings/c/test/performance_test.c @@ -641,7 +641,7 @@ void runTests(struct ResultSet* rs) { int main(int argc, char** argv) { srand(time(NULL)); struct ResultSet* rs = newResultSet(); - checkError(fdb_select_api_version(720), "select API version", rs); + checkError(fdb_select_api_version(FDB_API_VERSION), "select API version", rs); printf("Running performance test at client version: %s\n", fdb_get_client_version()); valueStr = (uint8_t*)malloc((sizeof(uint8_t)) * valueSize); diff --git a/bindings/c/test/ryw_benchmark.c b/bindings/c/test/ryw_benchmark.c index cf2754bcec..731a2ce0a2 100644 --- a/bindings/c/test/ryw_benchmark.c +++ b/bindings/c/test/ryw_benchmark.c @@ -285,7 +285,7 @@ void runTests(struct ResultSet* rs) { int main(int argc, char** argv) { srand(time(NULL)); struct ResultSet* rs = newResultSet(); - checkError(fdb_select_api_version(720), "select API version", rs); + checkError(fdb_select_api_version(FDB_API_VERSION), "select API version", rs); printf("Running RYW Benchmark test at client version: %s\n", fdb_get_client_version()); keys = generateKeys(numKeys, keySize); diff --git a/bindings/c/test/txn_size_test.c b/bindings/c/test/txn_size_test.c index 97081f24a6..57c74a9bca 100644 --- a/bindings/c/test/txn_size_test.c +++ b/bindings/c/test/txn_size_test.c @@ -97,7 +97,7 @@ void runTests(struct ResultSet* rs) { int main(int argc, char** argv) { srand(time(NULL)); struct ResultSet* rs = newResultSet(); - checkError(fdb_select_api_version(720), "select API version", rs); + checkError(fdb_select_api_version(FDB_API_VERSION), "select API version", rs); printf("Running performance test at client version: %s\n", fdb_get_client_version()); keys = generateKeys(numKeys, KEY_SIZE); diff --git a/bindings/c/test/unit/disconnected_timeout_tests.cpp b/bindings/c/test/unit/disconnected_timeout_tests.cpp index b1c6b72730..7d006faa23 100644 --- a/bindings/c/test/unit/disconnected_timeout_tests.cpp +++ b/bindings/c/test/unit/disconnected_timeout_tests.cpp @@ -255,7 +255,7 @@ int main(int argc, char** argv) { << std::endl; return 1; } - fdb_check(fdb_select_api_version(720)); + fdb_check(fdb_select_api_version(FDB_API_VERSION)); if (argc >= 3) { std::string externalClientLibrary = argv[2]; if (externalClientLibrary.substr(0, 2) != "--") { diff --git a/bindings/c/test/unit/fdb_api.cpp b/bindings/c/test/unit/fdb_api.cpp index d454082af3..1376fc77c7 100644 --- a/bindings/c/test/unit/fdb_api.cpp +++ b/bindings/c/test/unit/fdb_api.cpp @@ -84,6 +84,12 @@ void Future::cancel() { return fdb_future_get_keyrange_array(future_, out_keyranges, out_count); } +// GranuleSummaryArrayFuture + +[[nodiscard]] fdb_error_t GranuleSummaryArrayFuture::get(const FDBGranuleSummary** out_summaries, int* out_count) { + return fdb_future_get_granule_summary_array(future_, out_summaries, out_count); +} + // KeyValueArrayFuture [[nodiscard]] fdb_error_t KeyValueArrayFuture::get(const FDBKeyValue** out_kv, int* out_count, fdb_bool_t* out_more) { @@ -366,6 +372,7 @@ KeyRangeArrayFuture Transaction::get_blob_granule_ranges(std::string_view begin_ end_key.size(), rangeLimit)); } + KeyValueArrayResult Transaction::read_blob_granules(std::string_view begin_key, std::string_view end_key, int64_t beginVersion, @@ -381,4 +388,17 @@ KeyValueArrayResult 
Transaction::read_blob_granules(std::string_view begin_key, granuleContext)); } +GranuleSummaryArrayFuture Transaction::summarize_blob_granules(std::string_view begin_key, + std::string_view end_key, + int64_t summary_version, + int rangeLimit) { + return GranuleSummaryArrayFuture(fdb_transaction_summarize_blob_granules(tr_, + (const uint8_t*)begin_key.data(), + begin_key.size(), + (const uint8_t*)end_key.data(), + end_key.size(), + summary_version, + rangeLimit)); +} + } // namespace fdb diff --git a/bindings/c/test/unit/fdb_api.hpp b/bindings/c/test/unit/fdb_api.hpp index d0c4abd8db..137083a90c 100644 --- a/bindings/c/test/unit/fdb_api.hpp +++ b/bindings/c/test/unit/fdb_api.hpp @@ -161,6 +161,18 @@ private: KeyRangeArrayFuture(FDBFuture* f) : Future(f) {} }; +class GranuleSummaryArrayFuture : public Future { +public: + // Call this function instead of fdb_future_get_granule_summary_array when using + // the GranuleSummaryArrayFuture type. Its behavior is identical to + // fdb_future_get_granule_summary_array. + fdb_error_t get(const FDBGranuleSummary** out_summaries, int* out_count); + +private: + friend class Transaction; + GranuleSummaryArrayFuture(FDBFuture* f) : Future(f) {} +}; + class EmptyFuture : public Future { private: friend class Transaction; @@ -354,6 +366,10 @@ public: int64_t beginVersion, int64_t endVersion, FDBReadBlobGranuleContext granule_context); + GranuleSummaryArrayFuture summarize_blob_granules(std::string_view begin_key, + std::string_view end_key, + int64_t summaryVersion, + int rangeLimit); private: FDBTransaction* tr_; diff --git a/bindings/c/test/unit/setup_tests.cpp b/bindings/c/test/unit/setup_tests.cpp index 6ac65b7850..2e96eb00b9 100644 --- a/bindings/c/test/unit/setup_tests.cpp +++ b/bindings/c/test/unit/setup_tests.cpp @@ -42,13 +42,13 @@ TEST_CASE("setup") { CHECK(err); // Select current API version - fdb_check(fdb_select_api_version(720)); + fdb_check(fdb_select_api_version(FDB_API_VERSION)); // Error to call again after a successful return - err = fdb_select_api_version(720); + err = fdb_select_api_version(FDB_API_VERSION); CHECK(err); - CHECK(fdb_get_max_api_version() >= 720); + CHECK(fdb_get_max_api_version() >= FDB_API_VERSION); fdb_check(fdb_setup_network()); // Calling a second time should fail diff --git a/bindings/c/test/unit/trace_partial_file_suffix_test.cpp b/bindings/c/test/unit/trace_partial_file_suffix_test.cpp index 810cc066fd..73dc8132a5 100644 --- a/bindings/c/test/unit/trace_partial_file_suffix_test.cpp +++ b/bindings/c/test/unit/trace_partial_file_suffix_test.cpp @@ -53,7 +53,7 @@ bool file_exists(const char* path) { } int main(int argc, char** argv) { - fdb_check(fdb_select_api_version(720)); + fdb_check(fdb_select_api_version(FDB_API_VERSION)); std::string file_identifier = "trace_partial_file_suffix_test" + std::to_string(std::random_device{}()); std::string trace_partial_file_suffix = ".tmp"; diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp index 2ab80cf90c..a45221f606 100644 --- a/bindings/c/test/unit/unit_tests.cpp +++ b/bindings/c/test/unit/unit_tests.cpp @@ -1001,7 +1001,7 @@ GetMappedRangeResult getMappedIndexEntries(int beginId, TEST_CASE("versionstamp_unit_test") { // a random 12 bytes long StringRef as a versionstamp StringRef str = "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x11\x12"_sr; - Versionstamp vs(str), vs2(str); + TupleVersionstamp vs(str), vs2(str); ASSERT(vs == vs2); ASSERT(vs.begin() != vs2.begin()); @@ -1031,7 +1031,7 @@
TEST_CASE("tuple_support_versionstamp") { // a random 12 bytes long StringRef as a versionstamp StringRef str = "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x11\x12"_sr; - Versionstamp vs(str); + TupleVersionstamp vs(str); const Tuple t = Tuple::makeTuple(prefix, RECORD, vs, "{K[3]}"_sr, "{...}"_sr); ASSERT(t.getVersionstamp(2) == vs); @@ -1047,7 +1047,7 @@ TEST_CASE("tuple_fail_to_append_truncated_versionstamp") { // a truncated 11 bytes long StringRef as a versionstamp StringRef str = "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x11"_sr; try { - Versionstamp truncatedVersionstamp(str); + TupleVersionstamp truncatedVersionstamp(str); } catch (Error& e) { return; } @@ -1058,7 +1058,7 @@ TEST_CASE("tuple_fail_to_append_longer_versionstamp") { // a longer than expected 13 bytes long StringRef as a versionstamp StringRef str = "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x11"_sr; try { - Versionstamp longerVersionstamp(str); + TupleVersionstamp longerVersionstamp(str); } catch (Error& e) { return; } @@ -2761,6 +2761,7 @@ TEST_CASE("Blob Granule Functions") { auto confValue = get_value("\xff/conf/blob_granules_enabled", /* snapshot */ false, { FDB_TR_OPTION_READ_SYSTEM_KEYS }); if (!confValue.has_value() || confValue.value() != "1") { + // std::cout << "skipping blob granule test" << std::endl; return; } @@ -2817,7 +2818,6 @@ TEST_CASE("Blob Granule Functions") { fdb::KeyValueArrayResult r = tr.read_blob_granules(key("bg"), key("bh"), originalReadVersion, -2, granuleContext); fdb_error_t err = r.get(&out_kv, &out_count, &out_more); - ; if (err && err != 2037 /* blob_granule_not_materialized */) { fdb::EmptyFuture f2 = tr.on_error(err); fdb_check(wait_future(f2)); @@ -2865,6 +2865,10 @@ TEST_CASE("Blob Granule Functions") { int out_count; fdb_check(f.get(&out_kr, &out_count)); + CHECK(std::string((const char*)out_kr[0].begin_key, out_kr[0].begin_key_length) <= key("bg")); + CHECK(std::string((const char*)out_kr[out_count - 1].end_key, out_kr[out_count - 1].end_key_length) >= + key("bh")); + CHECK(out_count >= 1); // check key ranges are in order for (int i = 0; i < out_count; i++) { @@ -2872,9 +2876,9 @@ TEST_CASE("Blob Granule Functions") { CHECK(std::string((const char*)out_kr[i].begin_key, out_kr[i].begin_key_length) < std::string((const char*)out_kr[i].end_key, out_kr[i].end_key_length)); } - // Ranges themselves are sorted + // Ranges themselves are sorted and contiguous for (int i = 0; i < out_count - 1; i++) { - CHECK(std::string((const char*)out_kr[i].end_key, out_kr[i].end_key_length) <= + CHECK(std::string((const char*)out_kr[i].end_key, out_kr[i].end_key_length) == std::string((const char*)out_kr[i + 1].begin_key, out_kr[i + 1].begin_key_length)); } @@ -2900,7 +2904,6 @@ TEST_CASE("Blob Granule Functions") { fdb_check(wait_future(waitPurgeFuture)); // re-read again at the purge version to make sure it is still valid - while (1) { fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0)); fdb::KeyValueArrayResult r = @@ -2917,6 +2920,56 @@ TEST_CASE("Blob Granule Functions") { tr.reset(); break; } + + // check granule summary + while (1) { + fdb::GranuleSummaryArrayFuture f = tr.summarize_blob_granules(key("bg"), key("bh"), originalReadVersion, 100); + fdb_error_t err = wait_future(f); + if (err) { + fdb::EmptyFuture f2 = tr.on_error(err); + fdb_check(wait_future(f2)); + continue; + } + + const FDBGranuleSummary* out_summaries; + int out_count; + fdb_check(f.get(&out_summaries, &out_count)); + + CHECK(out_count >= 1); + CHECK(out_count <= 100); + + // check that ranges cover 
requested range + CHECK(std::string((const char*)out_summaries[0].key_range.begin_key, + out_summaries[0].key_range.begin_key_length) <= key("bg")); + CHECK(std::string((const char*)out_summaries[out_count - 1].key_range.end_key, + out_summaries[out_count - 1].key_range.end_key_length) >= key("bh")); + + // check key ranges are in order + for (int i = 0; i < out_count; i++) { + // key range start < end + CHECK(std::string((const char*)out_summaries[i].key_range.begin_key, + out_summaries[i].key_range.begin_key_length) < + std::string((const char*)out_summaries[i].key_range.end_key, + out_summaries[i].key_range.end_key_length)); + // sanity check versions and sizes + CHECK(out_summaries[i].snapshot_version <= originalReadVersion); + CHECK(out_summaries[i].delta_version <= originalReadVersion); + CHECK(out_summaries[i].snapshot_version <= out_summaries[i].delta_version); + CHECK(out_summaries[i].snapshot_size > 0); + CHECK(out_summaries[i].delta_size >= 0); + } + + // Ranges themselves are sorted and contiguous + for (int i = 0; i < out_count - 1; i++) { + CHECK(std::string((const char*)out_summaries[i].key_range.end_key, + out_summaries[i].key_range.end_key_length) == + std::string((const char*)out_summaries[i + 1].key_range.begin_key, + out_summaries[i + 1].key_range.begin_key_length)); + } + + tr.reset(); + break; + } } int main(int argc, char** argv) { @@ -2926,7 +2979,7 @@ int main(int argc, char** argv) { << std::endl; return 1; } - fdb_check(fdb_select_api_version(720)); + fdb_check(fdb_select_api_version(FDB_API_VERSION)); if (argc >= 4) { std::string externalClientLibrary = argv[3]; if (externalClientLibrary.substr(0, 2) != "--") { diff --git a/bindings/c/test/workloads/SimpleWorkload.cpp b/bindings/c/test/workloads/SimpleWorkload.cpp index 95be0ddd59..4dd7a800fe 100644 --- a/bindings/c/test/workloads/SimpleWorkload.cpp +++ b/bindings/c/test/workloads/SimpleWorkload.cpp @@ -266,7 +266,7 @@ struct SimpleWorkload final : FDBWorkload { insertsPerTx = context->getOption("insertsPerTx", 100ul); opsPerTx = context->getOption("opsPerTx", 100ul); runFor = context->getOption("runFor", 10.0); - auto err = fdb_select_api_version(720); + auto err = fdb_select_api_version(FDB_API_VERSION); if (err) { context->trace( FDBSeverity::Info, "SelectAPIVersionFailed", { { "Error", std::string(fdb_get_error(err)) } }); diff --git a/bindings/flow/DirectoryLayer.actor.cpp b/bindings/flow/DirectoryLayer.actor.cpp index 056b203a2e..13d3970ed3 100644 --- a/bindings/flow/DirectoryLayer.actor.cpp +++ b/bindings/flow/DirectoryLayer.actor.cpp @@ -23,17 +23,17 @@ namespace FDB { const uint8_t DirectoryLayer::LITTLE_ENDIAN_LONG_ONE[8] = { 1, 0, 0, 0, 0, 0, 0, 0 }; -const StringRef DirectoryLayer::HIGH_CONTENTION_KEY = LiteralStringRef("hca"); -const StringRef DirectoryLayer::LAYER_KEY = LiteralStringRef("layer"); -const StringRef DirectoryLayer::VERSION_KEY = LiteralStringRef("version"); +const StringRef DirectoryLayer::HIGH_CONTENTION_KEY = "hca"_sr; +const StringRef DirectoryLayer::LAYER_KEY = "layer"_sr; +const StringRef DirectoryLayer::VERSION_KEY = "version"_sr; const int64_t DirectoryLayer::SUB_DIR_KEY = 0; const uint32_t DirectoryLayer::VERSION[3] = { 1, 0, 0 }; -const StringRef DirectoryLayer::DEFAULT_NODE_SUBSPACE_PREFIX = LiteralStringRef("\xfe"); +const StringRef DirectoryLayer::DEFAULT_NODE_SUBSPACE_PREFIX = "\xfe"_sr; const Subspace DirectoryLayer::DEFAULT_NODE_SUBSPACE = Subspace(DEFAULT_NODE_SUBSPACE_PREFIX); const Subspace DirectoryLayer::DEFAULT_CONTENT_SUBSPACE = Subspace(); -const StringRef 
DirectoryLayer::PARTITION_LAYER = LiteralStringRef("partition"); +const StringRef DirectoryLayer::PARTITION_LAYER = "partition"_sr; DirectoryLayer::DirectoryLayer(Subspace nodeSubspace, Subspace contentSubspace, bool allowManualPrefixes) : rootNode(nodeSubspace.get(nodeSubspace.key())), nodeSubspace(nodeSubspace), contentSubspace(contentSubspace), diff --git a/bindings/flow/FDBLoanerTypes.h b/bindings/flow/FDBLoanerTypes.h index 01000f6e27..ddd9a577b5 100644 --- a/bindings/flow/FDBLoanerTypes.h +++ b/bindings/flow/FDBLoanerTypes.h @@ -31,7 +31,7 @@ typedef Standalone Key; typedef Standalone Value; inline Key keyAfter(const KeyRef& key) { - if (key == LiteralStringRef("\xff\xff")) + if (key == "\xff\xff"_sr) return key; Standalone r; @@ -43,7 +43,7 @@ inline Key keyAfter(const KeyRef& key) { } inline KeyRef keyAfter(const KeyRef& key, Arena& arena) { - if (key == LiteralStringRef("\xff\xff")) + if (key == "\xff\xff"_sr) return key; uint8_t* t = new (arena) uint8_t[key.size() + 1]; memcpy(t, key.begin(), key.size()); diff --git a/bindings/flow/Tuple.h b/bindings/flow/Tuple.h index a7feab9419..536d90bb41 100644 --- a/bindings/flow/Tuple.h +++ b/bindings/flow/Tuple.h @@ -24,7 +24,9 @@ #pragma once #include "bindings/flow/fdb_flow.h" -#include "fdbclient/Versionstamp.h" +#include "fdbclient/TupleVersionstamp.h" + +typedef TupleVersionstamp Versionstamp; namespace FDB { struct Uuid { diff --git a/bindings/flow/fdb_flow.actor.cpp b/bindings/flow/fdb_flow.actor.cpp index 72ee49dcf4..d6e1431c77 100644 --- a/bindings/flow/fdb_flow.actor.cpp +++ b/bindings/flow/fdb_flow.actor.cpp @@ -38,7 +38,7 @@ THREAD_FUNC networkThread(void* fdb) { } ACTOR Future _test() { - API* fdb = FDB::API::selectAPIVersion(720); + API* fdb = FDB::API::selectAPIVersion(FDB_API_VERSION); auto db = fdb->createDatabase(); state Reference tr = db->createTransaction(); @@ -63,15 +63,14 @@ ACTOR Future _test() { // wait( waitForAllReady( versions ) ); printf("Elapsed: %lf\n", timer_monotonic() - starttime); - tr->set(LiteralStringRef("foo"), LiteralStringRef("bar")); + tr->set("foo"_sr, "bar"_sr); - Optional> v = wait(tr->get(LiteralStringRef("foo"))); + Optional> v = wait(tr->get("foo"_sr)); if (v.present()) { printf("%s\n", v.get().toString().c_str()); } - FDBStandalone r = - wait(tr->getRange(KeyRangeRef(LiteralStringRef("a"), LiteralStringRef("z")), 100)); + FDBStandalone r = wait(tr->getRange(KeyRangeRef("a"_sr, "z"_sr), 100)); for (auto kv : r) { printf("%s is %s\n", kv.key.toString().c_str(), kv.value.toString().c_str()); @@ -82,7 +81,7 @@ ACTOR Future _test() { } void fdb_flow_test() { - API* fdb = FDB::API::selectAPIVersion(720); + API* fdb = FDB::API::selectAPIVersion(FDB_API_VERSION); fdb->setupNetwork(); startThread(networkThread, fdb); diff --git a/bindings/flow/tester/DirectoryTester.actor.cpp b/bindings/flow/tester/DirectoryTester.actor.cpp index b21da1097c..a8fabdca4c 100644 --- a/bindings/flow/tester/DirectoryTester.actor.cpp +++ b/bindings/flow/tester/DirectoryTester.actor.cpp @@ -545,11 +545,10 @@ struct DirectoryLogDirectoryFunc : InstructionFunc { pathTuple.append(p, true); } - instruction->tr->set(logSubspace.pack(LiteralStringRef("path"), true), pathTuple.pack()); - instruction->tr->set(logSubspace.pack(LiteralStringRef("layer"), true), - Tuple().append(directory->getLayer()).pack()); - instruction->tr->set(logSubspace.pack(LiteralStringRef("exists"), true), Tuple().append(exists ? 
1 : 0).pack()); - instruction->tr->set(logSubspace.pack(LiteralStringRef("children"), true), childrenTuple.pack()); + instruction->tr->set(logSubspace.pack("path"_sr, true), pathTuple.pack()); + instruction->tr->set(logSubspace.pack("layer"_sr, true), Tuple().append(directory->getLayer()).pack()); + instruction->tr->set(logSubspace.pack("exists"_sr, true), Tuple().append(exists ? 1 : 0).pack()); + instruction->tr->set(logSubspace.pack("children"_sr, true), childrenTuple.pack()); return Void(); } diff --git a/bindings/flow/tester/Tester.actor.cpp b/bindings/flow/tester/Tester.actor.cpp index 941e1b97b2..f300127e5d 100644 --- a/bindings/flow/tester/Tester.actor.cpp +++ b/bindings/flow/tester/Tester.actor.cpp @@ -470,12 +470,12 @@ ACTOR Future> waitForVoid(Future f) { try { wait(f); Tuple t; - t.append(LiteralStringRef("RESULT_NOT_PRESENT")); + t.append("RESULT_NOT_PRESENT"_sr); return t.pack(); } catch (Error& e) { // printf("FDBError1:%d\n", e.code()); Tuple t; - t.append(LiteralStringRef("ERROR")); + t.append("ERROR"_sr); t.append(format("%d", e.code())); // pack above as error string into another tuple Tuple ret; @@ -493,7 +493,7 @@ ACTOR Future> waitForValue(Future> f } catch (Error& e) { // printf("FDBError2:%d\n", e.code()); Tuple t; - t.append(LiteralStringRef("ERROR")); + t.append("ERROR"_sr); t.append(format("%d", e.code())); // pack above as error string into another tuple Tuple ret; @@ -509,7 +509,7 @@ ACTOR Future> waitForValue(Future> waitForValue(Future> getKey(Future> f, Stan } catch (Error& e) { // printf("FDBError4:%d\n", e.code()); Tuple t; - t.append(LiteralStringRef("ERROR")); + t.append("ERROR"_sr); t.append(format("%d", e.code())); // pack above as error string into another tuple Tuple ret; @@ -670,7 +670,7 @@ struct GetEstimatedRangeSize : InstructionFunc { state Standalone endKey = Tuple::unpack(s2).getString(0); Future fsize = instruction->tr->getEstimatedRangeSizeBytes(KeyRangeRef(beginKey, endKey)); int64_t size = wait(fsize); - data->stack.pushTuple(LiteralStringRef("GOT_ESTIMATED_RANGE_SIZE")); + data->stack.pushTuple("GOT_ESTIMATED_RANGE_SIZE"_sr); return Void(); } @@ -698,7 +698,7 @@ struct GetRangeSplitPoints : InstructionFunc { Future>> fsplitPoints = instruction->tr->getRangeSplitPoints(KeyRangeRef(beginKey, endKey), chunkSize); FDBStandalone> splitPoints = wait(fsplitPoints); - data->stack.pushTuple(LiteralStringRef("GOT_RANGE_SPLIT_POINTS")); + data->stack.pushTuple("GOT_RANGE_SPLIT_POINTS"_sr); return Void(); } @@ -743,7 +743,7 @@ struct GetReadVersionFunc : InstructionFunc { ACTOR static Future call(Reference data, Reference instruction) { Version v = wait(instruction->tr->getReadVersion()); data->lastVersion = v; - data->stack.pushTuple(LiteralStringRef("GOT_READ_VERSION")); + data->stack.pushTuple("GOT_READ_VERSION"_sr); return Void(); } }; @@ -767,7 +767,7 @@ struct GetCommittedVersionFunc : InstructionFunc { static Future call(Reference const& data, Reference const& instruction) { data->lastVersion = instruction->tr->getCommittedVersion(); - data->stack.pushTuple(LiteralStringRef("GOT_COMMITTED_VERSION")); + data->stack.pushTuple("GOT_COMMITTED_VERSION"_sr); return Void(); } }; @@ -781,7 +781,7 @@ struct GetApproximateSizeFunc : InstructionFunc { ACTOR static Future call(Reference data, Reference instruction) { int64_t _ = wait(instruction->tr->getApproximateSize()); (void)_; // disable unused variable warning - data->stack.pushTuple(LiteralStringRef("GOT_APPROXIMATE_SIZE")); + data->stack.pushTuple("GOT_APPROXIMATE_SIZE"_sr); return Void(); } }; @@ 
-1485,7 +1485,7 @@ struct ReadConflictKeyFunc : InstructionFunc { // printf("=========READ_CONFLICT_KEY:%s\n", printable(key).c_str()); instruction->tr->addReadConflictKey(key); - data->stack.pushTuple(LiteralStringRef("SET_CONFLICT_KEY")); + data->stack.pushTuple("SET_CONFLICT_KEY"_sr); return Void(); } }; @@ -1506,7 +1506,7 @@ struct WriteConflictKeyFunc : InstructionFunc { // printf("=========WRITE_CONFLICT_KEY:%s\n", printable(key).c_str()); instruction->tr->addWriteConflictKey(key); - data->stack.pushTuple(LiteralStringRef("SET_CONFLICT_KEY")); + data->stack.pushTuple("SET_CONFLICT_KEY"_sr); return Void(); } }; @@ -1529,7 +1529,7 @@ struct ReadConflictRangeFunc : InstructionFunc { // printf("=========READ_CONFLICT_RANGE:%s:%s\n", printable(begin).c_str(), printable(end).c_str()); instruction->tr->addReadConflictRange(KeyRange(KeyRangeRef(begin, end))); - data->stack.pushTuple(LiteralStringRef("SET_CONFLICT_RANGE")); + data->stack.pushTuple("SET_CONFLICT_RANGE"_sr); return Void(); } }; @@ -1553,7 +1553,7 @@ struct WriteConflictRangeFunc : InstructionFunc { // printf("=========WRITE_CONFLICT_RANGE:%s:%s\n", printable(begin).c_str(), printable(end).c_str()); instruction->tr->addWriteConflictRange(KeyRange(KeyRangeRef(begin, end))); - data->stack.pushTuple(LiteralStringRef("SET_CONFLICT_RANGE")); + data->stack.pushTuple("SET_CONFLICT_RANGE"_sr); return Void(); } }; @@ -1643,10 +1643,8 @@ struct UnitTestsFunc : InstructionFunc { Optional(StringRef((const uint8_t*)&locationCacheSize, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_MAX_WATCHES, Optional(StringRef((const uint8_t*)&maxWatches, 8))); - data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_DATACENTER_ID, - Optional(LiteralStringRef("dc_id"))); - data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_MACHINE_ID, - Optional(LiteralStringRef("machine_id"))); + data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_DATACENTER_ID, Optional("dc_id"_sr)); + data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_MACHINE_ID, Optional("machine_id"_sr)); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_ENABLE); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_DISABLE); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_LOGGING_MAX_FIELD_LENGTH, @@ -1685,13 +1683,13 @@ struct UnitTestsFunc : InstructionFunc { Optional(StringRef((const uint8_t*)&maxRetryDelay, 8))); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_USED_DURING_COMMIT_PROTECTION_DISABLE); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_TRANSACTION_LOGGING_ENABLE, - Optional(LiteralStringRef("my_transaction"))); + Optional("my_transaction"_sr)); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_READ_LOCK_AWARE); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_LOCK_AWARE); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_INCLUDE_PORT_IN_ADDRESS); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_REPORT_CONFLICTING_KEYS); - Optional> _ = wait(tr->get(LiteralStringRef("\xff"))); + Optional> _ = wait(tr->get("\xff"_sr)); tr->cancel(); return Void(); @@ -1724,13 +1722,13 @@ ACTOR static Future doInstructions(Reference data) { Tuple opTuple = Tuple::unpack(data->instructions[idx].value); state Standalone op = opTuple.getString(0); - state bool isDatabase = op.endsWith(LiteralStringRef("_DATABASE")); - state bool isSnapshot = op.endsWith(LiteralStringRef("_SNAPSHOT")); - state bool isDirectory = op.startsWith(LiteralStringRef("DIRECTORY_")); + state bool isDatabase 
= op.endsWith("_DATABASE"_sr); + state bool isSnapshot = op.endsWith("_SNAPSHOT"_sr); + state bool isDirectory = op.startsWith("DIRECTORY_"_sr); try { if (LOG_INSTRUCTIONS) { - if (op != LiteralStringRef("SWAP") && op != LiteralStringRef("PUSH")) { + if (op != "SWAP"_sr && op != "PUSH"_sr) { printf("%zu. %s\n", idx, tupleToString(opTuple).c_str()); fflush(stdout); } @@ -1773,7 +1771,7 @@ ACTOR static Future doInstructions(Reference data) { if (opsThatCreateDirectories.count(op.toString())) { data->directoryData.directoryList.push_back(DirectoryOrSubspace()); } - data->stack.pushTuple(LiteralStringRef("DIRECTORY_ERROR")); + data->stack.pushTuple("DIRECTORY_ERROR"_sr); } else { data->stack.pushError(e.code()); } @@ -1873,7 +1871,7 @@ ACTOR void _test_versionstamp() { try { g_network = newNet2(TLSConfig()); - API* fdb = FDB::API::selectAPIVersion(720); + API* fdb = FDB::API::selectAPIVersion(FDB_API_VERSION); fdb->setupNetwork(); startThread(networkThread, fdb); @@ -1883,15 +1881,14 @@ ACTOR void _test_versionstamp() { state Future> ftrVersion = tr->getVersionstamp(); - tr->atomicOp(LiteralStringRef("foo"), - LiteralStringRef("blahblahbl\x00\x00\x00\x00"), - FDBMutationType::FDB_MUTATION_TYPE_SET_VERSIONSTAMPED_VALUE); + tr->atomicOp( + "foo"_sr, "blahblahbl\x00\x00\x00\x00"_sr, FDBMutationType::FDB_MUTATION_TYPE_SET_VERSIONSTAMPED_VALUE); wait(tr->commit()); // should use retry loop tr->reset(); - Optional> optionalDbVersion = wait(tr->get(LiteralStringRef("foo"))); + Optional> optionalDbVersion = wait(tr->get("foo"_sr)); state FDBStandalone dbVersion = optionalDbVersion.get(); FDBStandalone trVersion = wait(ftrVersion); diff --git a/bindings/flow/tester/Tester.actor.h b/bindings/flow/tester/Tester.actor.h index 63fc9fe9a3..f42f8d74ee 100644 --- a/bindings/flow/tester/Tester.actor.h +++ b/bindings/flow/tester/Tester.actor.h @@ -71,7 +71,7 @@ struct FlowTesterStack { void pushError(int errorCode) { FDB::Tuple t; - t.append(LiteralStringRef("ERROR")); + t.append("ERROR"_sr); t.append(format("%d", errorCode)); // pack above as error string into another tuple pushTuple(t.pack().toString()); diff --git a/bindings/go/src/fdb/fdb.go b/bindings/go/src/fdb/fdb.go index 7800dd9bf5..e308049be0 100644 --- a/bindings/go/src/fdb/fdb.go +++ b/bindings/go/src/fdb/fdb.go @@ -128,7 +128,7 @@ func APIVersion(version int) error { return errAPIVersionAlreadySet } - if version < 200 || version > 720 { + if version < 200 || version > headerVersion { return errAPIVersionNotSupported } diff --git a/bindings/go/src/fdb/fdb_test.go b/bindings/go/src/fdb/fdb_test.go index 976a3ec9d0..00b3f41304 100644 --- a/bindings/go/src/fdb/fdb_test.go +++ b/bindings/go/src/fdb/fdb_test.go @@ -29,10 +29,12 @@ import ( "github.com/apple/foundationdb/bindings/go/src/fdb" ) +const API_VERSION int = 720 + func ExampleOpenDefault() { var e error - e = fdb.APIVersion(720) + e = fdb.APIVersion(API_VERSION) if e != nil { fmt.Printf("Unable to set API version: %v\n", e) return @@ -52,7 +54,7 @@ func ExampleOpenDefault() { } func TestVersionstamp(t *testing.T) { - fdb.MustAPIVersion(720) + fdb.MustAPIVersion(API_VERSION) db := fdb.MustOpenDefault() setVs := func(t fdb.Transactor, key fdb.Key) (fdb.FutureKey, error) { @@ -98,7 +100,7 @@ func TestVersionstamp(t *testing.T) { } func TestReadTransactionOptions(t *testing.T) { - fdb.MustAPIVersion(720) + fdb.MustAPIVersion(API_VERSION) db := fdb.MustOpenDefault() _, e := db.ReadTransact(func(rtr fdb.ReadTransaction) (interface{}, error) { rtr.Options().SetAccessSystemKeys() @@ -110,7 +112,7 @@ 
func TestReadTransactionOptions(t *testing.T) { } func ExampleTransactor() { - fdb.MustAPIVersion(720) + fdb.MustAPIVersion(API_VERSION) db := fdb.MustOpenDefault() setOne := func(t fdb.Transactor, key fdb.Key, value []byte) error { @@ -161,7 +163,7 @@ func ExampleTransactor() { } func ExampleReadTransactor() { - fdb.MustAPIVersion(720) + fdb.MustAPIVersion(API_VERSION) db := fdb.MustOpenDefault() getOne := func(rt fdb.ReadTransactor, key fdb.Key) ([]byte, error) { @@ -214,7 +216,7 @@ func ExampleReadTransactor() { } func ExamplePrefixRange() { - fdb.MustAPIVersion(720) + fdb.MustAPIVersion(API_VERSION) db := fdb.MustOpenDefault() tr, e := db.CreateTransaction() @@ -253,7 +255,7 @@ func ExamplePrefixRange() { } func ExampleRangeIterator() { - fdb.MustAPIVersion(720) + fdb.MustAPIVersion(API_VERSION) db := fdb.MustOpenDefault() tr, e := db.CreateTransaction() diff --git a/bindings/go/src/fdb/generated.go b/bindings/go/src/fdb/generated.go index c3a7873466..b765e09508 100644 --- a/bindings/go/src/fdb/generated.go +++ b/bindings/go/src/fdb/generated.go @@ -102,6 +102,11 @@ func (o NetworkOptions) SetTraceFileIdentifier(param string) error { return o.setOpt(36, []byte(param)) } +// Use the same base trace file name for all client threads as it did before version 7.2. The current default behavior is to use distinct trace file names for client threads by including their version and thread index. +func (o NetworkOptions) SetTraceShareAmongClientThreads() error { + return o.setOpt(37, nil) +} + // Set file suffix for partially written log files. // // Parameter: Append this suffix to partially written log files. When a log file is complete, it is renamed to remove the suffix. No separator is added between the file and the suffix. If you want to add a file extension, you should include the separator - e.g. '.tmp' instead of 'tmp' to add the 'tmp' extension. @@ -261,6 +266,11 @@ func (o NetworkOptions) SetEnableRunLoopProfiling() error { return o.setOpt(71, nil) } +// Prevents the multi-version client API from being disabled, even if no external clients are configured. This option is required to use GRV caching. +func (o NetworkOptions) SetDisableClientBypass() error { + return o.setOpt(72, nil) +} + // Enable client buggify - will make requests randomly fail (intended for client testing) func (o NetworkOptions) SetClientBuggifyEnable() error { return o.setOpt(80, nil) @@ -617,11 +627,18 @@ func (o TransactionOptions) SetBypassUnreadable() error { return o.setOpt(1100, nil) } -// Allows this transaction to use cached GRV from the database context. Defaults to off. Upon first usage, starts a background updater to periodically update the cache to avoid stale read versions. +// Allows this transaction to use cached GRV from the database context. Defaults to off. Upon first usage, starts a background updater to periodically update the cache to avoid stale read versions. The disable_client_bypass option must also be set. func (o TransactionOptions) SetUseGrvCache() error { return o.setOpt(1101, nil) } +// Attach given authorization token to the transaction such that subsequent tenant-aware requests are authorized +// +// Parameter: A JSON Web Token authorized to access data belonging to one or more tenants, indicated by 'tenants' claim of the token's payload. 
+func (o TransactionOptions) SetAuthorizationToken(param string) error { + return o.setOpt(2000, []byte(param)) +} + type StreamingMode int const ( diff --git a/bindings/java/CMakeLists.txt b/bindings/java/CMakeLists.txt index 7057f22384..6b2be4dd32 100644 --- a/bindings/java/CMakeLists.txt +++ b/bindings/java/CMakeLists.txt @@ -140,20 +140,19 @@ vexillographer_compile(TARGET fdb_java_options LANG java OUT ${GENERATED_JAVA_DI OUTPUT ${GENERATED_JAVA_FILES}) set(SYSTEM_NAME "linux") -if (APPLE) +if(APPLE) set(SYSTEM_NAME "osx") endif() if(OPEN_FOR_IDE) add_library(fdb_java OBJECT fdbJNI.cpp) - add_library(java_workloads OBJECT JavaWorkload.cpp) else() add_library(fdb_java SHARED fdbJNI.cpp) add_library(java_workloads SHARED JavaWorkload.cpp) target_link_libraries(java_workloads PRIVATE fdb_java_native) endif() -if (NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) +if(NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) target_link_options(java_workloads PRIVATE "LINKER:--version-script=${CMAKE_SOURCE_DIR}/bindings/c/external_workload.map,-z,nodelete") endif() @@ -164,11 +163,13 @@ target_link_libraries(fdb_java PRIVATE fdb_java_native) if(APPLE) set_target_properties(fdb_java PROPERTIES SUFFIX ".jnilib") endif() -set_target_properties(java_workloads PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/share/foundationdb") -target_link_libraries(java_workloads PUBLIC fdb_c ${JNI_LIBRARIES}) -target_link_libraries(java_workloads PRIVATE flow) # mostly for boost -target_include_directories(java_workloads PUBLIC ${JNI_INCLUDE_DIRS}) +if(NOT OPEN_FOR_IDE) + set_target_properties(java_workloads PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/share/foundationdb") + target_link_libraries(java_workloads PUBLIC fdb_c ${JNI_LIBRARIES}) + target_link_libraries(java_workloads PRIVATE flow) # mostly for boost + target_include_directories(java_workloads PUBLIC ${JNI_INCLUDE_DIRS}) +endif() set(CMAKE_JAVA_COMPILE_FLAGS "-source" "1.8" "-target" "1.8" "-XDignore.symbol.file") set(CMAKE_JNI_TARGET TRUE) @@ -240,18 +241,18 @@ if(NOT OPEN_FOR_IDE) set(lib_destination "osx/x86_64") endif() else() - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(lib_destination "linux/aarch64") - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le") - set(lib_destination "linux/ppc64le") - else() - set(lib_destination "linux/amd64") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + set(lib_destination "linux/aarch64") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le") + set(lib_destination "linux/ppc64le") + else() + set(lib_destination "linux/amd64") endif() endif() - set(lib_destination "${unpack_dir}/lib/${lib_destination}") - set(jni_package "${CMAKE_BINARY_DIR}/packages/lib") + set(lib_destination "${unpack_dir}/lib/${lib_destination}") + set(jni_package "${CMAKE_BINARY_DIR}/packages/lib") file(MAKE_DIRECTORY ${lib_destination}) - file(MAKE_DIRECTORY ${jni_package}) + file(MAKE_DIRECTORY ${jni_package}) add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/lib_copied COMMAND ${CMAKE_COMMAND} -E copy $ ${lib_destination} && ${CMAKE_COMMAND} -E copy $ ${jni_package} && @@ -290,7 +291,7 @@ if(NOT OPEN_FOR_IDE) set(TEST_CP ${tests_jar} ${target_jar}) if(RUN_JUNIT_TESTS OR RUN_JAVA_INTEGRATION_TESTS) - if (USE_SANITIZER) + if(USE_SANITIZER) message(WARNING "Cannot run java tests with sanitizer builds") return() endif() @@ -299,7 +300,7 @@ if(NOT OPEN_FOR_IDE) file(DOWNLOAD "https://search.maven.org/remotecontent?filepath=org/junit/jupiter/junit-jupiter-engine/5.7.1/junit-jupiter-engine-5.7.1.jar" 
${CMAKE_BINARY_DIR}/packages/junit-jupiter-engine-5.7.1.jar EXPECTED_HASH SHA256=56616c9350b3624f76cffef6b24ce7bb222915bfd5688f96d3cf4cef34f077cb) - # https://search.maven.org/remotecontent?filepath=org/junit/jupiter/junit-jupiter-api/5.7.1/junit-jupiter-api-5.7.1.jar + # https://search.maven.org/remotecontent?filepath=org/junit/jupiter/junit-jupiter-api/5.7.1/junit-jupiter-api-5.7.1.jar file(DOWNLOAD "https://search.maven.org/remotecontent?filepath=org/junit/jupiter/junit-jupiter-api/5.7.1/junit-jupiter-api-5.7.1.jar" ${CMAKE_BINARY_DIR}/packages/junit-jupiter-api-5.7.1.jar EXPECTED_HASH SHA256=ce7b985bc469e2625759a4ebc45533c70581a05a348278c1d6408e9b2e35e314) @@ -350,20 +351,20 @@ if(NOT OPEN_FOR_IDE) # can be found at https://cmake.org/cmake/help/v3.19/manual/ctest.1.html) add_jar(fdb-junit SOURCES ${JAVA_JUNIT_TESTS} ${JUNIT_RESOURCES} INCLUDE_JARS fdb-java - ${CMAKE_BINARY_DIR}/packages/junit-jupiter-api-5.7.1.jar - ${CMAKE_BINARY_DIR}/packages/junit-jupiter-engine-5.7.1.jar - ${CMAKE_BINARY_DIR}/packages/junit-jupiter-params-5.7.1.jar - ${CMAKE_BINARY_DIR}/packages/opentest4j-1.2.0.jar - ${CMAKE_BINARY_DIR}/packages/apiguardian-api-1.1.1.jar - ) + ${CMAKE_BINARY_DIR}/packages/junit-jupiter-api-5.7.1.jar + ${CMAKE_BINARY_DIR}/packages/junit-jupiter-engine-5.7.1.jar + ${CMAKE_BINARY_DIR}/packages/junit-jupiter-params-5.7.1.jar + ${CMAKE_BINARY_DIR}/packages/opentest4j-1.2.0.jar + ${CMAKE_BINARY_DIR}/packages/apiguardian-api-1.1.1.jar + ) get_property(junit_jar_path TARGET fdb-junit PROPERTY JAR_FILE) add_test(NAME java-unit COMMAND ${Java_JAVA_EXECUTABLE} - -classpath "${target_jar}:${junit_jar_path}:${JUNIT_CLASSPATH}" - -Djava.library.path=${CMAKE_BINARY_DIR}/lib - org.junit.platform.console.ConsoleLauncher "--details=summary" "-class-path=${junit_jar_path}" "--scan-classpath" "--disable-banner" - ) + -classpath "${target_jar}:${junit_jar_path}:${JUNIT_CLASSPATH}" + -Djava.library.path=${CMAKE_BINARY_DIR}/lib + org.junit.platform.console.ConsoleLauncher "--details=summary" "-class-path=${junit_jar_path}" "--scan-classpath" "--disable-banner" + ) endif() @@ -393,28 +394,28 @@ if(NOT OPEN_FOR_IDE) # the directory layer with a unique path, etc.) 
# add_jar(fdb-integration SOURCES ${JAVA_INTEGRATION_TESTS} ${JAVA_INTEGRATION_RESOURCES} INCLUDE_JARS fdb-java - ${CMAKE_BINARY_DIR}/packages/junit-jupiter-api-5.7.1.jar - ${CMAKE_BINARY_DIR}/packages/junit-jupiter-engine-5.7.1.jar - ${CMAKE_BINARY_DIR}/packages/junit-jupiter-params-5.7.1.jar - ${CMAKE_BINARY_DIR}/packages/opentest4j-1.2.0.jar - ${CMAKE_BINARY_DIR}/packages/apiguardian-api-1.1.1.jar) + ${CMAKE_BINARY_DIR}/packages/junit-jupiter-api-5.7.1.jar + ${CMAKE_BINARY_DIR}/packages/junit-jupiter-engine-5.7.1.jar + ${CMAKE_BINARY_DIR}/packages/junit-jupiter-params-5.7.1.jar + ${CMAKE_BINARY_DIR}/packages/opentest4j-1.2.0.jar + ${CMAKE_BINARY_DIR}/packages/apiguardian-api-1.1.1.jar) get_property(integration_jar_path TARGET fdb-integration PROPERTY JAR_FILE) # add_fdbclient_test will set FDB_CLUSTER_FILE if it's not set already add_fdbclient_test(NAME java-integration - COMMAND ${Java_JAVA_EXECUTABLE} - -classpath "${target_jar}:${integration_jar_path}:${JUNIT_CLASSPATH}" - -Djava.library.path=${CMAKE_BINARY_DIR}/lib - org.junit.platform.console.ConsoleLauncher "--details=summary" "--class-path=${integration_jar_path}" "--scan-classpath" "--disable-banner" "-T MultiClient" - ) + COMMAND ${Java_JAVA_EXECUTABLE} + -classpath "${target_jar}:${integration_jar_path}:${JUNIT_CLASSPATH}" + -Djava.library.path=${CMAKE_BINARY_DIR}/lib + org.junit.platform.console.ConsoleLauncher "--details=summary" "--class-path=${integration_jar_path}" "--scan-classpath" "--disable-banner" "-T MultiClient" + ) add_multi_fdbclient_test(NAME java-multi-integration - COMMAND ${Java_JAVA_EXECUTABLE} - -classpath "${target_jar}:${integration_jar_path}:${JUNIT_CLASSPATH}" - -Djava.library.path=${CMAKE_BINARY_DIR}/lib - org.junit.platform.console.ConsoleLauncher "--details=summary" "--class-path=${integration_jar_path}" "--scan-classpath" "--disable-banner" "-t MultiClient" - ) + COMMAND ${Java_JAVA_EXECUTABLE} + -classpath "${target_jar}:${integration_jar_path}:${JUNIT_CLASSPATH}" + -Djava.library.path=${CMAKE_BINARY_DIR}/lib + org.junit.platform.console.ConsoleLauncher "--details=summary" "--class-path=${integration_jar_path}" "--scan-classpath" "--disable-banner" "-t MultiClient" + ) endif() endif() diff --git a/bindings/java/JavaWorkload.cpp b/bindings/java/JavaWorkload.cpp index e9bf309fa4..6c65313f4b 100644 --- a/bindings/java/JavaWorkload.cpp +++ b/bindings/java/JavaWorkload.cpp @@ -379,7 +379,7 @@ struct JVM { jmethodID selectMethod = env->GetStaticMethodID(fdbClass, "selectAPIVersion", "(I)Lcom/apple/foundationdb/FDB;"); checkException(); - auto fdbInstance = env->CallStaticObjectMethod(fdbClass, selectMethod, jint(720)); + auto fdbInstance = env->CallStaticObjectMethod(fdbClass, selectMethod, jint(FDB_API_VERSION)); checkException(); env->CallObjectMethod(fdbInstance, getMethod(fdbClass, "disableShutdownHook", "()V")); checkException(); diff --git a/bindings/java/fdbJNI.cpp b/bindings/java/fdbJNI.cpp index c2b5ea90cc..660e6945fa 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -1037,7 +1037,7 @@ JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1verify return 0; } - FDBFuture* f = fdb_database_list_blobbified_ranges( + FDBFuture* f = fdb_database_verify_blob_range( tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), version); jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT); diff --git 
a/bindings/java/src/integration/com/apple/foundationdb/CycleMultiClientIntegrationTest.java b/bindings/java/src/integration/com/apple/foundationdb/CycleMultiClientIntegrationTest.java index 5087361c43..80afa6c761 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/CycleMultiClientIntegrationTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/CycleMultiClientIntegrationTest.java @@ -40,6 +40,8 @@ import org.junit.jupiter.api.Assertions; * This test is to verify the atomicity of transactions. */ public class CycleMultiClientIntegrationTest { + public static final int API_VERSION = 720; + public static final MultiClientHelper clientHelper = new MultiClientHelper(); // more write txn than validate txn, as parent thread waits only for validate txn. @@ -51,7 +53,7 @@ public class CycleMultiClientIntegrationTest { private static List expected = new ArrayList<>(Arrays.asList("0", "1", "2", "3")); public static void main(String[] args) throws Exception { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); setupThreads(fdb); Collection dbs = clientHelper.openDatabases(fdb); // the clientHelper will close the databases for us System.out.println("Starting tests"); diff --git a/bindings/java/src/integration/com/apple/foundationdb/DirectoryTest.java b/bindings/java/src/integration/com/apple/foundationdb/DirectoryTest.java index 59fbc3fe55..b91c9e7de3 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/DirectoryTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/DirectoryTest.java @@ -40,7 +40,8 @@ import org.junit.jupiter.api.extension.ExtendWith; */ @ExtendWith(RequiresDatabase.class) class DirectoryTest { - private static final FDB fdb = FDB.selectAPIVersion(720); + public static final int API_VERSION = 720; + private static final FDB fdb = FDB.selectAPIVersion(API_VERSION); @Test void testCanCreateDirectory() throws Exception { diff --git a/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java b/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java index 063e9e276d..3aedef4d1e 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java @@ -41,7 +41,8 @@ import org.junit.jupiter.api.extension.ExtendWith; @ExtendWith(RequiresDatabase.class) class MappedRangeQueryIntegrationTest { - private static final FDB fdb = FDB.selectAPIVersion(720); + public static final int API_VERSION = 720; + private static final FDB fdb = FDB.selectAPIVersion(API_VERSION); public String databaseArg = null; private Database openFDB() { return fdb.open(databaseArg); } @@ -110,7 +111,7 @@ class MappedRangeQueryIntegrationTest { boolean validate = true; @Test void comparePerformance() { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try (Database db = openFDB()) { insertRecordsWithIndexes(numRecords, db); instrument(rangeQueryAndThenRangeQueries, "rangeQueryAndThenRangeQueries", db); diff --git a/bindings/java/src/integration/com/apple/foundationdb/RangeQueryIntegrationTest.java b/bindings/java/src/integration/com/apple/foundationdb/RangeQueryIntegrationTest.java index bc64877199..fb6d3afd9f 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/RangeQueryIntegrationTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/RangeQueryIntegrationTest.java @@ -41,7 +41,8 
@@ import org.junit.jupiter.api.extension.ExtendWith; */ @ExtendWith(RequiresDatabase.class) class RangeQueryIntegrationTest { - private static final FDB fdb = FDB.selectAPIVersion(720); + public static final int API_VERSION = 720; + private static final FDB fdb = FDB.selectAPIVersion(API_VERSION); @BeforeEach @AfterEach diff --git a/bindings/java/src/integration/com/apple/foundationdb/RepeatableReadMultiThreadClientTest.java b/bindings/java/src/integration/com/apple/foundationdb/RepeatableReadMultiThreadClientTest.java index c11940d41a..ab8ab1364a 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/RepeatableReadMultiThreadClientTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/RepeatableReadMultiThreadClientTest.java @@ -41,6 +41,8 @@ import org.junit.jupiter.api.Assertions; * are still seeting the initialValue even after new transactions set them to a new value. */ public class RepeatableReadMultiThreadClientTest { + public static final int API_VERSION = 720; + public static final MultiClientHelper clientHelper = new MultiClientHelper(); private static final int oldValueReadCount = 30; @@ -52,7 +54,7 @@ public class RepeatableReadMultiThreadClientTest { private static final Map threadToOldValueReaders = new HashMap<>(); public static void main(String[] args) throws Exception { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); setupThreads(fdb); Collection dbs = clientHelper.openDatabases(fdb); // the clientHelper will close the databases for us System.out.println("Starting tests"); diff --git a/bindings/java/src/integration/com/apple/foundationdb/RequiresDatabase.java b/bindings/java/src/integration/com/apple/foundationdb/RequiresDatabase.java index 785008902c..ead1f499c1 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/RequiresDatabase.java +++ b/bindings/java/src/integration/com/apple/foundationdb/RequiresDatabase.java @@ -47,6 +47,7 @@ import org.opentest4j.TestAbortedException; * be running a server and you don't want to deal with spurious test failures. */ public class RequiresDatabase implements ExecutionCondition, BeforeAllCallback { + public static final int API_VERSION = 720; public static boolean canRunIntegrationTest() { String prop = System.getProperty("run.integration.tests"); @@ -80,7 +81,7 @@ public class RequiresDatabase implements ExecutionCondition, BeforeAllCallback { * assume that if we are here, then canRunIntegrationTest() is returning true and we don't have to bother * checking it. */ - try (Database db = FDB.selectAPIVersion(720).open()) { + try (Database db = FDB.selectAPIVersion(API_VERSION).open()) { db.run(tr -> { CompletableFuture future = tr.get("test".getBytes()); diff --git a/bindings/java/src/integration/com/apple/foundationdb/SidebandMultiThreadClientTest.java b/bindings/java/src/integration/com/apple/foundationdb/SidebandMultiThreadClientTest.java index 4a4736d566..30f86632eb 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/SidebandMultiThreadClientTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/SidebandMultiThreadClientTest.java @@ -19,6 +19,8 @@ import org.junit.jupiter.api.Assertions; * This test is to verify the causal consistency of transactions for mutli-threaded client. 
*/ public class SidebandMultiThreadClientTest { + public static final int API_VERSION = 720; + public static final MultiClientHelper clientHelper = new MultiClientHelper(); private static final Map> db2Queues = new HashMap<>(); @@ -26,7 +28,7 @@ public class SidebandMultiThreadClientTest { private static final int txnCnt = 1000; public static void main(String[] args) throws Exception { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); setupThreads(fdb); Collection dbs = clientHelper.openDatabases(fdb); // the clientHelper will close the databases for us for (Database db : dbs) { diff --git a/bindings/java/src/junit/com/apple/foundationdb/FDBLibraryRule.java b/bindings/java/src/junit/com/apple/foundationdb/FDBLibraryRule.java index 6e53b179e5..fc54f6c333 100644 --- a/bindings/java/src/junit/com/apple/foundationdb/FDBLibraryRule.java +++ b/bindings/java/src/junit/com/apple/foundationdb/FDBLibraryRule.java @@ -29,6 +29,8 @@ import org.junit.jupiter.api.extension.ExtensionContext; * are not available for any reason. */ public class FDBLibraryRule implements BeforeAllCallback { + public static final int CURRENT_API_VERSION = 720; + private final int apiVersion; // because FDB is a singleton (currently), this isn't a super-useful cache, @@ -37,7 +39,7 @@ public class FDBLibraryRule implements BeforeAllCallback { public FDBLibraryRule(int apiVersion) { this.apiVersion = apiVersion; } - public static FDBLibraryRule current() { return new FDBLibraryRule(720); } + public static FDBLibraryRule current() { return new FDBLibraryRule(CURRENT_API_VERSION); } public static FDBLibraryRule v63() { return new FDBLibraryRule(630); } diff --git a/bindings/java/src/main/com/apple/foundationdb/Database.java b/bindings/java/src/main/com/apple/foundationdb/Database.java index 6be76fdf32..5100dec392 100644 --- a/bindings/java/src/main/com/apple/foundationdb/Database.java +++ b/bindings/java/src/main/com/apple/foundationdb/Database.java @@ -161,6 +161,19 @@ public interface Database extends AutoCloseable, TransactionContext { */ double getMainThreadBusyness(); + /** + * Runs {@link #purgeBlobGranules(Function)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param force if true delete all data, if not keep data >= purgeVersion + * + * @return the key to watch for purge complete + */ + default CompletableFuture purgeBlobGranules(byte[] beginKey, byte[] endKey, boolean force) { + return purgeBlobGranules(beginKey, endKey, -2, force, getExecutor()); + } + /** * Runs {@link #purgeBlobGranules(Function)} on the default executor. * @@ -242,7 +255,7 @@ public interface Database extends AutoCloseable, TransactionContext { } /** - * Sets a range to be unblobbified in the database. + * Unsets a blobbified range in the database. The range must be aligned to known blob ranges. * * @param beginKey start of the key range * @param endKey end of the key range @@ -260,7 +273,7 @@ public interface Database extends AutoCloseable, TransactionContext { * @param rangeLimit batch size * @param e the {@link Executor} to use for asynchronous callbacks - * @return a future with the list of blobbified ranges. 
+ * @return a future with the list of blobbified ranges: [lastLessThan(beginKey), firstGreaterThanOrEqual(endKey)] */ default CompletableFuture listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit) { return listBlobbifiedRanges(beginKey, endKey, rangeLimit, getExecutor()); @@ -274,10 +287,22 @@ public interface Database extends AutoCloseable, TransactionContext { * @param rangeLimit batch size * @param e the {@link Executor} to use for asynchronous callbacks - * @return a future with the list of blobbified ranges. + * @return a future with the list of blobbified ranges: [lastLessThan(beginKey), firstGreaterThanOrEqual(endKey)] */ CompletableFuture listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit, Executor e); + /** + * Runs {@link #verifyBlobRange(Function)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * + * @return a future with the version of the last blob granule. + */ + default CompletableFuture verifyBlobRange(byte[] beginKey, byte[] endKey) { + return verifyBlobRange(beginKey, endKey, -2, getExecutor()); + } + /** * Runs {@link #verifyBlobRange(Function)} on the default executor. * diff --git a/bindings/java/src/main/com/apple/foundationdb/FDB.java b/bindings/java/src/main/com/apple/foundationdb/FDB.java index 5215d0836e..47ba2eead1 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDB.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDB.java @@ -191,11 +191,6 @@ public class FDB { Select_API_version(version); singleton = new FDB(version); - if (version < 720) { - TenantManagement.TENANT_MAP_PREFIX = ByteArrayUtil.join(new byte[] { (byte)255, (byte)255 }, - "/management/tenant_map/".getBytes()); - } - return singleton; } diff --git a/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java b/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java index 12aaf70322..58f223fa4b 100644 --- a/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java +++ b/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java @@ -262,7 +262,7 @@ public class TenantManagement { this.begin = ByteArrayUtil.join(TENANT_MAP_PREFIX, begin); this.end = ByteArrayUtil.join(TENANT_MAP_PREFIX, end); - tr.options().setReadSystemKeys(); + tr.options().setRawAccess(); tr.options().setLockAware(); firstGet = tr.getRange(this.begin, this.end, limit); diff --git a/bindings/java/src/main/overview.html.in b/bindings/java/src/main/overview.html.in index 3154efbfc3..a37a8859f9 100644 --- a/bindings/java/src/main/overview.html.in +++ b/bindings/java/src/main/overview.html.in @@ -28,8 +28,10 @@ import com.apple.foundationdb.FDB; import com.apple.foundationdb.tuple.Tuple; public class Example { + public static final int apiVersion = 720; + public static void main(String[] args) { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(apiVersion); try(Database db = fdb.open()) { // Run an operation on the database diff --git a/bindings/java/src/test/com/apple/foundationdb/test/BlockingBenchmark.java b/bindings/java/src/test/com/apple/foundationdb/test/BlockingBenchmark.java index a1d7a4d976..425c5d2369 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/BlockingBenchmark.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/BlockingBenchmark.java @@ -29,11 +29,13 @@ import com.apple.foundationdb.FDB; import com.apple.foundationdb.Transaction; public class BlockingBenchmark { + public static final int API_VERSION = 720; + private 
static final int REPS = 100000; private static final int PARALLEL = 100; public static void main(String[] args) throws InterruptedException { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); // The cluster file DOES NOT need to be valid, although it must exist. // This is because the database is never really contacted in this test. diff --git a/bindings/java/src/test/com/apple/foundationdb/test/ConcurrentGetSetGet.java b/bindings/java/src/test/com/apple/foundationdb/test/ConcurrentGetSetGet.java index 38be19a60f..0eabef64c0 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/ConcurrentGetSetGet.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/ConcurrentGetSetGet.java @@ -30,6 +30,8 @@ import com.apple.foundationdb.Database; import com.apple.foundationdb.FDB; public class ConcurrentGetSetGet { + public static final int API_VERSION = 720; + public static final Charset UTF8 = Charset.forName("UTF-8"); final Semaphore semaphore = new Semaphore(CONCURRENCY); @@ -48,7 +50,7 @@ public class ConcurrentGetSetGet { } public static void main(String[] args) { - try(Database database = FDB.selectAPIVersion(720).open()) { + try(Database database = FDB.selectAPIVersion(API_VERSION).open()) { new ConcurrentGetSetGet().apply(database); } } diff --git a/bindings/java/src/test/com/apple/foundationdb/test/Context.java b/bindings/java/src/test/com/apple/foundationdb/test/Context.java index 151a4ba599..dbbe7e73de 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/Context.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/Context.java @@ -84,8 +84,8 @@ abstract class Context implements Runnable, AutoCloseable { try { executeOperations(); } catch(Throwable t) { - // EAT t.printStackTrace(); + System.exit(1); } while(children.size() > 0) { //System.out.println("Shutting down...waiting on " + children.size() + " threads"); @@ -147,10 +147,11 @@ abstract class Context implements Runnable, AutoCloseable { private static synchronized boolean newTransaction(Database db, Optional tenant, String trName, boolean allowReplace) { TransactionState oldState = transactionMap.get(trName); if (oldState != null) { - releaseTransaction(oldState.transaction); - } - else if (!allowReplace) { - return false; + if (allowReplace) { + releaseTransaction(oldState.transaction); + } else { + return false; + } } TransactionState newState = new TransactionState(createTransaction(db, tenant), tenant); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/Example.java b/bindings/java/src/test/com/apple/foundationdb/test/Example.java index da5bbfdc2a..d7f1336d51 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/Example.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/Example.java @@ -25,8 +25,10 @@ import com.apple.foundationdb.FDB; import com.apple.foundationdb.tuple.Tuple; public class Example { + public static final int API_VERSION = 720; + public static void main(String[] args) { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database db = fdb.open()) { // Run an operation on the database diff --git a/bindings/java/src/test/com/apple/foundationdb/test/IterableTest.java b/bindings/java/src/test/com/apple/foundationdb/test/IterableTest.java index 35adfa5e1f..2af2152cf3 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/IterableTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/IterableTest.java @@ -28,10 +28,12 @@ import 
com.apple.foundationdb.KeyValue; import com.apple.foundationdb.TransactionContext; public class IterableTest { + public static final int API_VERSION = 720; + public static void main(String[] args) throws InterruptedException { final int reps = 1000; try { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database db = fdb.open()) { runTests(reps, db); } diff --git a/bindings/java/src/test/com/apple/foundationdb/test/LocalityTests.java b/bindings/java/src/test/com/apple/foundationdb/test/LocalityTests.java index 6410165f27..969b6c75e3 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/LocalityTests.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/LocalityTests.java @@ -32,9 +32,10 @@ import com.apple.foundationdb.async.AsyncUtil; import com.apple.foundationdb.tuple.ByteArrayUtil; public class LocalityTests { + public static final int API_VERSION = 720; public static void main(String[] args) { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database database = fdb.open(args[0])) { try(Transaction tr = database.createTransaction()) { String[] keyAddresses = LocalityUtil.getAddressesForKey(tr, "a".getBytes()).join(); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/ParallelRandomScan.java b/bindings/java/src/test/com/apple/foundationdb/test/ParallelRandomScan.java index 6518116324..e5e0c9f9e6 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/ParallelRandomScan.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/ParallelRandomScan.java @@ -36,6 +36,8 @@ import com.apple.foundationdb.async.AsyncIterator; import com.apple.foundationdb.tuple.ByteArrayUtil; public class ParallelRandomScan { + public static final int API_VERSION = 720; + private static final int ROWS = 1000000; private static final int DURATION_MS = 2000; private static final int PARALLELISM_MIN = 10; @@ -43,7 +45,7 @@ public class ParallelRandomScan { private static final int PARALLELISM_STEP = 5; public static void main(String[] args) throws InterruptedException { - FDB api = FDB.selectAPIVersion(720); + FDB api = FDB.selectAPIVersion(API_VERSION); try(Database database = api.open(args[0])) { for(int i = PARALLELISM_MIN; i <= PARALLELISM_MAX; i += PARALLELISM_STEP) { runTest(database, i, ROWS, DURATION_MS); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/SerialInsertion.java b/bindings/java/src/test/com/apple/foundationdb/test/SerialInsertion.java index d847556cdc..e11f8b9793 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/SerialInsertion.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/SerialInsertion.java @@ -29,12 +29,14 @@ import com.apple.foundationdb.FDB; import com.apple.foundationdb.Transaction; public class SerialInsertion { + public static final int API_VERSION = 720; + private static final int THREAD_COUNT = 10; private static final int BATCH_SIZE = 1000; private static final int NODES = 1000000; public static void main(String[] args) { - FDB api = FDB.selectAPIVersion(720); + FDB api = FDB.selectAPIVersion(API_VERSION); try(Database database = api.open()) { long start = System.currentTimeMillis(); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/SerialIteration.java b/bindings/java/src/test/com/apple/foundationdb/test/SerialIteration.java index 6e262e561f..f55af41c35 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/SerialIteration.java +++ 
b/bindings/java/src/test/com/apple/foundationdb/test/SerialIteration.java @@ -34,12 +34,14 @@ import com.apple.foundationdb.Transaction; import com.apple.foundationdb.async.AsyncIterable; public class SerialIteration { + public static final int API_VERSION = 720; + private static final int ROWS = 1000000; private static final int RUNS = 25; private static final int THREAD_COUNT = 1; public static void main(String[] args) throws InterruptedException { - FDB api = FDB.selectAPIVersion(720); + FDB api = FDB.selectAPIVersion(API_VERSION); try(Database database = api.open(args[0])) { for(int i = 1; i <= THREAD_COUNT; i++) { runThreadedTest(database, i); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/SerialTest.java b/bindings/java/src/test/com/apple/foundationdb/test/SerialTest.java index 9313543d02..ea3210e2de 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/SerialTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/SerialTest.java @@ -27,10 +27,12 @@ import com.apple.foundationdb.FDB; import com.apple.foundationdb.TransactionContext; public class SerialTest { + public static final int API_VERSION = 720; + public static void main(String[] args) throws InterruptedException { final int reps = 1000; try { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database db = fdb.open()) { runTests(reps, db); } diff --git a/bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java b/bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java index 1f3aec5501..6fdee20cad 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java @@ -35,11 +35,13 @@ import com.apple.foundationdb.tuple.Tuple; * Some tests regarding conflict ranges to make sure they do what we expect. 
*/ public class SnapshotTransactionTest { + public static final int API_VERSION = 720; + private static final int CONFLICT_CODE = 1020; private static final Subspace SUBSPACE = new Subspace(Tuple.from("test", "conflict_ranges")); public static void main(String[] args) { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database db = fdb.open()) { snapshotReadShouldNotConflict(db); snapshotShouldNotAddConflictRange(db); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java index 4fa45f7cbe..2ce8e76343 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java @@ -32,12 +32,14 @@ import com.apple.foundationdb.tuple.Tuple; import com.apple.foundationdb.tuple.Versionstamp; public class TupleTest { + public static final int API_VERSION = 720; + private static final byte FF = (byte)0xff; public static void main(String[] args) throws NoSuchFieldException { final int reps = 1000; try { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database db = fdb.open()) { runTests(reps, db); } diff --git a/bindings/java/src/test/com/apple/foundationdb/test/VersionstampSmokeTest.java b/bindings/java/src/test/com/apple/foundationdb/test/VersionstampSmokeTest.java index b39744dd32..421db6c542 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/VersionstampSmokeTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/VersionstampSmokeTest.java @@ -31,8 +31,10 @@ import com.apple.foundationdb.tuple.Tuple; import com.apple.foundationdb.tuple.Versionstamp; public class VersionstampSmokeTest { + public static final int API_VERSION = 720; + public static void main(String[] args) { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database db = fdb.open()) { db.run(tr -> { tr.clear(Tuple.from("prefix").range()); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/WatchTest.java b/bindings/java/src/test/com/apple/foundationdb/test/WatchTest.java index 29e05db04e..ef8db81fd4 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/WatchTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/WatchTest.java @@ -32,9 +32,10 @@ import com.apple.foundationdb.FDBException; import com.apple.foundationdb.Transaction; public class WatchTest { + public static final int API_VERSION = 720; public static void main(String[] args) { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database database = fdb.open(args[0])) { database.options().setLocationCacheSize(42); try(Transaction tr = database.createTransaction()) { diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt index 4f5022aa4e..af281a7405 100644 --- a/bindings/python/CMakeLists.txt +++ b/bindings/python/CMakeLists.txt @@ -75,38 +75,3 @@ add_custom_command(OUTPUT ${package_file} add_custom_target(python_package DEPENDS ${package_file}) add_dependencies(python_package python_binding) add_dependencies(packages python_package) - -if (NOT WIN32 AND NOT OPEN_FOR_IDE) - add_fdbclient_test( - NAME single_process_fdbcli_tests - COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py - ${CMAKE_BINARY_DIR} - @CLUSTER_FILE@ - ) - add_fdbclient_test( - NAME multi_process_fdbcli_tests - PROCESS_NUMBER 5 - COMMAND 
${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py - ${CMAKE_BINARY_DIR} - @CLUSTER_FILE@ - 5 - ) - if (TARGET external_client) # external_client copies fdb_c to bindings/c/libfdb_c_external.so - add_fdbclient_test( - NAME single_process_external_client_fdbcli_tests - COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py - ${CMAKE_BINARY_DIR} - @CLUSTER_FILE@ - --external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so - ) - add_fdbclient_test( - NAME multi_process_external_client_fdbcli_tests - PROCESS_NUMBER 5 - COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py - ${CMAKE_BINARY_DIR} - @CLUSTER_FILE@ - 5 - --external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so - ) - endif() -endif() diff --git a/bindings/python/fdb/__init__.py b/bindings/python/fdb/__init__.py index e7d1a8bc30..930ad35396 100644 --- a/bindings/python/fdb/__init__.py +++ b/bindings/python/fdb/__init__.py @@ -100,10 +100,8 @@ def api_version(ver): _add_symbols(fdb.impl, list) - if ver >= 710: + if ver >= 630: import fdb.tenant_management - if ver < 720: - fdb.tenant_management._tenant_map_prefix = b'\xff\xff/management/tenant_map/' if ver < 610: globals()["init"] = getattr(fdb.impl, "init") diff --git a/bindings/python/fdb/impl.py b/bindings/python/fdb/impl.py index 51d67e5162..aa967ba25d 100644 --- a/bindings/python/fdb/impl.py +++ b/bindings/python/fdb/impl.py @@ -1359,7 +1359,7 @@ else: except: # The system python on OS X can't find the library installed to /usr/local/lib if SIP is enabled # find_library does find the location in /usr/local/lib, so if the above fails fallback to using it - lib_path = ctypes.util.find_library(capi_name) + lib_path = ctypes.util.find_library("fdb_c") if lib_path is not None: try: _capi = ctypes.CDLL(lib_path) diff --git a/bindings/python/fdb/tenant_management.py b/bindings/python/fdb/tenant_management.py index 84c3a46d03..ebe36594a5 100644 --- a/bindings/python/fdb/tenant_management.py +++ b/bindings/python/fdb/tenant_management.py @@ -103,7 +103,7 @@ class FDBTenantList(object): # JSON strings of the tenant metadata @_impl.transactional def _list_tenants_impl(tr, begin, end, limit): - tr.options.set_read_system_keys() + tr.options.set_raw_access() begin_key = b'%s%s' % (_tenant_map_prefix, begin) end_key = b'%s%s' % (_tenant_map_prefix, end) diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index 78de24355d..786126359b 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -198,16 +198,17 @@ function(stage_correctness_package) set(src_dir "${src_dir}/") string(SUBSTRING ${src_dir} ${dir_len} -1 dest_dir) string(SUBSTRING ${file} ${dir_len} -1 rel_out_file) - set(out_file ${STAGE_OUT_DIR}/${rel_out_file}) + set(out_file ${STAGE_OUT_DIR}/${rel_out_file}) list(APPEND external_files ${out_file}) - add_custom_command( + add_custom_command( OUTPUT ${out_file} - DEPENDS ${file} - COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file} - COMMENT "Copying ${STAGE_CONTEXT} external file ${file}" - ) + DEPENDS ${file} + COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file} + COMMENT "Copying ${STAGE_CONTEXT} external file ${file}" + ) endforeach() endforeach() + list(APPEND package_files ${STAGE_OUT_DIR}/bin/fdbserver ${STAGE_OUT_DIR}/bin/coverage.fdbserver.xml ${STAGE_OUT_DIR}/bin/coverage.fdbclient.xml @@ -217,6 +218,7 @@ function(stage_correctness_package) ${STAGE_OUT_DIR}/bin/TraceLogHelper.dll ${STAGE_OUT_DIR}/CMakeCache.txt ) + add_custom_command( OUTPUT ${package_files} DEPENDS 
${CMAKE_BINARY_DIR}/CMakeCache.txt @@ -238,6 +240,20 @@ function(stage_correctness_package) ${STAGE_OUT_DIR}/bin COMMENT "Copying files for ${STAGE_CONTEXT} package" ) + + set(test_harness_dir "${CMAKE_SOURCE_DIR}/contrib/TestHarness2") + file(GLOB_RECURSE test_harness2_files RELATIVE "${test_harness_dir}" CONFIGURE_DEPENDS "${test_harness_dir}/*.py") + foreach(file IN LISTS test_harness2_files) + set(src_file "${test_harness_dir}/${file}") + set(out_file "${STAGE_OUT_DIR}/${file}") + get_filename_component(dir "${out_file}" DIRECTORY) + file(MAKE_DIRECTORY "${dir}") + add_custom_command(OUTPUT ${out_file} + COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${out_file}" + DEPENDS "${src_file}") + list(APPEND package_files "${out_file}") + endforeach() + list(APPEND package_files ${test_files} ${external_files}) if(STAGE_OUT_FILES) set(${STAGE_OUT_FILES} ${package_files} PARENT_SCOPE) @@ -449,7 +465,11 @@ function(add_fdbclient_test) set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT ${T_TEST_TIMEOUT}) else() # default timeout - set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 300) + if(USE_SANITIZER) + set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 1200) + else() + set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 300) + endif() endif() set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") endfunction() diff --git a/cmake/CompileBoost.cmake b/cmake/CompileBoost.cmake index de5560439e..47d3fb2996 100644 --- a/cmake/CompileBoost.cmake +++ b/cmake/CompileBoost.cmake @@ -36,7 +36,7 @@ function(compile_boost) set(B2_COMMAND "./b2") set(BOOST_COMPILER_FLAGS -fvisibility=hidden -fPIC -std=c++17 -w) set(BOOST_LINK_FLAGS "") - if(APPLE OR CLANG OR ICX OR USE_LIBCXX) + if(APPLE OR ICX OR USE_LIBCXX) list(APPEND BOOST_COMPILER_FLAGS -stdlib=libc++ -nostdlib++) list(APPEND BOOST_LINK_FLAGS -lc++ -lc++abi) if (NOT APPLE) @@ -57,19 +57,27 @@ function(compile_boost) # Build boost include(ExternalProject) + set(BOOST_INSTALL_DIR "${CMAKE_BINARY_DIR}/boost_install") ExternalProject_add("${COMPILE_BOOST_TARGET}Project" - URL "https://boostorg.jfrog.io/artifactory/main/release/1.78.0/source/boost_1_78_0.tar.bz2" - URL_HASH SHA256=8681f175d4bdb26c52222665793eef08490d7758529330f98d3b29dd0735bccc - CONFIGURE_COMMAND ${BOOTSTRAP_COMMAND} ${BOOTSTRAP_ARGS} --with-libraries=${BOOTSTRAP_LIBRARIES} --with-toolset=${BOOST_TOOLSET} - BUILD_COMMAND ${B2_COMMAND} link=static ${COMPILE_BOOST_BUILD_ARGS} --prefix=${BOOST_INSTALL_DIR} ${USER_CONFIG_FLAG} install - BUILD_IN_SOURCE ON - INSTALL_COMMAND "" - UPDATE_COMMAND "" - BUILD_BYPRODUCTS "${BOOST_INSTALL_DIR}/boost/config.hpp" - "${BOOST_INSTALL_DIR}/lib/libboost_context.a" - "${BOOST_INSTALL_DIR}/lib/libboost_filesystem.a" - "${BOOST_INSTALL_DIR}/lib/libboost_iostreams.a") + URL "https://boostorg.jfrog.io/artifactory/main/release/1.78.0/source/boost_1_78_0.tar.bz2" + URL_HASH SHA256=8681f175d4bdb26c52222665793eef08490d7758529330f98d3b29dd0735bccc + CONFIGURE_COMMAND ${BOOTSTRAP_COMMAND} + ${BOOTSTRAP_ARGS} + --with-libraries=${BOOTSTRAP_LIBRARIES} + --with-toolset=${BOOST_TOOLSET} + BUILD_COMMAND ${B2_COMMAND} + link=static + ${COMPILE_BOOST_BUILD_ARGS} + --prefix=${BOOST_INSTALL_DIR} + ${USER_CONFIG_FLAG} install + BUILD_IN_SOURCE ON + INSTALL_COMMAND "" + UPDATE_COMMAND "" + BUILD_BYPRODUCTS "${BOOST_INSTALL_DIR}/boost/config.hpp" + "${BOOST_INSTALL_DIR}/lib/libboost_context.a" + "${BOOST_INSTALL_DIR}/lib/libboost_filesystem.a" + "${BOOST_INSTALL_DIR}/lib/libboost_iostreams.a") add_library(${COMPILE_BOOST_TARGET}_context STATIC 
IMPORTED) add_dependencies(${COMPILE_BOOST_TARGET}_context ${COMPILE_BOOST_TARGET}Project) @@ -133,7 +141,7 @@ if(WIN32) return() endif() -find_package(Boost 1.78.0 EXACT QUIET COMPONENTS context filesystem CONFIG PATHS ${BOOST_HINT_PATHS}) +find_package(Boost 1.78.0 EXACT QUIET COMPONENTS context filesystem iostreams CONFIG PATHS ${BOOST_HINT_PATHS}) set(FORCE_BOOST_BUILD OFF CACHE BOOL "Forces cmake to build boost and ignores any installed boost") if(Boost_FOUND AND Boost_filesystem_FOUND AND Boost_context_FOUND AND Boost_iostreams_FOUND AND NOT FORCE_BOOST_BUILD) diff --git a/cmake/CompileRocksDB.cmake b/cmake/CompileRocksDB.cmake index 4634e57e7c..3fdea389ab 100644 --- a/cmake/CompileRocksDB.cmake +++ b/cmake/CompileRocksDB.cmake @@ -4,30 +4,42 @@ find_package(RocksDB 6.27.3) include(ExternalProject) -if (RocksDB_FOUND) +set(RocksDB_CMAKE_ARGS + -DUSE_RTTI=1 + -DPORTABLE=${PORTABLE_ROCKSDB} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_SHARED_LINKER_FLAGS=${CMAKE_SHARED_LINKER_FLAGS} + -DCMAKE_STATIC_LINKER_FLAGS=${CMAKE_STATIC_LINKER_FLAGS} + -DCMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DFAIL_ON_WARNINGS=OFF + -DWITH_GFLAGS=OFF + -DWITH_TESTS=OFF + -DWITH_TOOLS=OFF + -DWITH_CORE_TOOLS=OFF + -DWITH_BENCHMARK_TOOLS=OFF + -DWITH_BZ2=OFF + -DWITH_LZ4=ON + -DWITH_SNAPPY=OFF + -DWITH_ZLIB=OFF + -DWITH_ZSTD=OFF + -DWITH_LIBURING=${WITH_LIBURING} + -DWITH_TSAN=${USE_TSAN} + -DWITH_ASAN=${USE_ASAN} + -DWITH_UBSAN=${USE_UBSAN} + -DROCKSDB_BUILD_SHARED=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=True +) + +if(ROCKSDB_FOUND) ExternalProject_Add(rocksdb SOURCE_DIR "${RocksDB_ROOT}" DOWNLOAD_COMMAND "" - CMAKE_ARGS -DUSE_RTTI=1 -DPORTABLE=${PORTABLE_ROCKSDB} - -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DWITH_GFLAGS=OFF - -DWITH_TESTS=OFF - -DWITH_TOOLS=OFF - -DWITH_CORE_TOOLS=OFF - -DWITH_BENCHMARK_TOOLS=OFF - -DWITH_BZ2=OFF - -DWITH_LZ4=ON - -DWITH_SNAPPY=OFF - -DWITH_ZLIB=OFF - -DWITH_ZSTD=OFF - -DWITH_LIBURING=${WITH_LIBURING} - -DWITH_TSAN=${USE_TSAN} - -DWITH_ASAN=${USE_ASAN} - -DWITH_UBSAN=${USE_UBSAN} - -DROCKSDB_BUILD_SHARED=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=True + CMAKE_ARGS ${RocksDB_CMAKE_ARGS} BUILD_BYPRODUCTS /librocksdb.a INSTALL_COMMAND "" ) @@ -37,28 +49,9 @@ if (RocksDB_FOUND) ${BINARY_DIR}/librocksdb.a) else() ExternalProject_Add(rocksdb - URL https://github.com/facebook/rocksdb/archive/refs/tags/v6.27.3.tar.gz - URL_HASH SHA256=ee29901749b9132692b26f0a6c1d693f47d1a9ed8e3771e60556afe80282bf58 - CMAKE_ARGS -DUSE_RTTI=1 -DPORTABLE=${PORTABLE_ROCKSDB} - -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DWITH_GFLAGS=OFF - -DWITH_TESTS=OFF - -DWITH_TOOLS=OFF - -DWITH_CORE_TOOLS=OFF - -DWITH_BENCHMARK_TOOLS=OFF - -DWITH_BZ2=OFF - -DWITH_LZ4=ON - -DWITH_SNAPPY=OFF - -DWITH_ZLIB=OFF - -DWITH_ZSTD=OFF - -DWITH_LIBURING=${WITH_LIBURING} - -DWITH_TSAN=${USE_TSAN} - -DWITH_ASAN=${USE_ASAN} - -DWITH_UBSAN=${USE_UBSAN} - -DROCKSDB_BUILD_SHARED=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=True + URL https://github.com/facebook/rocksdb/archive/refs/tags/v6.27.3.tar.gz + URL_HASH SHA256=ee29901749b9132692b26f0a6c1d693f47d1a9ed8e3771e60556afe80282bf58 + CMAKE_ARGS ${RocksDB_CMAKE_ARGS} BUILD_BYPRODUCTS /librocksdb.a 
INSTALL_COMMAND "" ) @@ -68,7 +61,7 @@ else() ${BINARY_DIR}/librocksdb.a) ExternalProject_Get_Property(rocksdb SOURCE_DIR) - set (ROCKSDB_INCLUDE_DIR "${SOURCE_DIR}/include") + set(ROCKSDB_INCLUDE_DIR "${SOURCE_DIR}/include") set(ROCKSDB_FOUND TRUE) endif() diff --git a/cmake/CompileZstd.cmake b/cmake/CompileZstd.cmake new file mode 100644 index 0000000000..94e1fdb0ff --- /dev/null +++ b/cmake/CompileZstd.cmake @@ -0,0 +1,27 @@ +# Compile zstd + +function(compile_zstd) + + include(FetchContent) + + FetchContent_Declare(ZSTD + GIT_REPOSITORY https://github.com/facebook/zstd.git + GIT_TAG v1.5.2 + SOURCE_SUBDIR "build/cmake" + ) + + FetchContent_GetProperties(ZSTD) + if (NOT zstd_POPULATED) + FetchContent_Populate(ZSTD) + + add_subdirectory(${zstd_SOURCE_DIR}/build/cmake ${zstd_BINARY_DIR}) + + if (CLANG) + target_compile_options(zstd PRIVATE -Wno-array-bounds -Wno-tautological-compare) + target_compile_options(libzstd_static PRIVATE -Wno-array-bounds -Wno-tautological-compare) + target_compile_options(zstd-frugal PRIVATE -Wno-array-bounds -Wno-tautological-compare) + endif() + endif() + + set(ZSTD_LIB_INCLUDE_DIR ${zstd_SOURCE_DIR}/lib CACHE INTERNAL ZSTD_LIB_INCLUDE_DIR) +endfunction(compile_zstd) diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index 493153f259..e38f333b58 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -25,6 +25,7 @@ env_set(STATIC_LINK_LIBCXX "${_static_link_libcxx}" BOOL "Statically link libstd env_set(TRACE_PC_GUARD_INSTRUMENTATION_LIB "" STRING "Path to a library containing an implementation for __sanitizer_cov_trace_pc_guard. See https://clang.llvm.org/docs/SanitizerCoverage.html for more info.") env_set(PROFILE_INSTR_GENERATE OFF BOOL "If set, build FDB as an instrumentation build to generate profiles") env_set(PROFILE_INSTR_USE "" STRING "If set, build FDB with profile") +env_set(FULL_DEBUG_SYMBOLS OFF BOOL "Generate full debug symbols") set(USE_SANITIZER OFF) if(USE_ASAN OR USE_VALGRIND OR USE_MSAN OR USE_TSAN OR USE_UBSAN) @@ -164,9 +165,20 @@ else() set(SANITIZER_COMPILE_OPTIONS) set(SANITIZER_LINK_OPTIONS) - # we always compile with debug symbols. CPack will strip them out + # we always compile with debug symbols. For release builds CPack will strip them out # and create a debuginfo rpm - add_compile_options(-ggdb -fno-omit-frame-pointer) + add_compile_options(-fno-omit-frame-pointer -gz) + add_link_options(-gz) + if(FDB_RELEASE OR FULL_DEBUG_SYMBOLS OR CMAKE_BUILD_TYPE STREQUAL "Debug") + # Configure with FULL_DEBUG_SYMBOLS=ON to generate all symbols for debugging with gdb + # Also generating full debug symbols in release builds, because they are packaged + # separately and installed optionally + add_compile_options(-ggdb) + else() + # Generating minimal debug symbols by default. 
They are sufficient for testing purposes + add_compile_options(-ggdb1) + endif() + if(TRACE_PC_GUARD_INSTRUMENTATION_LIB) add_compile_options(-fsanitize-coverage=trace-pc-guard) link_libraries(${TRACE_PC_GUARD_INSTRUMENTATION_LIB}) @@ -201,6 +213,8 @@ else() -fsanitize=undefined # TODO(atn34) Re-enable -fsanitize=alignment once https://github.com/apple/foundationdb/issues/1434 is resolved -fno-sanitize=alignment + # https://github.com/apple/foundationdb/issues/7955 + -fno-sanitize=function -DBOOST_USE_UCONTEXT) list(APPEND SANITIZER_LINK_OPTIONS -fsanitize=undefined) endif() @@ -278,16 +292,35 @@ else() #add_compile_options(-fno-builtin-memcpy) if (CLANG OR ICX) - add_compile_options() if (APPLE OR USE_LIBCXX) - add_compile_options($<$:-stdlib=libc++>) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") if (NOT APPLE) if (STATIC_LINK_LIBCXX) - add_link_options(-static-libgcc -nostdlib++ -Wl,-Bstatic -lc++ -lc++abi -Wl,-Bdynamic) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -nostdlib++ -Wl,-Bstatic -lc++ -lc++abi -Wl,-Bdynamic") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libgcc -nostdlib++ -Wl,-Bstatic -lc++ -lc++abi -Wl,-Bdynamic") + else() + # Make sure that libc++ can be found by the platform's loader, so that things like cmake's "try_run" work. + find_library(LIBCXX_SO_PATH c++ /usr/local/lib) + if (LIBCXX_SO_PATH) + get_filename_component(LIBCXX_SO_DIR ${LIBCXX_SO_PATH} DIRECTORY) + if (APPLE) + set(ENV{DYLD_LIBRARY_PATH} "$ENV{DYLD_LIBRARY_PATH}:${LIBCXX_SO_DIR}") + elseif(WIN32) + set(ENV{PATH} "$ENV{PATH};${LIBCXX_SO_DIR}") + else() + set(ENV{LD_LIBRARY_PATH} "$ENV{LD_LIBRARY_PATH}:${LIBCXX_SO_DIR}") + endif() + endif() endif() - add_link_options(-stdlib=libc++ -Wl,-build-id=sha1) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -Wl,-build-id=sha1") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -stdlib=libc++ -Wl,-build-id=sha1") endif() endif() + if (NOT APPLE AND NOT USE_LIBCXX) + message(STATUS "Linking libatomic") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -latomic") + endif() if (OPEN_FOR_IDE) add_compile_options( -Wno-unknown-attributes) @@ -305,11 +338,19 @@ else() -Wno-unknown-warning-option -Wno-unused-parameter -Wno-constant-logical-operand + # These need to be disabled for FDB's RocksDB storage server implementation + -Wno-deprecated-copy + -Wno-delete-non-abstract-non-virtual-dtor + -Wno-range-loop-construct + -Wno-reorder-ctor + # Needed for clang 13 (todo: Update above logic so that it figures out when to pass in -static-libstdc++ and when it will be ignored) + # When you remove this, you might need to move it back to the USE_CCACHE stanza. It was (only) there before I moved it here. + -Wno-unused-command-line-argument ) if (USE_CCACHE) add_compile_options( -Wno-register - -Wno-unused-command-line-argument) + ) endif() if (PROFILE_INSTR_GENERATE) add_compile_options(-fprofile-instr-generate) diff --git a/cmake/FDBComponents.cmake b/cmake/FDBComponents.cmake index 02f0aa2a16..208ac2c3e3 100644 --- a/cmake/FDBComponents.cmake +++ b/cmake/FDBComponents.cmake @@ -178,7 +178,7 @@ set(PORTABLE_ROCKSDB ON CACHE BOOL "Compile RocksDB in portable mode") # Set thi set(WITH_LIBURING OFF CACHE BOOL "Build with liburing enabled") # Set this to ON to include liburing # RocksDB is currently enabled by default for GCC but does not build with the latest # Clang.
-if (SSD_ROCKSDB_EXPERIMENTAL AND GCC) +if (SSD_ROCKSDB_EXPERIMENTAL AND NOT WIN32) set(WITH_ROCKSDB_EXPERIMENTAL ON) else() set(WITH_ROCKSDB_EXPERIMENTAL OFF) @@ -200,6 +200,9 @@ else() URL "https://github.com/ToruNiina/toml11/archive/v3.4.0.tar.gz" URL_HASH SHA256=bc6d733efd9216af8c119d8ac64a805578c79cc82b813e4d1d880ca128bd154d CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DCMAKE_C_COMPILER:FILEPATH=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER:FILEPATH=${CMAKE_CXX_COMPILER} -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_CURRENT_BINARY_DIR}/toml11 -Dtoml11_BUILD_TEST:BOOL=OFF BUILD_ALWAYS ON) diff --git a/cmake/Jemalloc.cmake b/cmake/Jemalloc.cmake index bfdd2f5898..8d04ebccca 100644 --- a/cmake/Jemalloc.cmake +++ b/cmake/Jemalloc.cmake @@ -14,7 +14,7 @@ ExternalProject_add(Jemalloc_project BUILD_BYPRODUCTS "${JEMALLOC_DIR}/include/jemalloc/jemalloc.h" "${JEMALLOC_DIR}/lib/libjemalloc.a" "${JEMALLOC_DIR}/lib/libjemalloc_pic.a" - CONFIGURE_COMMAND ./configure --prefix=${JEMALLOC_DIR} --enable-static --disable-cxx --enable-prof + CONFIGURE_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ./configure --prefix=${JEMALLOC_DIR} --enable-static --disable-cxx --enable-prof BUILD_IN_SOURCE ON BUILD_COMMAND make INSTALL_DIR "${JEMALLOC_DIR}" @@ -24,4 +24,4 @@ add_dependencies(im_jemalloc_pic Jemalloc_project) set_target_properties(im_jemalloc_pic PROPERTIES IMPORTED_LOCATION "${JEMALLOC_DIR}/lib/libjemalloc_pic.a") set_target_properties(im_jemalloc PROPERTIES IMPORTED_LOCATION "${JEMALLOC_DIR}/lib/libjemalloc.a") target_include_directories(jemalloc INTERFACE "${JEMALLOC_DIR}/include") -target_link_libraries(jemalloc INTERFACE im_jemalloc_pic im_jemalloc) \ No newline at end of file +target_link_libraries(jemalloc INTERFACE im_jemalloc_pic im_jemalloc) diff --git a/cmake/awssdk.cmake b/cmake/awssdk.cmake index 88cb7c78e9..ab62f9b6d6 100644 --- a/cmake/awssdk.cmake +++ b/cmake/awssdk.cmake @@ -2,16 +2,14 @@ project(awssdk-download NONE) # Compile the sdk with clang and libc++, since otherwise we get libc++ vs libstdc++ link errors when compiling fdb with clang set(AWSSDK_COMPILER_FLAGS "") -set(AWSSDK_LINK_FLAGS "") -if(APPLE OR CLANG OR USE_LIBCXX) - set(AWSSDK_COMPILER_FLAGS -stdlib=libc++ -nostdlib++) - set(AWSSDK_LINK_FLAGS -stdlib=libc++ -lc++abi) +if(APPLE OR USE_LIBCXX) + set(AWSSDK_COMPILER_FLAGS "-stdlib=libc++ -nostdlib++") endif() include(ExternalProject) ExternalProject_Add(awssdk_project GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git - GIT_TAG 2af3ce543c322cb259471b3b090829464f825972 # v1.9.200 + GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331 SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build" GIT_CONFIG advice.detachedHead=false @@ -21,11 +19,11 @@ ExternalProject_Add(awssdk_project -DSIMPLE_INSTALL=ON -DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path -DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own - + -DBUILD_CURL=ON + -DBUILD_ZLIB=ON -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_EXE_LINKER_FLAGS=${AWSSDK_COMPILER_FLAGS} - -DCMAKE_CXX_FLAGS=${AWSSDK_LINK_FLAGS} + -DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS} TEST_COMMAND "" # the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must be linked in BUILD_BYPRODUCTS 
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a" @@ -35,11 +33,14 @@ ExternalProject_Add(awssdk_project "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a" ) add_library(awssdk_core STATIC IMPORTED) @@ -75,6 +76,10 @@ add_library(awssdk_c_io STATIC IMPORTED) add_dependencies(awssdk_c_io awssdk_project) set_target_properties(awssdk_c_io PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a") +add_library(awssdk_c_sdkutils STATIC IMPORTED) +add_dependencies(awssdk_c_sdkutils awssdk_project) +set_target_properties(awssdk_c_sdkutils PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a") + add_library(awssdk_checksums STATIC IMPORTED) add_dependencies(awssdk_checksums awssdk_project) set_target_properties(awssdk_checksums PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a") @@ -91,7 +96,15 @@ add_library(awssdk_c_common STATIC IMPORTED) add_dependencies(awssdk_c_common awssdk_project) set_target_properties(awssdk_c_common PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a") +add_library(curl STATIC IMPORTED) +add_dependencies(curl awssdk_project) +set_property(TARGET curl PROPERTY IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a") + +add_library(zlib STATIC IMPORTED) +add_dependencies(zlib awssdk_project) +set_property(TARGET zlib PROPERTY IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a") + # link them all together in one interface target add_library(awssdk_target INTERFACE) target_include_directories(awssdk_target SYSTEM INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/include) -target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl) \ No newline at end of file +target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_sdkutils awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl zlib) diff --git a/cmake/user-config.jam.cmake b/cmake/user-config.jam.cmake index 6d2883cc95..696bcdd831 100644 --- a/cmake/user-config.jam.cmake +++ b/cmake/user-config.jam.cmake @@ -1 +1,2 @@ using @BOOST_TOOLSET@ : : @BOOST_CXX_COMPILER@ : @BOOST_ADDITIONAL_COMPILE_OPTIONS@ ; +using zstd : 1.5.2 : /@CMAKE_BINARY_DIR@/zstd/lib /@CMAKE_BINARY_DIR@/lib ; diff --git 
a/contrib/Joshua/scripts/bindingTestScript.sh b/contrib/Joshua/scripts/bindingTestScript.sh index f4e0e8eb8b..2d6badbe9d 100755 --- a/contrib/Joshua/scripts/bindingTestScript.sh +++ b/contrib/Joshua/scripts/bindingTestScript.sh @@ -83,6 +83,7 @@ fi # Stop the cluster if stopCluster; then unset FDBSERVERID + trap - EXIT fi exit "${status}" diff --git a/contrib/Joshua/scripts/correctnessTest.sh b/contrib/Joshua/scripts/correctnessTest.sh index a617d81088..bee09acf25 100755 --- a/contrib/Joshua/scripts/correctnessTest.sh +++ b/contrib/Joshua/scripts/correctnessTest.sh @@ -4,4 +4,6 @@ export ASAN_OPTIONS="detect_leaks=0" OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}" -mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false +#mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false + +python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} diff --git a/contrib/Joshua/scripts/correctnessTimeout.sh b/contrib/Joshua/scripts/correctnessTimeout.sh index 7917aae591..6bd0bfeee0 100755 --- a/contrib/Joshua/scripts/correctnessTimeout.sh +++ b/contrib/Joshua/scripts/correctnessTimeout.sh @@ -1,4 +1,4 @@ #!/bin/bash -u -for file in `find . -name 'trace*.xml'` ; do - mono ./bin/TestHarness.exe summarize "${file}" summary.xml "" JoshuaTimeout true -done + + +python3 -m test_harness.timeout diff --git a/contrib/Joshua/scripts/localClusterStart.sh b/contrib/Joshua/scripts/localClusterStart.sh index abbf93abc5..500e106339 100644 --- a/contrib/Joshua/scripts/localClusterStart.sh +++ b/contrib/Joshua/scripts/localClusterStart.sh @@ -210,7 +210,7 @@ function stopCluster then # Ensure that process is dead if ! kill -0 "${FDBSERVERID}" 2> /dev/null; then - log "Killed cluster (${FDBSERVERID}) via cli" + log "Killed cluster (${FDBSERVERID}) via cli" "${DEBUGLEVEL}" elif ! kill -9 "${FDBSERVERID}"; then log "Failed to kill FDB Server process (${FDBSERVERID}) via cli or kill command" let status="${status} + 1" diff --git a/contrib/Joshua/scripts/valgrindTest.sh b/contrib/Joshua/scripts/valgrindTest.sh index 5409429691..820750f3b2 100755 --- a/contrib/Joshua/scripts/valgrindTest.sh +++ b/contrib/Joshua/scripts/valgrindTest.sh @@ -1,3 +1,3 @@ #!/bin/sh OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}" -mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" true +python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} --use-valgrind diff --git a/contrib/Joshua/scripts/valgrindTimeout.sh b/contrib/Joshua/scripts/valgrindTimeout.sh index b9d9e7ebad..2224598e43 100755 --- a/contrib/Joshua/scripts/valgrindTimeout.sh +++ b/contrib/Joshua/scripts/valgrindTimeout.sh @@ -1,6 +1,2 @@ #!/bin/bash -u -for file in `find . -name 'trace*.xml'` ; do - for valgrindFile in `find . -name 'valgrind*.xml'` ; do - mono ./bin/TestHarness.exe summarize "${file}" summary.xml "${valgrindFile}" JoshuaTimeout true - done -done +python3 -m test_harness.timeout --use-valgrind diff --git a/contrib/TestHarness/Program.cs b/contrib/TestHarness/Program.cs index 1e6d802082..b3e003dee5 100644 --- a/contrib/TestHarness/Program.cs +++ b/contrib/TestHarness/Program.cs @@ -19,6 +19,7 @@ */ using System; +using System.Collections; using System.Collections.Generic; using System.Linq; using System.Text; @@ -490,6 +491,16 @@ namespace SummarizeTest useValgrind ? 
"on" : "off"); } + IDictionary data = Environment.GetEnvironmentVariables(); + foreach (DictionaryEntry i in data) + { + string k=(string)i.Key; + string v=(string)i.Value; + if (k.StartsWith("FDB_KNOB")) { + process.StartInfo.EnvironmentVariables[k]=v; + } + } + process.Start(); // SOMEDAY: Do we want to actually do anything with standard output or error? @@ -747,16 +758,28 @@ namespace SummarizeTest AppendToSummary(summaryFileName, xout); } - // Parses the valgrind XML file and returns a list of "what" tags for each error. + static string ParseValgrindStack(XElement stackElement) { + string backtrace = ""; + foreach (XElement frame in stackElement.Elements()) { + backtrace += " " + frame.Element("ip").Value.ToLower(); + } + if (backtrace.Length > 0) { + backtrace = "addr2line -e fdbserver.debug -p -C -f -i" + backtrace; + } + + return backtrace; + } + + // Parses the valgrind XML file and returns a list of error elements. // All errors for which the "kind" tag starts with "Leak" are ignored - static string[] ParseValgrindOutput(string valgrindOutputFileName, bool traceToStdout) + static XElement[] ParseValgrindOutput(string valgrindOutputFileName, bool traceToStdout) { if (!traceToStdout) { Console.WriteLine("Reading vXML file: " + valgrindOutputFileName); } - ISet whats = new HashSet(); + IList errors = new List(); XElement xdoc = XDocument.Load(valgrindOutputFileName).Element("valgrindoutput"); foreach(var elem in xdoc.Elements()) { if (elem.Name != "error") @@ -764,9 +787,29 @@ namespace SummarizeTest string kind = elem.Element("kind").Value; if(kind.StartsWith("Leak")) continue; - whats.Add(elem.Element("what").Value); + + XElement errorElement = new XElement("ValgrindError", + new XAttribute("Severity", (int)Magnesium.Severity.SevError)); + + int num = 1; + string suffix = ""; + foreach (XElement sub in elem.Elements()) { + if (sub.Name == "what") { + errorElement.SetAttributeValue("What", sub.Value); + } else if (sub.Name == "auxwhat") { + suffix = "Aux" + num++; + errorElement.SetAttributeValue("What" + suffix, sub.Value); + } else if (sub.Name == "stack") { + errorElement.SetAttributeValue("Backtrace" + suffix, ParseValgrindStack(sub)); + } else if (sub.Name == "origin") { + errorElement.SetAttributeValue("WhatOrigin", sub.Element("what").Value); + errorElement.SetAttributeValue("BacktraceOrigin", ParseValgrindStack(sub.Element("stack"))); + } + } + + errors.Add(errorElement); } - return whats.ToArray(); + return errors.ToArray(); } delegate IEnumerable parseDelegate(System.IO.Stream stream, string file, @@ -1072,12 +1115,10 @@ namespace SummarizeTest try { // If there are any errors reported "ok" will be set to false - var whats = ParseValgrindOutput(valgrindOutputFileName, traceToStdout); - foreach (var what in whats) + var valgrindErrors = ParseValgrindOutput(valgrindOutputFileName, traceToStdout); + foreach (var vError in valgrindErrors) { - xout.Add(new XElement("ValgrindError", - new XAttribute("Severity", (int)Magnesium.Severity.SevError), - new XAttribute("What", what))); + xout.Add(vError); ok = false; error = true; } diff --git a/contrib/TestHarness2/.gitignore b/contrib/TestHarness2/.gitignore new file mode 100644 index 0000000000..80682f9552 --- /dev/null +++ b/contrib/TestHarness2/.gitignore @@ -0,0 +1,2 @@ +/tmp/ +/venv diff --git a/contrib/TestHarness2/test_harness/__init__.py b/contrib/TestHarness2/test_harness/__init__.py new file mode 100644 index 0000000000..3cb95520ec --- /dev/null +++ b/contrib/TestHarness2/test_harness/__init__.py @@ -0,0 +1,2 @@ +# 
Currently this file is left intentionally empty. It's main job for now is to indicate that this directory +# should be used as a module. diff --git a/contrib/TestHarness2/test_harness/app.py b/contrib/TestHarness2/test_harness/app.py new file mode 100644 index 0000000000..3e300c6bf4 --- /dev/null +++ b/contrib/TestHarness2/test_harness/app.py @@ -0,0 +1,25 @@ +import argparse +import sys +import traceback + +from test_harness.config import config +from test_harness.run import TestRunner +from test_harness.summarize import SummaryTree + +if __name__ == '__main__': + try: + parser = argparse.ArgumentParser('TestHarness', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + test_runner = TestRunner() + if not test_runner.run(): + exit(1) + except Exception as e: + _, _, exc_traceback = sys.exc_info() + error = SummaryTree('TestHarnessError') + error.attributes['Severity'] = '40' + error.attributes['ErrorMessage'] = str(e) + error.attributes['Trace'] = repr(traceback.format_tb(exc_traceback)) + error.dump(sys.stdout) + exit(1) diff --git a/contrib/TestHarness2/test_harness/config.py b/contrib/TestHarness2/test_harness/config.py new file mode 100644 index 0000000000..191fab629d --- /dev/null +++ b/contrib/TestHarness2/test_harness/config.py @@ -0,0 +1,266 @@ +from __future__ import annotations + +import argparse +import collections +import copy +import os +import random +from enum import Enum +from pathlib import Path +from typing import List, Any, OrderedDict, Dict + + +class BuggifyOptionValue(Enum): + ON = 1 + OFF = 2 + RANDOM = 3 + + +class BuggifyOption: + def __init__(self, val: str | None = None): + self.value = BuggifyOptionValue.RANDOM + if val is not None: + v = val.lower() + if v in ['on', '1', 'true']: + self.value = BuggifyOptionValue.ON + elif v in ['off', '0', 'false']: + self.value = BuggifyOptionValue.OFF + elif v in ['random', 'rnd', 'r']: + pass + else: + assert False, 'Invalid value {} -- use true, false, or random'.format(v) + + +class ConfigValue: + def __init__(self, name: str, **kwargs): + self.name = name + self.value = None + self.kwargs = kwargs + if 'default' in self.kwargs: + self.value = self.kwargs['default'] + + def get_arg_name(self) -> str: + if 'long_name' in self.kwargs: + return self.kwargs['long_name'] + else: + return self.name + + def add_to_args(self, parser: argparse.ArgumentParser): + kwargs = copy.copy(self.kwargs) + long_name = self.name + short_name = None + if 'long_name' in kwargs: + long_name = kwargs['long_name'] + del kwargs['long_name'] + if 'short_name' in kwargs: + short_name = kwargs['short_name'] + del kwargs['short_name'] + if 'action' in kwargs and kwargs['action'] in ['store_true', 'store_false']: + del kwargs['type'] + long_name = long_name.replace('_', '-') + if short_name is None: + # line below is useful for debugging + # print('add_argument(\'--{}\', [{{{}}}])'.format(long_name, ', '.join(['\'{}\': \'{}\''.format(k, v) + # for k, v in kwargs.items()]))) + parser.add_argument('--{}'.format(long_name), **kwargs) + else: + # line below is useful for debugging + # print('add_argument(\'-{}\', \'--{}\', [{{{}}}])'.format(short_name, long_name, + # ', '.join(['\'{}\': \'{}\''.format(k, v) + # for k, v in kwargs.items()]))) + parser.add_argument('-{}'.format(short_name), '--{}'.format(long_name), **kwargs) + + def get_value(self, args: argparse.Namespace) -> tuple[str, Any]: + return self.name, args.__getattribute__(self.get_arg_name()) + + 
+class Config: + """ + This is the central configuration class for the test harness. The values in this class are exposed globally through + a global variable test_harness.config.config. This class provides some "magic" to keep the test harness flexible. + Each parameter can further be configured using an `_args` member variable which is expected to be a dictionary. + * The value of any variable can be set through the command line. For a variable named `variable_name`, we will + by default create a new command line option `--variable-name` (`_` is automatically changed to `-`). This + default can be changed by setting the `'long_name'` property in the `_args` dict. + * In addition, the user can also set an optional short name. This can be achieved by setting the `'short_name'` + property in the `_args` dictionary. + * All additional properties in `_args` are passed to `argparse.add_argument`. + * If the default of a variable is `None`, the user should explicitly set the `'type'` property to an appropriate + type. + * In addition to command line flags, all configuration options can also be controlled through environment variables. + By default, `variable-name` can be changed by setting the environment variable `TH_VARIABLE_NAME`. This default + can be changed by setting the `'env_name'` property. + * Test harness comes with multiple executables. Each of these should use the config facility. For this, + `Config.build_arguments` should be called first with the `argparse` parser. Then `Config.extract_args` needs + to be called with the result of `argparse.ArgumentParser.parse_args`. An example could look like this: + ``` + parser = argparse.ArgumentParser('TestHarness', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + ``` + * Changing the default value for all executables might not always be desirable. If it should only be changed for + one executable, `Config.change_default` should be used.
+ """ + def __init__(self): + self.random = random.Random() + self.cluster_file: str | None = None + self.cluster_file_args = {'short_name': 'C', 'type': str, 'help': 'Path to fdb cluster file', 'required': False, + 'env_name': 'JOSHUA_CLUSTER_FILE'} + self.joshua_dir: str | None = None + self.joshua_dir_args = {'type': str, 'help': 'Where to write FDB data to', 'required': False, + 'env_name': 'JOSHUA_APP_DIR'} + self.stats: str | None = None + self.stats_args = {'type': str, 'help': 'A base64 encoded list of statistics (used to reproduce runs)', + 'required': False} + self.random_seed: int | None = None + self.random_seed_args = {'type': int, + 'help': 'Force given seed given to fdbserver -- mostly useful for debugging', + 'required': False} + self.kill_seconds: int = 30 * 60 + self.kill_seconds_args = {'help': 'Timeout for individual test'} + self.buggify_on_ratio: float = 0.8 + self.buggify_on_ratio_args = {'help': 'Probability that buggify is turned on'} + self.write_run_times = False + self.write_run_times_args = {'help': 'Write back probabilities after each test run', + 'action': 'store_true'} + self.unseed_check_ratio: float = 0.05 + self.unseed_check_ratio_args = {'help': 'Probability for doing determinism check'} + self.test_dirs: List[str] = ['slow', 'fast', 'restarting', 'rare', 'noSim'] + self.test_dirs_args: dict = {'nargs': '*', 'help': 'test_directories to look for files in'} + self.trace_format: str = 'json' + self.trace_format_args = {'choices': ['json', 'xml'], 'help': 'What format fdb should produce'} + self.crash_on_error: bool = True + self.crash_on_error_args = {'long_name': 'no_crash', 'action': 'store_false', + 'help': 'Don\'t crash on first error'} + self.max_warnings: int = 10 + self.max_warnings_args = {'short_name': 'W'} + self.max_errors: int = 10 + self.max_errors_args = {'short_name': 'E'} + self.old_binaries_path: Path = Path('/app/deploy/global_data/oldBinaries/') + self.old_binaries_path_args = {'help': 'Path to the directory containing the old fdb binaries'} + self.tls_plugin_path: Path = Path('/app/deploy/runtime/.tls_5_1/FDBLibTLS.so') + self.tls_plugin_path_args = {'help': 'Path to the tls plugin used for binaries < 5.2.0'} + self.disable_kaio: bool = False + self.use_valgrind: bool = False + self.use_valgrind_args = {'action': 'store_true'} + self.buggify = BuggifyOption('random') + self.buggify_args = {'short_name': 'b', 'choices': ['on', 'off', 'random']} + self.pretty_print: bool = False + self.pretty_print_args = {'short_name': 'P', 'action': 'store_true'} + self.clean_up: bool = True + self.clean_up_args = {'long_name': 'no_clean_up', 'action': 'store_false'} + self.run_dir: Path = Path('tmp') + self.joshua_seed: int = random.randint(0, 2 ** 32 - 1) + self.joshua_seed_args = {'short_name': 's', 'help': 'A random seed', 'env_name': 'JOSHUA_SEED'} + self.print_coverage = False + self.print_coverage_args = {'action': 'store_true'} + self.binary = Path('bin') / ('fdbserver.exe' if os.name == 'nt' else 'fdbserver') + self.binary_args = {'help': 'Path to executable'} + self.hit_per_runs_ratio: int = 20000 + self.hit_per_runs_ratio_args = {'help': 'Maximum test runs before each code probe hit at least once'} + self.output_format: str = 'xml' + self.output_format_args = {'short_name': 'O', 'choices': ['json', 'xml'], + 'help': 'What format TestHarness should produce'} + self.include_test_files: str = r'.*' + self.include_test_files_args = {'help': 'Only consider test files whose path match against the given regex'} + self.exclude_test_files: str = r'.^' 
+ self.exclude_test_files_args = {'help': 'Don\'t consider test files whose path match against the given regex'} + self.include_test_classes: str = r'.*' + self.include_test_classes_args = {'help': 'Only consider tests whose names match against the given regex'} + self.exclude_test_names: str = r'.^' + self.exclude_test_names_args = {'help': 'Don\'t consider tests whose names match against the given regex'} + self.details: bool = False + self.details_args = {'help': 'Print detailed results', 'short_name': 'c', 'action': 'store_true'} + self.success: bool = False + self.success_args = {'help': 'Print successful results', 'action': 'store_true'} + self.cov_include_files: str = r'.*' + self.cov_include_files_args = {'help': 'Only consider coverage traces that originated in files matching regex'} + self.cov_exclude_files: str = r'.^' + self.cov_exclude_files_args = {'help': 'Ignore coverage traces that originated in files matching regex'} + self.max_stderr_bytes: int = 10000 + self.write_stats: bool = True + self.read_stats: bool = True + self.reproduce_prefix: str | None = None + self.reproduce_prefix_args = {'type': str, 'required': False, + 'help': 'When printing the results, prepend this string to the command'} + self._env_names: Dict[str, str] = {} + self._config_map = self._build_map() + self._read_env() + self.random.seed(self.joshua_seed, version=2) + + def change_default(self, attr: str, default_val): + assert attr in self._config_map, 'Unknown config attribute {}'.format(attr) + self.__setattr__(attr, default_val) + self._config_map[attr].kwargs['default'] = default_val + + def _get_env_name(self, var_name: str) -> str: + return self._env_names.get(var_name, 'TH_{}'.format(var_name.upper())) + + def dump(self): + for attr in dir(self): + obj = getattr(self, attr) + if attr == 'random' or attr.startswith('_') or callable(obj) or attr.endswith('_args'): + continue + print('config.{}: {} = {}'.format(attr, type(obj), obj)) + + def _build_map(self) -> OrderedDict[str, ConfigValue]: + config_map: OrderedDict[str, ConfigValue] = collections.OrderedDict() + for attr in dir(self): + obj = getattr(self, attr) + if attr == 'random' or attr.startswith('_') or callable(obj): + continue + if attr.endswith('_args'): + name = attr[0:-len('_args')] + assert name in config_map + assert isinstance(obj, dict) + for k, v in obj.items(): + if k == 'env_name': + self._env_names[name] = v + else: + config_map[name].kwargs[k] = v + else: + # attribute_args has to be declared after the attribute + assert attr not in config_map + val_type = type(obj) + kwargs = {'type': val_type, 'default': obj} + config_map[attr] = ConfigValue(attr, **kwargs) + return config_map + + def _read_env(self): + for attr in dir(self): + obj = getattr(self, attr) + if attr == 'random' or attr.startswith('_') or attr.endswith('_args') or callable(obj): + continue + env_name = self._get_env_name(attr) + attr_type = self._config_map[attr].kwargs['type'] + assert type(None) != attr_type + e = os.getenv(env_name) + if e is not None: + # Use the env var to supply the default value, so that if the + # environment variable is set and the corresponding command line + # flag is not, the environment variable has an effect. 
+ self._config_map[attr].kwargs['default'] = attr_type(e) + + def build_arguments(self, parser: argparse.ArgumentParser): + for val in self._config_map.values(): + val.add_to_args(parser) + + def extract_args(self, args: argparse.Namespace): + for val in self._config_map.values(): + k, v = val.get_value(args) + if v is not None: + config.__setattr__(k, v) + self.random.seed(self.joshua_seed, version=2) + + +config = Config() + +if __name__ == '__main__': + # test the config setup + parser = argparse.ArgumentParser('TestHarness Config Tester', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + config.dump() diff --git a/contrib/TestHarness2/test_harness/fdb.py b/contrib/TestHarness2/test_harness/fdb.py new file mode 100644 index 0000000000..1e6afa3906 --- /dev/null +++ b/contrib/TestHarness2/test_harness/fdb.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +from typing import OrderedDict, Tuple, List + +import collections +import fdb +import fdb.tuple +import struct + +from test_harness.run import StatFetcher, TestDescription +from test_harness.config import config +from test_harness.summarize import SummaryTree, Coverage + +# Before increasing this, make sure that all Joshua clusters (at Apple and Snowflake) have been upgraded. +# This version needs to be changed if we either need newer features from FDB or the current API version is +# getting retired. +fdb.api_version(630) + + +def str_to_tuple(s: str | None): + if s is None: + return s + return tuple(s.split(',')) + + +fdb_db = None + + +def open_db(cluster_file: str | None): + global fdb_db + if fdb_db is None: + fdb_db = fdb.open(cluster_file) + return fdb_db + + +def chunkify(iterable, sz: int): + res = [] + for item in iterable: + res.append(item) + if len(res) >= sz: + yield res + res = [] + if len(res) > 0: + yield res + + +@fdb.transactional +def write_coverage_chunk(tr, path: Tuple[str, ...], metadata: Tuple[str, ...], + coverage: List[Tuple[Coverage, bool]], initialized: bool) -> bool: + cov_dir = fdb.directory.create_or_open(tr, path) + if not initialized: + metadata_dir = fdb.directory.create_or_open(tr, metadata) + v = tr[metadata_dir['initialized']] + initialized = v.present() + for cov, covered in coverage: + if not initialized or covered: + tr.add(cov_dir.pack((cov.file, cov.line, cov.comment)), struct.pack(' OrderedDict[Coverage, int]: + res = collections.OrderedDict() + cov_dir = fdb.directory.create_or_open(tr, cov_path) + for k, v in tr[cov_dir.range()]: + file, line, comment = cov_dir.unpack(k) + count = struct.unpack(' OrderedDict[Coverage, int]: + db = open_db(cluster_file) + return _read_coverage(db, cov_path) + + +class TestStatistics: + def __init__(self, runtime: int, run_count: int): + self.runtime: int = runtime + self.run_count: int = run_count + + +class Statistics: + def __init__(self, cluster_file: str | None, joshua_dir: Tuple[str, ...]): + self.db = open_db(cluster_file) + self.stats_dir = self.open_stats_dir(self.db, joshua_dir) + self.stats: OrderedDict[str, TestStatistics] = self.read_stats_from_db(self.db) + + @fdb.transactional + def open_stats_dir(self, tr, app_dir: Tuple[str]): + stats_dir = app_dir + ('runtime_stats',) + return fdb.directory.create_or_open(tr, stats_dir) + + @fdb.transactional + def read_stats_from_db(self, tr) -> OrderedDict[str, TestStatistics]: + result = collections.OrderedDict() + for k, v in tr[self.stats_dir.range()]: + test_name = self.stats_dir.unpack(k)[0] + 
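A note on the `struct.pack(`/`struct.unpack(` calls in this file: their format strings appear to have been swallowed by the patch rendering (everything after the opening `<` is missing). Assuming the counters are little-endian unsigned 32-bit ints (`'<I'` for single values, `'<II'` for the runtime/run-count pair), the encoding that `tr.add` relies on looks roughly like this; the helper names are made up:

```python
import struct
from typing import Tuple

# Assumption: single counters are packed as '<I' and per-test statistics as
# '<II'; the actual format strings were stripped from the patch text above.
def pack_stat(runtime: int, run_count: int) -> bytes:
    return struct.pack('<II', runtime, run_count)


def unpack_stat(value: bytes) -> Tuple[int, int]:
    runtime, run_count = struct.unpack('<II', value)
    return runtime, run_count


# FDB's ADD mutation treats both operands as little-endian integers, so adding
# a packed (runtime, 1) delta to the stored value bumps both fields in one
# atomic operation -- provided the low 32-bit field never overflows.
stored = pack_stat(120, 3)
delta = pack_stat(45, 1)
summed = int.from_bytes(stored, 'little') + int.from_bytes(delta, 'little')
assert unpack_stat(summed.to_bytes(8, 'little')) == (165, 4)
```

Because the encoding is positional, `_write_runtime` presumably only needs to `tr.add` a freshly packed `(runtime, 1)` delta instead of reading and rewriting the stored value.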
runtime, run_count = struct.unpack(' None: + key = self.stats_dir.pack((test_name,)) + tr.add(key, struct.pack(' None: + assert self.db is not None + self._write_runtime(self.db, test_name, time) + + +class FDBStatFetcher(StatFetcher): + def __init__(self, tests: OrderedDict[str, TestDescription], + joshua_dir: Tuple[str] = str_to_tuple(config.joshua_dir)): + super().__init__(tests) + self.statistics = Statistics(config.cluster_file, joshua_dir) + + def read_stats(self): + for k, v in self.statistics.stats.items(): + if k in self.tests.keys(): + self.tests[k].total_runtime = v.runtime + self.tests[k].num_runs = v.run_count + + def add_run_time(self, test_name: str, runtime: int, out: SummaryTree): + self.statistics.write_runtime(test_name, runtime) + super().add_run_time(test_name, runtime, out) diff --git a/contrib/TestHarness2/test_harness/joshua.py b/contrib/TestHarness2/test_harness/joshua.py new file mode 100644 index 0000000000..33c5881dcc --- /dev/null +++ b/contrib/TestHarness2/test_harness/joshua.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +import collections +import io +import sys +import xml.sax +import xml.sax.handler +from pathlib import Path +from typing import List, OrderedDict, Set + +from joshua import joshua_model + +import test_harness.run +from test_harness.config import config +from test_harness.summarize import SummaryTree + + +class ToSummaryTree(xml.sax.handler.ContentHandler): + def __init__(self): + super().__init__() + self.root: SummaryTree | None = None + self.stack: List[SummaryTree] = [] + + def result(self) -> SummaryTree: + assert len(self.stack) == 0 and self.root is not None, 'Parse Error' + return self.root + + def startElement(self, name, attrs): + new_child = SummaryTree(name) + for k, v in attrs.items(): + new_child.attributes[k] = v + self.stack.append(new_child) + + def endElement(self, name): + closed = self.stack.pop() + assert closed.name == name + if len(self.stack) == 0: + self.root = closed + else: + self.stack[-1].children.append(closed) + + +def _print_summary(summary: SummaryTree, commands: Set[str]): + cmd = [] + if config.reproduce_prefix is not None: + cmd.append(config.reproduce_prefix) + cmd.append('fdbserver') + if 'TestFile' in summary.attributes: + file_name = summary.attributes['TestFile'] + role = 'test' if test_harness.run.is_no_sim(Path(file_name)) else 'simulation' + cmd += ['-r', role, '-f', file_name] + else: + cmd += ['-r', 'simulation', '-f', ''] + if 'RandomSeed' in summary.attributes: + cmd += ['-s', summary.attributes['RandomSeed']] + else: + cmd += ['-s', ''] + if 'BuggifyEnabled' in summary.attributes: + arg = 'on' + if summary.attributes['BuggifyEnabled'].lower() in ['0', 'off', 'false']: + arg = 'off' + cmd += ['-b', arg] + else: + cmd += ['b', ''] + cmd += ['--crash', '--trace_format', config.trace_format] + key = ' '.join(cmd) + count = 1 + while key in commands: + key = '{} # {}'.format(' '.join(cmd), count) + count += 1 + # we want the command as the first attribute + attributes = {'Command': ' '.join(cmd)} + for k, v in summary.attributes.items(): + if k == 'Errors': + attributes['ErrorCount'] = v + else: + attributes[k] = v + summary.attributes = attributes + if config.details: + key = str(len(commands)) + str_io = io.StringIO() + summary.dump(str_io, prefix=(' ' if config.pretty_print else '')) + if config.output_format == 'json': + sys.stdout.write('{}"Test{}": {}'.format(' ' if config.pretty_print else '', + key, str_io.getvalue())) + else: + sys.stdout.write(str_io.getvalue()) + if 
config.pretty_print: + sys.stdout.write('\n' if config.output_format == 'xml' else ',\n') + return key + error_count = 0 + warning_count = 0 + small_summary = SummaryTree('Test') + small_summary.attributes = attributes + errors = SummaryTree('Errors') + warnings = SummaryTree('Warnings') + buggifies: OrderedDict[str, List[int]] = collections.OrderedDict() + for child in summary.children: + if 'Severity' in child.attributes and child.attributes['Severity'] == '40' and error_count < config.max_errors: + error_count += 1 + errors.append(child) + if 'Severity' in child.attributes and child.attributes[ + 'Severity'] == '30' and warning_count < config.max_warnings: + warning_count += 1 + warnings.append(child) + if child.name == 'BuggifySection': + file = child.attributes['File'] + line = int(child.attributes['Line']) + buggifies.setdefault(file, []).append(line) + buggifies_elem = SummaryTree('Buggifies') + for file, lines in buggifies.items(): + lines.sort() + if config.output_format == 'json': + buggifies_elem.attributes[file] = ' '.join(str(line) for line in lines) + else: + child = SummaryTree('Buggify') + child.attributes['File'] = file + child.attributes['Lines'] = ' '.join(str(line) for line in lines) + small_summary.append(child) + small_summary.children.append(buggifies_elem) + if len(errors.children) > 0: + small_summary.children.append(errors) + if len(warnings.children) > 0: + small_summary.children.append(warnings) + output = io.StringIO() + small_summary.dump(output, prefix=(' ' if config.pretty_print else '')) + if config.output_format == 'json': + sys.stdout.write('{}"{}": {}'.format(' ' if config.pretty_print else '', key, output.getvalue().strip())) + else: + sys.stdout.write('{}{}'.format(' ' if config.pretty_print else '', output.getvalue().strip())) + sys.stdout.write('\n' if config.output_format == 'xml' else ',\n') + + +def print_errors(ensemble_id: str): + joshua_model.open(config.cluster_file) + properties = joshua_model.get_ensemble_properties(ensemble_id) + compressed = properties["compressed"] if "compressed" in properties else False + for rec in joshua_model.tail_results(ensemble_id, errors_only=(not config.success), compressed=compressed): + if len(rec) == 5: + version_stamp, result_code, host, seed, output = rec + elif len(rec) == 4: + version_stamp, result_code, host, output = rec + seed = None + elif len(rec) == 3: + version_stamp, result_code, output = rec + host = None + seed = None + elif len(rec) == 2: + version_stamp, seed = rec + output = str(joshua_model.fdb.tuple.unpack(seed)[0]) + "\n" + result_code = None + host = None + seed = None + else: + raise Exception("Unknown result format") + lines = output.splitlines() + commands: Set[str] = set() + for line in lines: + summary = ToSummaryTree() + xml.sax.parseString(line, summary) + commands.add(_print_summary(summary.result(), commands)) diff --git a/contrib/TestHarness2/test_harness/results.py b/contrib/TestHarness2/test_harness/results.py new file mode 100644 index 0000000000..486c497d35 --- /dev/null +++ b/contrib/TestHarness2/test_harness/results.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import argparse +import io +import json +import re +import sys +import test_harness.fdb + +from typing import List, Tuple, OrderedDict +from test_harness.summarize import SummaryTree, Coverage +from test_harness.config import config +from xml.sax.saxutils import quoteattr + + +class GlobalStatistics: + def __init__(self): + self.total_probes_hit: int = 0 + self.total_cpu_time: int = 0 + 
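`_print_summary` above rebuilds an `fdbserver` command line from the summary attributes so that a failed run can be reproduced by hand (the `cmd += ['b', '']` fallback looks like it is missing the leading dash of `-b`). A simplified, standalone sketch of that assembly, using made-up attribute values:

```python
from pathlib import Path
from typing import Dict, List


def repro_command(attributes: Dict[str, str], trace_format: str = 'json') -> List[str]:
    """Sketch only: rebuild an fdbserver invocation from one test summary."""
    test_file = attributes.get('TestFile', '')
    # noSim tests are re-run with '-r test', everything else under simulation
    role = 'test' if 'noSim' in Path(test_file).parts else 'simulation'
    cmd = ['fdbserver', '-r', role, '-f', test_file,
           '-s', attributes.get('RandomSeed', '')]
    buggify = attributes.get('BuggifyEnabled', '1')
    cmd += ['-b', 'off' if buggify.lower() in ('0', 'off', 'false') else 'on']
    cmd += ['--crash', '--trace_format', trace_format]
    return cmd


print(' '.join(repro_command({'TestFile': 'tests/fast/ExampleTest.toml',
                              'RandomSeed': '12345',
                              'BuggifyEnabled': '1'})))
```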
self.total_test_runs: int = 0 + self.total_missed_probes: int = 0 + + +class EnsembleResults: + def __init__(self, cluster_file: str | None, ensemble_id: str): + self.global_statistics = GlobalStatistics() + self.fdb_path = ('joshua', 'ensembles', 'results', 'application', ensemble_id) + self.coverage_path = self.fdb_path + ('coverage',) + self.statistics = test_harness.fdb.Statistics(cluster_file, self.fdb_path) + coverage_dict: OrderedDict[Coverage, int] = test_harness.fdb.read_coverage(cluster_file, self.coverage_path) + self.coverage: List[Tuple[Coverage, int]] = [] + self.min_coverage_hit: int | None = None + self.ratio = self.global_statistics.total_test_runs / config.hit_per_runs_ratio + for cov, count in coverage_dict.items(): + if re.search(config.cov_include_files, cov.file) is None: + continue + if re.search(config.cov_exclude_files, cov.file) is not None: + continue + self.global_statistics.total_probes_hit += count + self.coverage.append((cov, count)) + if count <= self.ratio: + self.global_statistics.total_missed_probes += 1 + if self.min_coverage_hit is None or self.min_coverage_hit > count: + self.min_coverage_hit = count + self.coverage.sort(key=lambda x: (x[1], x[0].file, x[0].line)) + self.stats: List[Tuple[str, int, int]] = [] + for k, v in self.statistics.stats.items(): + self.global_statistics.total_test_runs += v.run_count + self.global_statistics.total_cpu_time += v.runtime + self.stats.append((k, v.runtime, v.run_count)) + self.stats.sort(key=lambda x: x[1], reverse=True) + if self.min_coverage_hit is not None: + self.coverage_ok = self.min_coverage_hit > self.ratio + else: + self.coverage_ok = False + + def dump(self, prefix: str): + errors = 0 + out = SummaryTree('EnsembleResults') + out.attributes['TotalRuntime'] = str(self.global_statistics.total_cpu_time) + out.attributes['TotalTestRuns'] = str(self.global_statistics.total_test_runs) + out.attributes['TotalProbesHit'] = str(self.global_statistics.total_probes_hit) + out.attributes['MinProbeHit'] = str(self.min_coverage_hit) + out.attributes['TotalProbes'] = str(len(self.coverage)) + out.attributes['MissedProbes'] = str(self.global_statistics.total_missed_probes) + + for cov, count in self.coverage: + severity = 10 if count > self.ratio else 40 + if severity == 40: + errors += 1 + if (severity == 40 and errors <= config.max_errors) or config.details: + child = SummaryTree('CodeProbe') + child.attributes['Severity'] = str(severity) + child.attributes['File'] = cov.file + child.attributes['Line'] = str(cov.line) + child.attributes['Comment'] = '' if cov.comment is None else cov.comment + child.attributes['HitCount'] = str(count) + out.append(child) + + if config.details: + for k, runtime, run_count in self.stats: + child = SummaryTree('Test') + child.attributes['Name'] = k + child.attributes['Runtime'] = str(runtime) + child.attributes['RunCount'] = str(run_count) + out.append(child) + if errors > 0: + out.attributes['Errors'] = str(errors) + str_io = io.StringIO() + out.dump(str_io, prefix=prefix, new_line=config.pretty_print) + if config.output_format == 'xml': + sys.stdout.write(str_io.getvalue()) + else: + sys.stdout.write('{}"EnsembleResults":{}{}'.format(' ' if config.pretty_print else '', + '\n' if config.pretty_print else ' ', + str_io.getvalue())) + + +def write_header(ensemble_id: str): + if config.output_format == 'json': + if config.pretty_print: + print('{') + print(' "{}": {},\n'.format('ID', json.dumps(ensemble_id.strip()))) + else: + sys.stdout.write('{{{}: {},'.format('ID', 
json.dumps(ensemble_id.strip()))) + elif config.output_format == 'xml': + sys.stdout.write(''.format(quoteattr(ensemble_id.strip()))) + if config.pretty_print: + sys.stdout.write('\n') + else: + assert False, 'unknown output format {}'.format(config.output_format) + + +def write_footer(): + if config.output_format == 'xml': + sys.stdout.write('\n') + elif config.output_format == 'json': + sys.stdout.write('}\n') + else: + assert False, 'unknown output format {}'.format(config.output_format) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('TestHarness Results', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.change_default('pretty_print', True) + config.change_default('max_warnings', 0) + config.build_arguments(parser) + parser.add_argument('ensemble_id', type=str, help='The ensemble to fetch the result for') + args = parser.parse_args() + config.extract_args(args) + config.output_format = args.output_format + write_header(args.ensemble_id) + try: + import test_harness.joshua + test_harness.joshua.print_errors(args.ensemble_id) + except ModuleNotFoundError: + child = SummaryTree('JoshuaNotFound') + child.attributes['Severity'] = '30' + child.attributes['Message'] = 'Could not import Joshua -- set PYTHONPATH to joshua checkout dir' + child.dump(sys.stdout, prefix=(' ' if config.pretty_print else ''), new_line=config.pretty_print) + results = EnsembleResults(config.cluster_file, args.ensemble_id) + results.dump(' ' if config.pretty_print else '') + write_footer() + exit(0 if results.coverage_ok else 1) diff --git a/contrib/TestHarness2/test_harness/run.py b/contrib/TestHarness2/test_harness/run.py new file mode 100644 index 0000000000..2cd24575fb --- /dev/null +++ b/contrib/TestHarness2/test_harness/run.py @@ -0,0 +1,477 @@ +from __future__ import annotations + +import array +import base64 +import collections +import math +import os +import resource +import shutil +import subprocess +import re +import sys +import threading +import time +import uuid + +from functools import total_ordering +from pathlib import Path +from test_harness.version import Version +from test_harness.config import config +from typing import Dict, List, Pattern, OrderedDict + +from test_harness.summarize import Summary, SummaryTree + + +@total_ordering +class TestDescription: + def __init__(self, path: Path, name: str, priority: float): + self.paths: List[Path] = [path] + self.name = name + self.priority: float = priority + # we only measure in seconds. 
Otherwise, keeping determinism will be difficult + self.total_runtime: int = 0 + self.num_runs: int = 0 + + def __lt__(self, other): + if isinstance(other, TestDescription): + return self.name < other.name + else: + return self.name < str(other) + + def __eq__(self, other): + if isinstance(other, TestDescription): + return self.name < other.name + else: + return self.name < str(other.name) + + +class StatFetcher: + def __init__(self, tests: OrderedDict[str, TestDescription]): + self.tests = tests + + def read_stats(self): + pass + + def add_run_time(self, test_name: str, runtime: int, out: SummaryTree): + self.tests[test_name].total_runtime += runtime + + +class TestPicker: + def __init__(self, test_dir: Path): + if not test_dir.exists(): + raise RuntimeError('{} is neither a directory nor a file'.format(test_dir)) + self.include_files_regex = re.compile(config.include_test_files) + self.exclude_files_regex = re.compile(config.exclude_test_files) + self.include_tests_regex = re.compile(config.include_test_classes) + self.exclude_tests_regex = re.compile(config.exclude_test_names) + self.test_dir: Path = test_dir + self.tests: OrderedDict[str, TestDescription] = collections.OrderedDict() + self.restart_test: Pattern = re.compile(r".*-\d+\.(txt|toml)") + self.follow_test: Pattern = re.compile(r".*-[2-9]\d*\.(txt|toml)") + + for subdir in self.test_dir.iterdir(): + if subdir.is_dir() and subdir.name in config.test_dirs: + self.walk_test_dir(subdir) + self.stat_fetcher: StatFetcher + if config.stats is not None or config.joshua_dir is None: + self.stat_fetcher = StatFetcher(self.tests) + else: + from test_harness.fdb import FDBStatFetcher + self.stat_fetcher = FDBStatFetcher(self.tests) + if config.stats is not None: + self.load_stats(config.stats) + else: + self.fetch_stats() + + def add_time(self, test_file: Path, run_time: int, out: SummaryTree) -> None: + # getting the test name is fairly inefficient. 
But since we only have 100s of tests, I won't bother + test_name: str | None = None + test_desc: TestDescription | None = None + for name, test in self.tests.items(): + for p in test.paths: + test_files: List[Path] + if self.restart_test.match(p.name): + test_files = self.list_restart_files(p) + else: + test_files = [p] + for file in test_files: + if file.absolute() == test_file.absolute(): + test_name = name + test_desc = test + break + if test_name is not None: + break + if test_name is not None: + break + assert test_name is not None and test_desc is not None + self.stat_fetcher.add_run_time(test_name, run_time, out) + out.attributes['TotalTestTime'] = str(test_desc.total_runtime) + out.attributes['TestRunCount'] = str(test_desc.num_runs) + + def dump_stats(self) -> str: + res = array.array('I') + for _, spec in self.tests.items(): + res.append(spec.total_runtime) + return base64.standard_b64encode(res.tobytes()).decode('utf-8') + + def fetch_stats(self): + self.stat_fetcher.read_stats() + + def load_stats(self, serialized: str): + times = array.array('I') + times.frombytes(base64.standard_b64decode(serialized)) + assert len(times) == len(self.tests.items()) + for idx, (_, spec) in enumerate(self.tests.items()): + spec.total_runtime = times[idx] + + def parse_txt(self, path: Path): + if self.include_files_regex.search(str(path)) is None or self.exclude_files_regex.search(str(path)) is not None: + return + with path.open('r') as f: + test_name: str | None = None + test_class: str | None = None + priority: float | None = None + for line in f: + line = line.strip() + kv = line.split('=') + if len(kv) != 2: + continue + kv[0] = kv[0].strip() + kv[1] = kv[1].strip(' \r\n\t\'"') + if kv[0] == 'testTitle' and test_name is None: + test_name = kv[1] + if kv[0] == 'testClass' and test_class is None: + test_class = kv[1] + if kv[0] == 'testPriority' and priority is None: + try: + priority = float(kv[1]) + except ValueError: + raise RuntimeError("Can't parse {} -- testPriority in {} should be set to a float".format(kv[1], + path)) + if test_name is not None and test_class is not None and priority is not None: + break + if test_name is None: + return + if test_class is None: + test_class = test_name + if priority is None: + priority = 1.0 + if self.include_tests_regex.search(test_class) is None \ + or self.exclude_tests_regex.search(test_class) is not None: + return + if test_class not in self.tests: + self.tests[test_class] = TestDescription(path, test_class, priority) + else: + self.tests[test_class].paths.append(path) + + def walk_test_dir(self, test: Path): + if test.is_dir(): + for file in test.iterdir(): + self.walk_test_dir(file) + else: + # check whether we're looking at a restart test + if self.follow_test.match(test.name) is not None: + return + if test.suffix == '.txt' or test.suffix == '.toml': + self.parse_txt(test) + + @staticmethod + def list_restart_files(start_file: Path) -> List[Path]: + name = re.sub(r'-\d+.(txt|toml)', '', start_file.name) + res: List[Path] = [] + for test_file in start_file.parent.iterdir(): + if test_file.name.startswith(name): + res.append(test_file) + assert len(res) > 1 + res.sort() + return res + + def choose_test(self) -> List[Path]: + min_runtime: float | None = None + candidates: List[TestDescription] = [] + for _, v in self.tests.items(): + this_time = v.total_runtime * v.priority + if min_runtime is None or this_time < min_runtime: + min_runtime = this_time + candidates = [v] + elif this_time == min_runtime: + candidates.append(v) + candidates.sort() + 
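`dump_stats` and `load_stats` above move the accumulated runtimes between processes as a base64-encoded array of unsigned 32-bit ints; the string carries no test names, so both sides must iterate the test map in the same order. `choose_test` then keeps picking whichever tests have the smallest priority-weighted total runtime. A minimal round-trip of the stats encoding (test names and times are invented):

```python
import array
import base64
from typing import Iterable, List


def dump_stats(times: Iterable[int]) -> str:
    # pack as native-size unsigned ints, then base64 for safe transport
    return base64.standard_b64encode(array.array('I', times).tobytes()).decode('utf-8')


def load_stats(serialized: str, expected: int) -> List[int]:
    times = array.array('I')
    times.frombytes(base64.standard_b64decode(serialized))
    assert len(times) == expected
    return list(times)


runtimes = {'ExampleFast': 120, 'ExampleSlow': 0, 'ExampleRestart': 45}  # invented
blob = dump_stats(runtimes.values())
assert load_stats(blob, len(runtimes)) == [120, 0, 45]
print(blob)
```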
choice = config.random.randint(0, len(candidates) - 1) + test = candidates[choice] + result = test.paths[config.random.randint(0, len(test.paths) - 1)] + if self.restart_test.match(result.name): + return self.list_restart_files(result) + else: + return [result] + + +class OldBinaries: + def __init__(self): + self.first_file_expr = re.compile(r'.*-1\.(txt|toml)') + self.old_binaries_path: Path = config.old_binaries_path + self.binaries: OrderedDict[Version, Path] = collections.OrderedDict() + if not self.old_binaries_path.exists() or not self.old_binaries_path.is_dir(): + return + exec_pattern = re.compile(r'fdbserver-\d+\.\d+\.\d+(\.exe)?') + for file in self.old_binaries_path.iterdir(): + if not file.is_file() or not os.access(file, os.X_OK): + continue + if exec_pattern.fullmatch(file.name) is not None: + self._add_file(file) + + def _add_file(self, file: Path): + version_str = file.name.split('-')[1] + if version_str.endswith('.exe'): + version_str = version_str[0:-len('.exe')] + ver = Version.parse(version_str) + self.binaries[ver] = file + + def choose_binary(self, test_file: Path) -> Path: + if len(self.binaries) == 0: + return config.binary + max_version = Version.max_version() + min_version = Version.parse('5.0.0') + dirs = test_file.parent.parts + if 'restarting' not in dirs: + return config.binary + version_expr = dirs[-1].split('_') + first_file = self.first_file_expr.match(test_file.name) is not None + if first_file and version_expr[0] == 'to': + # downgrade test -- first binary should be current one + return config.binary + if not first_file and version_expr[0] == 'from': + # upgrade test -- we only return an old version for the first test file + return config.binary + if version_expr[0] == 'from' or version_expr[0] == 'to': + min_version = Version.parse(version_expr[1]) + if len(version_expr) == 4 and version_expr[2] == 'until': + max_version = Version.parse(version_expr[3]) + candidates: List[Path] = [] + for ver, binary in self.binaries.items(): + if min_version <= ver <= max_version: + candidates.append(binary) + if len(candidates) == 0: + return config.binary + return config.random.choice(candidates) + + +def is_restarting_test(test_file: Path): + for p in test_file.parts: + if p == 'restarting': + return True + return False + + +def is_no_sim(test_file: Path): + return test_file.parts[-2] == 'noSim' + + +class ResourceMonitor(threading.Thread): + def __init__(self): + super().__init__() + self.start_time = time.time() + self.end_time: float | None = None + self._stop_monitor = False + self.max_rss = 0 + + def run(self) -> None: + while not self._stop_monitor: + time.sleep(1) + resources = resource.getrusage(resource.RUSAGE_CHILDREN) + self.max_rss = max(resources.ru_maxrss, self.max_rss) + + def stop(self): + self.end_time = time.time() + self._stop_monitor = True + + def time(self): + return self.end_time - self.start_time + + +class TestRun: + def __init__(self, binary: Path, test_file: Path, random_seed: int, uid: uuid.UUID, + restarting: bool = False, test_determinism: bool = False, buggify_enabled: bool = False, + stats: str | None = None, expected_unseed: int | None = None, will_restart: bool = False): + self.binary = binary + self.test_file = test_file + self.random_seed = random_seed + self.uid = uid + self.restarting = restarting + self.test_determinism = test_determinism + self.stats: str | None = stats + self.expected_unseed: int | None = expected_unseed + self.use_valgrind: bool = config.use_valgrind + self.old_binary_path: Path = config.old_binaries_path + 
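The directory-name parsing in `OldBinaries.choose_binary` above is compact but easy to misread: restarting tests live in directories such as `restarting/from_6.3.0` or `restarting/from_6.3.0_until_7.0.0`, and `to_X` marks the downgrade direction where the first part runs on the current binary. A small sketch of just the range extraction, assuming that same naming convention (the version strings are examples):

```python
from typing import Optional, Tuple


def version_range(dir_name: str) -> Tuple[str, Optional[str]]:
    """Return (min_version, max_version) implied by a restarting-test directory name."""
    parts = dir_name.split('_')
    min_version = '5.0.0'              # same floor choose_binary starts from
    max_version: Optional[str] = None  # None == no upper bound
    if parts[0] in ('from', 'to'):
        min_version = parts[1]
    if len(parts) == 4 and parts[2] == 'until':
        max_version = parts[3]
    return min_version, max_version


assert version_range('from_6.3.0') == ('6.3.0', None)
assert version_range('from_6.3.0_until_7.0.0') == ('6.3.0', '7.0.0')
assert version_range('to_7.1.0') == ('7.1.0', None)
```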
self.buggify_enabled: bool = buggify_enabled + self.fault_injection_enabled: bool = True + self.trace_format: str | None = config.trace_format + if Version.of_binary(self.binary) < "6.1.0": + self.trace_format = None + self.use_tls_plugin = Version.of_binary(self.binary) < "5.2.0" + self.temp_path = config.run_dir / str(self.uid) + # state for the run + self.retryable_error: bool = False + self.summary: Summary = Summary(binary, uid=self.uid, stats=self.stats, expected_unseed=self.expected_unseed, + will_restart=will_restart) + self.run_time: int = 0 + self.success = self.run() + + def log_test_plan(self, out: SummaryTree): + test_plan: SummaryTree = SummaryTree('TestPlan') + test_plan.attributes['TestUID'] = str(self.uid) + test_plan.attributes['RandomSeed'] = str(self.random_seed) + test_plan.attributes['TestFile'] = str(self.test_file) + test_plan.attributes['Buggify'] = '1' if self.buggify_enabled else '0' + test_plan.attributes['FaultInjectionEnabled'] = '1' if self.fault_injection_enabled else '0' + test_plan.attributes['DeterminismCheck'] = '1' if self.test_determinism else '0' + out.append(test_plan) + + def delete_simdir(self): + shutil.rmtree(self.temp_path / Path('simfdb')) + + def run(self): + command: List[str] = [] + env: Dict[str, str] = os.environ.copy() + valgrind_file: Path | None = None + if self.use_valgrind and self.binary == config.binary: + # Only run the binary under test under valgrind. There's nothing we + # can do about valgrind errors in old binaries anyway, and it makes + # the test take longer. Also old binaries weren't built with + # USE_VALGRIND=ON, and we have seen false positives with valgrind in + # such binaries. + command.append('valgrind') + valgrind_file = self.temp_path / Path('valgrind-{}.xml'.format(self.random_seed)) + dbg_path = os.getenv('FDB_VALGRIND_DBGPATH') + if dbg_path is not None: + command.append('--extra-debuginfo-path={}'.format(dbg_path)) + command += ['--xml=yes', '--xml-file={}'.format(valgrind_file.absolute()), '-q'] + command += [str(self.binary.absolute()), + '-r', 'test' if is_no_sim(self.test_file) else 'simulation', + '-f', str(self.test_file), + '-s', str(self.random_seed)] + if self.trace_format is not None: + command += ['--trace_format', self.trace_format] + if self.use_tls_plugin: + command += ['--tls_plugin', str(config.tls_plugin_path)] + env["FDB_TLS_PLUGIN"] = str(config.tls_plugin_path) + if config.disable_kaio: + command += ['--knob-disable-posix-kernel-aio=1'] + if Version.of_binary(self.binary) >= '7.1.0': + command += ['-fi', 'on' if self.fault_injection_enabled else 'off'] + if self.restarting: + command.append('--restarting') + if self.buggify_enabled: + command += ['-b', 'on'] + if config.crash_on_error: + command.append('--crash') + + self.temp_path.mkdir(parents=True, exist_ok=True) + + # self.log_test_plan(out) + resources = ResourceMonitor() + resources.start() + process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path, + text=True, env=env) + did_kill = False + timeout = 20 * config.kill_seconds if self.use_valgrind else config.kill_seconds + err_out: str + try: + _, err_out = process.communicate(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + _, err_out = process.communicate() + did_kill = True + resources.stop() + resources.join() + # we're rounding times up, otherwise we will prefer running very short tests (<1s) + self.run_time = math.ceil(resources.time()) + self.summary.runtime = resources.time() + self.summary.max_rss = 
resources.max_rss + self.summary.was_killed = did_kill + self.summary.valgrind_out_file = valgrind_file + self.summary.error_out = err_out + self.summary.summarize(self.temp_path, ' '.join(command)) + return self.summary.ok() + + +def decorate_summary(out: SummaryTree, test_file: Path, seed: int, buggify: bool): + """Sometimes a test can crash before ProgramStart is written to the traces. These + tests are then hard to reproduce (they can be reproduced through TestHarness but + require the user to run in the joshua docker container). To account for this we + will write the necessary information into the attributes if it is missing.""" + if 'TestFile' not in out.attributes: + out.attributes['TestFile'] = str(test_file) + if 'RandomSeed' not in out.attributes: + out.attributes['RandomSeed'] = str(seed) + if 'BuggifyEnabled' not in out.attributes: + out.attributes['BuggifyEnabled'] = '1' if buggify else '0' + + +class TestRunner: + def __init__(self): + self.uid = uuid.uuid4() + self.test_path: Path = Path('tests') + self.cluster_file: str | None = None + self.fdb_app_dir: str | None = None + self.binary_chooser = OldBinaries() + self.test_picker = TestPicker(self.test_path) + + def backup_sim_dir(self, seed: int): + temp_dir = config.run_dir / str(self.uid) + src_dir = temp_dir / 'simfdb' + assert src_dir.is_dir() + dest_dir = temp_dir / 'simfdb.{}'.format(seed) + assert not dest_dir.exists() + shutil.copytree(src_dir, dest_dir) + + def restore_sim_dir(self, seed: int): + temp_dir = config.run_dir / str(self.uid) + src_dir = temp_dir / 'simfdb.{}'.format(seed) + assert src_dir.exists() + dest_dir = temp_dir / 'simfdb' + shutil.rmtree(dest_dir) + shutil.move(src_dir, dest_dir) + + def run_tests(self, test_files: List[Path], seed: int, test_picker: TestPicker) -> bool: + result: bool = True + for count, file in enumerate(test_files): + will_restart = count + 1 < len(test_files) + binary = self.binary_chooser.choose_binary(file) + unseed_check = not is_no_sim(file) and config.random.random() < config.unseed_check_ratio + buggify_enabled: bool = config.random.random() < config.buggify_on_ratio + if unseed_check and count != 0: + # for restarting tests we will need to restore the sim2 after the first run + self.backup_sim_dir(seed + count - 1) + run = TestRun(binary, file.absolute(), seed + count, self.uid, restarting=count != 0, + stats=test_picker.dump_stats(), will_restart=will_restart, buggify_enabled=buggify_enabled) + result = result and run.success + test_picker.add_time(test_files[0], run.run_time, run.summary.out) + decorate_summary(run.summary.out, file, seed + count, run.buggify_enabled) + if unseed_check and run.summary.unseed: + run.summary.out.append(run.summary.list_simfdb()) + run.summary.out.dump(sys.stdout) + if not result: + return False + if unseed_check and run.summary.unseed is not None: + if count != 0: + self.restore_sim_dir(seed + count - 1) + run2 = TestRun(binary, file.absolute(), seed + count, self.uid, restarting=count != 0, + stats=test_picker.dump_stats(), expected_unseed=run.summary.unseed, + will_restart=will_restart, buggify_enabled=buggify_enabled) + test_picker.add_time(file, run2.run_time, run.summary.out) + decorate_summary(run2.summary.out, file, seed + count, run.buggify_enabled) + run2.summary.out.dump(sys.stdout) + result = result and run2.success + if not result: + return False + return result + + def run(self) -> bool: + seed = config.random_seed if config.random_seed is not None else config.random.randint(0, 2 ** 32 - 1) + test_files = 
self.test_picker.choose_test() + success = self.run_tests(test_files, seed, self.test_picker) + if config.clean_up: + shutil.rmtree(config.run_dir / str(self.uid)) + return success diff --git a/contrib/TestHarness2/test_harness/summarize.py b/contrib/TestHarness2/test_harness/summarize.py new file mode 100644 index 0000000000..54b2f799b5 --- /dev/null +++ b/contrib/TestHarness2/test_harness/summarize.py @@ -0,0 +1,620 @@ +from __future__ import annotations + +import collections +import inspect +import json +import os +import re +import sys +import traceback +import uuid +import xml.sax +import xml.sax.handler +import xml.sax.saxutils + +from pathlib import Path +from typing import List, Dict, TextIO, Callable, Optional, OrderedDict, Any, Tuple, Iterator, Iterable + +from test_harness.config import config +from test_harness.valgrind import parse_valgrind_output + + +class SummaryTree: + def __init__(self, name: str): + self.name = name + self.children: List[SummaryTree] = [] + self.attributes: Dict[str, str] = {} + + def append(self, element: SummaryTree): + self.children.append(element) + + def to_dict(self, add_name: bool = True) -> Dict[str, Any] | List[Any]: + if len(self.children) > 0 and len(self.attributes) == 0: + children = [] + for child in self.children: + children.append(child.to_dict()) + if add_name: + return {self.name: children} + else: + return children + res: Dict[str, Any] = {} + if add_name: + res['Type'] = self.name + for k, v in self.attributes.items(): + res[k] = v + children = [] + child_keys: Dict[str, int] = {} + for child in self.children: + if child.name in child_keys: + child_keys[child.name] += 1 + else: + child_keys[child.name] = 1 + for child in self.children: + if child_keys[child.name] == 1 and child.name not in self.attributes: + res[child.name] = child.to_dict(add_name=False) + else: + children.append(child.to_dict()) + if len(children) > 0: + res['children'] = children + return res + + def to_json(self, out: TextIO, prefix: str = ''): + res = json.dumps(self.to_dict(), indent=(' ' if config.pretty_print else None)) + for line in res.splitlines(False): + out.write('{}{}\n'.format(prefix, line)) + + def to_xml(self, out: TextIO, prefix: str = ''): + # minidom doesn't support omitting the xml declaration which is a problem for joshua + # However, our xml is very simple and therefore serializing manually is easy enough + attrs = [] + print_width = 120 + try: + print_width, _ = os.get_terminal_size() + except OSError: + pass + for k, v in self.attributes.items(): + attrs.append('{}={}'.format(k, xml.sax.saxutils.quoteattr(v))) + elem = '{}<{}{}'.format(prefix, self.name, ('' if len(attrs) == 0 else ' ')) + out.write(elem) + if config.pretty_print: + curr_line_len = len(elem) + for i in range(len(attrs)): + attr_len = len(attrs[i]) + if i == 0 or attr_len + curr_line_len + 1 <= print_width: + if i != 0: + out.write(' ') + out.write(attrs[i]) + curr_line_len += attr_len + else: + out.write('\n') + out.write(' ' * len(elem)) + out.write(attrs[i]) + curr_line_len = len(elem) + attr_len + else: + out.write(' '.join(attrs)) + if len(self.children) == 0: + out.write('/>') + else: + out.write('>') + for child in self.children: + if config.pretty_print: + out.write('\n') + child.to_xml(out, prefix=(' {}'.format(prefix) if config.pretty_print else prefix)) + if len(self.children) > 0: + out.write('{}{}'.format(('\n' if config.pretty_print else ''), prefix, self.name)) + + def dump(self, out: TextIO, prefix: str = '', new_line: bool = True): + if config.output_format == 
'json': + self.to_json(out, prefix=prefix) + else: + self.to_xml(out, prefix=prefix) + if new_line: + out.write('\n') + + +ParserCallback = Callable[[Dict[str, str]], Optional[str]] + + +class ParseHandler: + def __init__(self, out: SummaryTree): + self.out = out + self.events: OrderedDict[Optional[Tuple[str, Optional[str]]], List[ParserCallback]] = collections.OrderedDict() + + def add_handler(self, attr: Tuple[str, Optional[str]], callback: ParserCallback) -> None: + self.events.setdefault(attr, []).append(callback) + + def _call(self, callback: ParserCallback, attrs: Dict[str, str]) -> str | None: + try: + return callback(attrs) + except Exception as e: + _, _, exc_traceback = sys.exc_info() + child = SummaryTree('NonFatalParseError') + child.attributes['Severity'] = '30' + child.attributes['ErrorMessage'] = str(e) + child.attributes['Trace'] = repr(traceback.format_tb(exc_traceback)) + self.out.append(child) + return None + + def handle(self, attrs: Dict[str, str]): + if None in self.events: + for callback in self.events[None]: + self._call(callback, attrs) + for k, v in attrs.items(): + if (k, None) in self.events: + for callback in self.events[(k, None)]: + remap = self._call(callback, attrs) + if remap is not None: + v = remap + attrs[k] = v + if (k, v) in self.events: + for callback in self.events[(k, v)]: + remap = self._call(callback, attrs) + if remap is not None: + v = remap + attrs[k] = v + + +class Parser: + def parse(self, file: TextIO, handler: ParseHandler) -> None: + pass + + +class XmlParser(Parser, xml.sax.handler.ContentHandler): + def __init__(self): + super().__init__() + self.handler: ParseHandler | None = None + + def parse(self, file: TextIO, handler: ParseHandler) -> None: + xml.sax.parse(file, self) + + def startElement(self, name, attrs) -> None: + attributes: Dict[str, str] = {} + for name in attrs.getNames(): + attributes[name] = attrs.getValue(name) + assert self.handler is not None + self.handler.handle(attributes) + + +class JsonParser(Parser): + def __init__(self): + super().__init__() + + def parse(self, file: TextIO, handler: ParseHandler): + for line in file: + obj = json.loads(line) + handler.handle(obj) + + +class Coverage: + def __init__(self, file: str, line: str | int, comment: str | None = None): + self.file = file + self.line = int(line) + self.comment = comment + + def to_tuple(self) -> Tuple[str, int, str | None]: + return self.file, self.line, self.comment + + def __eq__(self, other) -> bool: + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() == other + elif isinstance(other, Coverage): + return self.to_tuple() == other.to_tuple() + else: + return False + + def __lt__(self, other) -> bool: + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() < other + elif isinstance(other, Coverage): + return self.to_tuple() < other.to_tuple() + else: + return False + + def __le__(self, other) -> bool: + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() <= other + elif isinstance(other, Coverage): + return self.to_tuple() <= other.to_tuple() + else: + return False + + def __gt__(self, other: Coverage) -> bool: + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() > other + elif isinstance(other, Coverage): + return self.to_tuple() > other.to_tuple() + else: + return False + + def __ge__(self, other): + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() >= other + elif isinstance(other, Coverage): + return self.to_tuple() >= other.to_tuple() + 
else: + return False + + def __hash__(self): + return hash((self.file, self.line, self.comment)) + + +class TraceFiles: + def __init__(self, path: Path): + self.path: Path = path + self.timestamps: List[int] = [] + self.runs: OrderedDict[int, List[Path]] = collections.OrderedDict() + trace_expr = re.compile(r'trace.*\.(json|xml)') + for file in self.path.iterdir(): + if file.is_file() and trace_expr.match(file.name) is not None: + ts = int(file.name.split('.')[6]) + if ts in self.runs: + self.runs[ts].append(file) + else: + self.timestamps.append(ts) + self.runs[ts] = [file] + self.timestamps.sort(reverse=True) + + def __getitem__(self, idx: int) -> List[Path]: + res = self.runs[self.timestamps[idx]] + res.sort() + return res + + def __len__(self) -> int: + return len(self.runs) + + def items(self) -> Iterator[List[Path]]: + class TraceFilesIterator(Iterable[List[Path]]): + def __init__(self, trace_files: TraceFiles): + self.current = 0 + self.trace_files: TraceFiles = trace_files + + def __iter__(self): + return self + + def __next__(self) -> List[Path]: + if len(self.trace_files) <= self.current: + raise StopIteration + self.current += 1 + return self.trace_files[self.current - 1] + return TraceFilesIterator(self) + + +class Summary: + def __init__(self, binary: Path, runtime: float = 0, max_rss: int | None = None, + was_killed: bool = False, uid: uuid.UUID | None = None, expected_unseed: int | None = None, + exit_code: int = 0, valgrind_out_file: Path | None = None, stats: str | None = None, + error_out: str = None, will_restart: bool = False): + self.binary = binary + self.runtime: float = runtime + self.max_rss: int | None = max_rss + self.was_killed: bool = was_killed + self.expected_unseed: int | None = expected_unseed + self.exit_code: int = exit_code + self.out: SummaryTree = SummaryTree('Test') + self.test_begin_found: bool = False + self.test_end_found: bool = False + self.unseed: int | None = None + self.valgrind_out_file: Path | None = valgrind_out_file + self.severity_map: OrderedDict[tuple[str, int], int] = collections.OrderedDict() + self.error: bool = False + self.errors: int = 0 + self.warnings: int = 0 + self.coverage: OrderedDict[Coverage, bool] = collections.OrderedDict() + self.test_count: int = 0 + self.tests_passed: int = 0 + self.error_out = error_out + self.stderr_severity: str = '40' + self.will_restart: bool = will_restart + self.test_dir: Path | None = None + + if uid is not None: + self.out.attributes['TestUID'] = str(uid) + if stats is not None: + self.out.attributes['Statistics'] = stats + self.out.attributes['JoshuaSeed'] = str(config.joshua_seed) + self.out.attributes['WillRestart'] = '1' if self.will_restart else '0' + + self.handler = ParseHandler(self.out) + self.register_handlers() + + def summarize_files(self, trace_files: List[Path]): + assert len(trace_files) > 0 + for f in trace_files: + self.parse_file(f) + self.done() + + def summarize(self, trace_dir: Path, command: str): + self.test_dir = trace_dir + trace_files = TraceFiles(trace_dir) + if len(trace_files) == 0: + self.error = True + child = SummaryTree('NoTracesFound') + child.attributes['Severity'] = '40' + child.attributes['Path'] = str(trace_dir.absolute()) + child.attributes['Command'] = command + self.out.append(child) + return + self.summarize_files(trace_files[0]) + if config.joshua_dir is not None: + import test_harness.fdb + test_harness.fdb.write_coverage(config.cluster_file, + test_harness.fdb.str_to_tuple(config.joshua_dir) + ('coverage',), + 
test_harness.fdb.str_to_tuple(config.joshua_dir) + ('coverage-metadata',), + self.coverage) + + def list_simfdb(self) -> SummaryTree: + res = SummaryTree('SimFDB') + res.attributes['TestDir'] = str(self.test_dir) + if self.test_dir is None: + return res + simfdb = self.test_dir / Path('simfdb') + if not simfdb.exists(): + res.attributes['NoSimDir'] = "simfdb doesn't exist" + return res + elif not simfdb.is_dir(): + res.attributes['NoSimDir'] = 'simfdb is not a directory' + return res + for file in simfdb.iterdir(): + child = SummaryTree('Directory' if file.is_dir() else 'File') + child.attributes['Name'] = file.name + res.append(child) + return res + + def ok(self): + return not self.error + + def done(self): + if config.print_coverage: + for k, v in self.coverage.items(): + child = SummaryTree('CodeCoverage') + child.attributes['File'] = k.file + child.attributes['Line'] = str(k.line) + if not v: + child.attributes['Covered'] = '0' + if k.comment is not None and len(k.comment): + child.attributes['Comment'] = k.comment + self.out.append(child) + if self.warnings > config.max_warnings: + child = SummaryTree('WarningLimitExceeded') + child.attributes['Severity'] = '30' + child.attributes['WarningCount'] = str(self.warnings) + self.out.append(child) + if self.errors > config.max_errors: + child = SummaryTree('ErrorLimitExceeded') + child.attributes['Severity'] = '40' + child.attributes['ErrorCount'] = str(self.errors) + self.out.append(child) + if self.was_killed: + child = SummaryTree('ExternalTimeout') + child.attributes['Severity'] = '40' + self.out.append(child) + self.error = True + if self.max_rss is not None: + self.out.attributes['PeakMemory'] = str(self.max_rss) + if self.valgrind_out_file is not None: + try: + valgrind_errors = parse_valgrind_output(self.valgrind_out_file) + for valgrind_error in valgrind_errors: + if valgrind_error.kind.startswith('Leak'): + continue + self.error = True + child = SummaryTree('ValgrindError') + child.attributes['Severity'] = '40' + child.attributes['What'] = valgrind_error.what.what + child.attributes['Backtrace'] = valgrind_error.what.backtrace + aux_count = 0 + for aux in valgrind_error.aux: + child.attributes['WhatAux{}'.format(aux_count)] = aux.what + child.attributes['BacktraceAux{}'.format(aux_count)] = aux.backtrace + aux_count += 1 + self.out.append(child) + except Exception as e: + self.error = True + child = SummaryTree('ValgrindParseError') + child.attributes['Severity'] = '40' + child.attributes['ErrorMessage'] = str(e) + _, _, exc_traceback = sys.exc_info() + child.attributes['Trace'] = repr(traceback.format_tb(exc_traceback)) + self.out.append(child) + if not self.test_end_found: + child = SummaryTree('TestUnexpectedlyNotFinished') + child.attributes['Severity'] = '40' + self.out.append(child) + if self.error_out is not None and len(self.error_out) > 0: + lines = self.error_out.splitlines() + stderr_bytes = 0 + for line in lines: + if line.endswith("WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"): + # When running ASAN we expect to see this message. Boost coroutine should be using the correct asan annotations so that it shouldn't produce any false positives. 
+ continue + if line.endswith("Warning: unimplemented fcntl command: 1036"): + # Valgrind produces this warning when F_SET_RW_HINT is used + continue + if self.stderr_severity == '40': + self.error = True + remaining_bytes = config.max_stderr_bytes - stderr_bytes + if remaining_bytes > 0: + out_err = line[0:remaining_bytes] + ('...' if len(line) > remaining_bytes else '') + child = SummaryTree('StdErrOutput') + child.attributes['Severity'] = self.stderr_severity + child.attributes['Output'] = out_err + self.out.append(child) + stderr_bytes += len(line) + if stderr_bytes > config.max_stderr_bytes: + child = SummaryTree('StdErrOutputTruncated') + child.attributes['Severity'] = self.stderr_severity + child.attributes['BytesRemaining'] = str(stderr_bytes - config.max_stderr_bytes) + self.out.append(child) + + self.out.attributes['Ok'] = '1' if self.ok() else '0' + if not self.ok(): + reason = 'Unknown' + if self.error: + reason = 'ProducedErrors' + elif not self.test_end_found: + reason = 'TestDidNotFinish' + elif self.tests_passed == 0: + reason = 'NoTestsPassed' + elif self.test_count != self.tests_passed: + reason = 'Expected {} tests to pass, but only {} did'.format(self.test_count, self.tests_passed) + self.out.attributes['FailReason'] = reason + + def parse_file(self, file: Path): + parser: Parser + if file.suffix == '.json': + parser = JsonParser() + elif file.suffix == '.xml': + parser = XmlParser() + else: + child = SummaryTree('TestHarnessBug') + child.attributes['File'] = __file__ + frame = inspect.currentframe() + if frame is not None: + child.attributes['Line'] = str(inspect.getframeinfo(frame).lineno) + child.attributes['Details'] = 'Unexpected suffix {} for file {}'.format(file.suffix, file.name) + self.error = True + self.out.append(child) + return + with file.open('r') as f: + try: + parser.parse(f, self.handler) + except Exception as e: + child = SummaryTree('SummarizationError') + child.attributes['Severity'] = '40' + child.attributes['ErrorMessage'] = str(e) + self.out.append(child) + + def register_handlers(self): + def remap_event_severity(attrs): + if 'Type' not in attrs or 'Severity' not in attrs: + return None + k = (attrs['Type'], int(attrs['Severity'])) + if k in self.severity_map: + return str(self.severity_map[k]) + + self.handler.add_handler(('Severity', None), remap_event_severity) + + def program_start(attrs: Dict[str, str]): + if self.test_begin_found: + return + self.test_begin_found = True + self.out.attributes['RandomSeed'] = attrs['RandomSeed'] + self.out.attributes['SourceVersion'] = attrs['SourceVersion'] + self.out.attributes['Time'] = attrs['ActualTime'] + self.out.attributes['BuggifyEnabled'] = attrs['BuggifyEnabled'] + self.out.attributes['DeterminismCheck'] = '0' if self.expected_unseed is None else '1' + if self.binary.name != 'fdbserver': + self.out.attributes['OldBinary'] = self.binary.name + if 'FaultInjectionEnabled' in attrs: + self.out.attributes['FaultInjectionEnabled'] = attrs['FaultInjectionEnabled'] + + self.handler.add_handler(('Type', 'ProgramStart'), program_start) + + def set_test_file(attrs: Dict[str, str]): + test_file = Path(attrs['TestFile']) + cwd = Path('.').absolute() + try: + test_file = test_file.relative_to(cwd) + except ValueError: + pass + self.out.attributes['TestFile'] = str(test_file) + + self.handler.add_handler(('Type', 'Simulation'), set_test_file) + self.handler.add_handler(('Type', 'NonSimulationTest'), set_test_file) + + def set_elapsed_time(attrs: Dict[str, str]): + if self.test_end_found: + return + 
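All of the handlers registered in `register_handlers` share one calling convention: they are keyed on an `(attribute, value)` pair such as `('Type', 'ProgramStart')` (or `(attribute, None)` to match any value), they receive the full attribute dict of a single trace event, and a non-`None` return value replaces that attribute, which is how severity remapping works. A stripped-down sketch of that dispatch with invented events:

```python
from typing import Callable, Dict, List, Optional, Tuple

Handler = Callable[[Dict[str, str]], Optional[str]]


class MiniDispatch:
    """Sketch of the (attribute, value) -> handler dispatch used by ParseHandler."""

    def __init__(self):
        self.events: Dict[Tuple[str, Optional[str]], List[Handler]] = {}

    def add_handler(self, key: Tuple[str, Optional[str]], cb: Handler) -> None:
        self.events.setdefault(key, []).append(cb)

    def handle(self, attrs: Dict[str, str]) -> None:
        for k in list(attrs):
            v = attrs[k]
            for cb in self.events.get((k, None), []):   # wildcard handlers first
                remap = cb(attrs)
                if remap is not None:
                    v = attrs[k] = remap
            for cb in self.events.get((k, v), []):      # then exact-value handlers
                remap = cb(attrs)
                if remap is not None:
                    v = attrs[k] = remap


seeds: List[str] = []
dispatch = MiniDispatch()
dispatch.add_handler(('Type', 'ProgramStart'), lambda a: seeds.append(a['RandomSeed']))
dispatch.add_handler(('Severity', None), lambda a: '30' if a.get('Type') == 'Noisy' else None)

dispatch.handle({'Type': 'ProgramStart', 'RandomSeed': '42'})
noisy = {'Type': 'Noisy', 'Severity': '40'}
dispatch.handle(noisy)
assert seeds == ['42'] and noisy['Severity'] == '30'   # Noisy downgraded to a warning
```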
self.test_end_found = True + self.unseed = int(attrs['RandomUnseed']) + if self.expected_unseed is not None and self.unseed != self.expected_unseed: + severity = 40 if ('UnseedMismatch', 40) not in self.severity_map \ + else self.severity_map[('UnseedMismatch', 40)] + if severity >= 30: + child = SummaryTree('UnseedMismatch') + child.attributes['Unseed'] = str(self.unseed) + child.attributes['ExpectedUnseed'] = str(self.expected_unseed) + child.attributes['Severity'] = str(severity) + if severity >= 40: + self.error = True + self.out.append(child) + self.out.attributes['SimElapsedTime'] = attrs['SimTime'] + self.out.attributes['RealElapsedTime'] = attrs['RealTime'] + if self.unseed is not None: + self.out.attributes['RandomUnseed'] = str(self.unseed) + + self.handler.add_handler(('Type', 'ElapsedTime'), set_elapsed_time) + + def parse_warning(attrs: Dict[str, str]): + self.warnings += 1 + if self.warnings > config.max_warnings: + return + child = SummaryTree(attrs['Type']) + for k, v in attrs.items(): + if k != 'Type': + child.attributes[k] = v + self.out.append(child) + + self.handler.add_handler(('Severity', '30'), parse_warning) + + def parse_error(attrs: Dict[str, str]): + self.errors += 1 + self.error = True + if self.errors > config.max_errors: + return + child = SummaryTree(attrs['Type']) + for k, v in attrs.items(): + child.attributes[k] = v + self.out.append(child) + + self.handler.add_handler(('Severity', '40'), parse_error) + + def coverage(attrs: Dict[str, str]): + covered = True + if 'Covered' in attrs: + covered = int(attrs['Covered']) != 0 + comment = '' + if 'Comment' in attrs: + comment = attrs['Comment'] + c = Coverage(attrs['File'], attrs['Line'], comment) + if covered or c not in self.coverage: + self.coverage[c] = covered + + self.handler.add_handler(('Type', 'CodeCoverage'), coverage) + + def expected_test_pass(attrs: Dict[str, str]): + self.test_count = int(attrs['Count']) + + self.handler.add_handler(('Type', 'TestsExpectedToPass'), expected_test_pass) + + def test_passed(attrs: Dict[str, str]): + if attrs['Passed'] == '1': + self.tests_passed += 1 + + self.handler.add_handler(('Type', 'TestResults'), test_passed) + + def remap_event_severity(attrs: Dict[str, str]): + self.severity_map[(attrs['TargetEvent'], int(attrs['OriginalSeverity']))] = int(attrs['NewSeverity']) + + self.handler.add_handler(('Type', 'RemapEventSeverity'), remap_event_severity) + + def buggify_section(attrs: Dict[str, str]): + if attrs['Type'] == 'FaultInjected' or attrs.get('Activated', '0') == '1': + child = SummaryTree(attrs['Type']) + child.attributes['File'] = attrs['File'] + child.attributes['Line'] = attrs['Line'] + self.out.append(child) + self.handler.add_handler(('Type', 'BuggifySection'), buggify_section) + self.handler.add_handler(('Type', 'FaultInjected'), buggify_section) + + def running_unit_test(attrs: Dict[str, str]): + child = SummaryTree('RunningUnitTest') + child.attributes['Name'] = attrs['Name'] + child.attributes['File'] = attrs['File'] + child.attributes['Line'] = attrs['Line'] + self.handler.add_handler(('Type', 'RunningUnitTest'), running_unit_test) + + def stderr_severity(attrs: Dict[str, str]): + if 'NewSeverity' in attrs: + self.stderr_severity = attrs['NewSeverity'] + self.handler.add_handler(('Type', 'StderrSeverity'), stderr_severity) diff --git a/contrib/TestHarness2/test_harness/test_valgrind_parser.py b/contrib/TestHarness2/test_harness/test_valgrind_parser.py new file mode 100644 index 0000000000..0b36e8e6d5 --- /dev/null +++ 
b/contrib/TestHarness2/test_harness/test_valgrind_parser.py @@ -0,0 +1,16 @@ +import sys + +from test_harness.valgrind import parse_valgrind_output +from pathlib import Path + + +if __name__ == '__main__': + errors = parse_valgrind_output(Path(sys.argv[1])) + for valgrind_error in errors: + print('ValgrindError: what={}, kind={}'.format(valgrind_error.what.what, valgrind_error.kind)) + print('Backtrace: {}'.format(valgrind_error.what.backtrace)) + counter = 0 + for aux in valgrind_error.aux: + print('Aux {}:'.format(counter)) + print(' What: {}'.format(aux.what)) + print(' Backtrace: {}'.format(aux.backtrace)) diff --git a/contrib/TestHarness2/test_harness/timeout.py b/contrib/TestHarness2/test_harness/timeout.py new file mode 100644 index 0000000000..90af7096fd --- /dev/null +++ b/contrib/TestHarness2/test_harness/timeout.py @@ -0,0 +1,60 @@ +import argparse +import re +import sys + +from pathlib import Path +from test_harness.config import config +from test_harness.summarize import Summary, TraceFiles +from typing import Pattern, List + + +def files_matching(path: Path, pattern: Pattern, recurse: bool = True) -> List[Path]: + res: List[Path] = [] + for file in path.iterdir(): + if file.is_file() and pattern.match(file.name) is not None: + res.append(file) + elif file.is_dir() and recurse: + res += files_matching(file, pattern, recurse) + return res + + +def dirs_with_files_matching(path: Path, pattern: Pattern, recurse: bool = True) -> List[Path]: + res: List[Path] = [] + sub_directories: List[Path] = [] + has_file = False + for file in path.iterdir(): + if file.is_file() and pattern.match(file.name) is not None: + has_file = True + elif file.is_dir() and recurse: + sub_directories.append(file) + if has_file: + res.append(path) + if recurse: + for file in sub_directories: + res += dirs_with_files_matching(file, pattern, recurse=True) + res.sort() + return res + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('TestHarness Timeout', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + valgrind_files: List[Path] = [] + if config.use_valgrind: + valgrind_files = files_matching(Path.cwd(), re.compile(r'valgrind.*\.xml')) + + for directory in dirs_with_files_matching(Path.cwd(), re.compile(r'trace.*\.(json|xml)'), recurse=True): + trace_files = TraceFiles(directory) + for files in trace_files.items(): + if config.use_valgrind: + for valgrind_file in valgrind_files: + summary = Summary(Path('bin/fdbserver'), was_killed=True) + summary.valgrind_out_file = valgrind_file + summary.summarize_files(files) + summary.out.dump(sys.stdout) + else: + summary = Summary(Path('bin/fdbserver'), was_killed=True) + summary.summarize_files(files) + summary.out.dump(sys.stdout) diff --git a/contrib/TestHarness2/test_harness/valgrind.py b/contrib/TestHarness2/test_harness/valgrind.py new file mode 100644 index 0000000000..399b47c0cc --- /dev/null +++ b/contrib/TestHarness2/test_harness/valgrind.py @@ -0,0 +1,141 @@ +import enum +import xml +import xml.sax.handler +from pathlib import Path +from typing import List + + +class ValgrindWhat: + def __init__(self): + self.what: str = '' + self.backtrace: str = '' + + +class ValgrindError: + def __init__(self): + self.what: ValgrindWhat = ValgrindWhat() + self.kind: str = '' + self.aux: List[ValgrindWhat] = [] + + +# noinspection PyArgumentList +class ValgrindParseState(enum.Enum): + ROOT = enum.auto() + ERROR = enum.auto() + ERROR_AUX = enum.auto() + KIND = 
enum.auto() + WHAT = enum.auto() + TRACE = enum.auto() + AUX_WHAT = enum.auto() + STACK = enum.auto() + STACK_AUX = enum.auto() + STACK_IP = enum.auto() + STACK_IP_AUX = enum.auto() + + +class ValgrindHandler(xml.sax.handler.ContentHandler): + def __init__(self): + super().__init__() + self.stack: List[ValgrindError] = [] + self.result: List[ValgrindError] = [] + self.state_stack: List[ValgrindParseState] = [] + + def state(self) -> ValgrindParseState: + if len(self.state_stack) == 0: + return ValgrindParseState.ROOT + return self.state_stack[-1] + + @staticmethod + def from_content(content): + # pdb.set_trace() + if isinstance(content, bytes): + return content.decode() + assert isinstance(content, str) + return content + + def characters(self, content): + # pdb.set_trace() + state = self.state() + if len(self.state_stack) == 0: + return + else: + assert len(self.stack) > 0 + if state is ValgrindParseState.KIND: + self.stack[-1].kind += self.from_content(content) + elif state is ValgrindParseState.WHAT: + self.stack[-1].what.what += self.from_content(content) + elif state is ValgrindParseState.AUX_WHAT: + self.stack[-1].aux[-1].what += self.from_content(content) + elif state is ValgrindParseState.STACK_IP: + self.stack[-1].what.backtrace += self.from_content(content) + elif state is ValgrindParseState.STACK_IP_AUX: + self.stack[-1].aux[-1].backtrace += self.from_content(content) + + def startElement(self, name, attrs): + # pdb.set_trace() + if name == 'error': + self.stack.append(ValgrindError()) + self.state_stack.append(ValgrindParseState.ERROR) + if len(self.stack) == 0: + return + if name == 'kind': + self.state_stack.append(ValgrindParseState.KIND) + elif name == 'what': + self.state_stack.append(ValgrindParseState.WHAT) + elif name == 'auxwhat': + assert self.state() in [ValgrindParseState.ERROR, ValgrindParseState.ERROR_AUX] + self.state_stack.pop() + self.state_stack.append(ValgrindParseState.ERROR_AUX) + self.state_stack.append(ValgrindParseState.AUX_WHAT) + self.stack[-1].aux.append(ValgrindWhat()) + elif name == 'stack': + state = self.state() + assert state in [ValgrindParseState.ERROR, ValgrindParseState.ERROR_AUX] + if state == ValgrindParseState.ERROR: + self.state_stack.append(ValgrindParseState.STACK) + else: + self.state_stack.append(ValgrindParseState.STACK_AUX) + elif name == 'ip': + state = self.state() + assert state in [ValgrindParseState.STACK, ValgrindParseState.STACK_AUX] + if state == ValgrindParseState.STACK: + self.state_stack.append(ValgrindParseState.STACK_IP) + if len(self.stack[-1].what.backtrace) == 0: + self.stack[-1].what.backtrace = 'addr2line -e fdbserver.debug -p -C -f -i ' + else: + self.stack[-1].what.backtrace += ' ' + else: + self.state_stack.append(ValgrindParseState.STACK_IP_AUX) + if len(self.stack[-1].aux[-1].backtrace) == 0: + self.stack[-1].aux[-1].backtrace = 'addr2line -e fdbserver.debug -p -C -f -i ' + else: + self.stack[-1].aux[-1].backtrace += ' ' + + def endElement(self, name): + # pdb.set_trace() + if name == 'error': + self.result.append(self.stack.pop()) + self.state_stack.pop() + elif name == 'kind': + assert self.state() == ValgrindParseState.KIND + self.state_stack.pop() + elif name == 'what': + assert self.state() == ValgrindParseState.WHAT + self.state_stack.pop() + elif name == 'auxwhat': + assert self.state() == ValgrindParseState.AUX_WHAT + self.state_stack.pop() + elif name == 'stack': + assert self.state() in [ValgrindParseState.STACK, ValgrindParseState.STACK_AUX] + self.state_stack.pop() + elif name == 'ip': + 
self.state_stack.pop() + state = self.state() + assert state in [ValgrindParseState.STACK, ValgrindParseState.STACK_AUX] + + +def parse_valgrind_output(valgrind_out_file: Path) -> List[ValgrindError]: + handler = ValgrindHandler() + with valgrind_out_file.open('r') as f: + xml.sax.parse(f, handler) + return handler.result diff --git a/contrib/TestHarness2/test_harness/version.py b/contrib/TestHarness2/test_harness/version.py new file mode 100644 index 0000000000..fe04206a8a --- /dev/null +++ b/contrib/TestHarness2/test_harness/version.py @@ -0,0 +1,66 @@ +from functools import total_ordering +from pathlib import Path +from typing import Tuple + + +@total_ordering +class Version: + def __init__(self): + self.major: int = 0 + self.minor: int = 0 + self.patch: int = 0 + + def version_tuple(self): + return self.major, self.minor, self.patch + + def _compare(self, other) -> int: + lhs: Tuple[int, int, int] = self.version_tuple() + rhs: Tuple[int, int, int] + if isinstance(other, Version): + rhs = other.version_tuple() + else: + rhs = Version.parse(str(other)).version_tuple() + if lhs < rhs: + return -1 + elif lhs > rhs: + return 1 + else: + return 0 + + def __eq__(self, other) -> bool: + return self._compare(other) == 0 + + def __lt__(self, other) -> bool: + return self._compare(other) < 0 + + def __hash__(self): + return hash(self.version_tuple()) + + def __str__(self): + return format('{}.{}.{}'.format(self.major, self.minor, self.patch)) + + @staticmethod + def of_binary(binary: Path): + parts = binary.name.split('-') + if len(parts) != 2: + return Version.max_version() + return Version.parse(parts[1]) + + @staticmethod + def parse(version: str): + version_tuple = version.split('.') + self = Version() + self.major = int(version_tuple[0]) + if len(version_tuple) > 1: + self.minor = int(version_tuple[1]) + if len(version_tuple) > 2: + self.patch = int(version_tuple[2]) + return self + + @staticmethod + def max_version(): + self = Version() + self.major = 2**32 - 1 + self.minor = 2**32 - 1 + self.patch = 2**32 - 1 + return self diff --git a/contrib/boost_zstd/zstd.cpp b/contrib/boost_zstd/zstd.cpp new file mode 100644 index 0000000000..9b2885ede1 --- /dev/null +++ b/contrib/boost_zstd/zstd.cpp @@ -0,0 +1,149 @@ +// (C) Copyright Reimar Döffinger 2018. +// Based on zstd.cpp by: +// (C) Copyright Milan Svoboda 2008. +// (C) Copyright Jonathan Turkanis 2003. +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt.) + +// See http://www.boost.org/libs/iostreams for documentation. + +// Define BOOST_IOSTREAMS_SOURCE so that +// knows that we are building the library (possibly exporting code), rather +// than using it (possibly importing code). +#define BOOST_IOSTREAMS_SOURCE + +#include + +#include +#include +#include + +namespace boost { +namespace iostreams { + +namespace zstd { +// Compression levels + +const uint32_t best_speed = 1; +const uint32_t best_compression = 19; +const uint32_t default_compression = 3; + +// Status codes + +const int okay = 0; +const int stream_end = 1; + +// Flush codes + +const int finish = 0; +const int flush = 1; +const int run = 2; +} // End namespace zstd. 
+ +//------------------Implementation of zstd_error------------------------------// + +zstd_error::zstd_error(size_t error) : BOOST_IOSTREAMS_FAILURE(ZSTD_getErrorName(error)), error_(error) {} + +void zstd_error::check BOOST_PREVENT_MACRO_SUBSTITUTION(size_t error) { + if (ZSTD_isError(error)) + boost::throw_exception(zstd_error(error)); +} + +//------------------Implementation of zstd_base-------------------------------// + +namespace detail { + +zstd_base::zstd_base() + : cstream_(ZSTD_createCStream()), dstream_(ZSTD_createDStream()), in_(new ZSTD_inBuffer), out_(new ZSTD_outBuffer), + eof_(0) {} + +zstd_base::~zstd_base() { + ZSTD_freeCStream(static_cast(cstream_)); + ZSTD_freeDStream(static_cast(dstream_)); + delete static_cast(in_); + delete static_cast(out_); +} + +void zstd_base::before(const char*& src_begin, const char* src_end, char*& dest_begin, char* dest_end) { + ZSTD_inBuffer* in = static_cast(in_); + ZSTD_outBuffer* out = static_cast(out_); + in->src = src_begin; + in->size = static_cast(src_end - src_begin); + in->pos = 0; + out->dst = dest_begin; + out->size = static_cast(dest_end - dest_begin); + out->pos = 0; +} + +void zstd_base::after(const char*& src_begin, char*& dest_begin, bool) { + ZSTD_inBuffer* in = static_cast(in_); + ZSTD_outBuffer* out = static_cast(out_); + src_begin = reinterpret_cast(in->src) + in->pos; + dest_begin = reinterpret_cast(out->dst) + out->pos; +} + +int zstd_base::deflate(int action) { + ZSTD_CStream* s = static_cast(cstream_); + ZSTD_inBuffer* in = static_cast(in_); + ZSTD_outBuffer* out = static_cast(out_); + // Ignore spurious extra calls. + // Note size > 0 will trigger an error in this case. + if (eof_ && in->size == 0) + return zstd::stream_end; + size_t result = ZSTD_compressStream(s, out, in); + zstd_error::check BOOST_PREVENT_MACRO_SUBSTITUTION(result); + if (action != zstd::run) { + result = action == zstd::finish ? ZSTD_endStream(s, out) : ZSTD_flushStream(s, out); + zstd_error::check BOOST_PREVENT_MACRO_SUBSTITUTION(result); + eof_ = action == zstd::finish && result == 0; + return result == 0 ? zstd::stream_end : zstd::okay; + } + return zstd::okay; +} + +int zstd_base::inflate(int action) { + ZSTD_DStream* s = static_cast(dstream_); + ZSTD_inBuffer* in = static_cast(in_); + ZSTD_outBuffer* out = static_cast(out_); + // need loop since iostream code cannot handle short reads + do { + size_t result = ZSTD_decompressStream(s, out, in); + zstd_error::check BOOST_PREVENT_MACRO_SUBSTITUTION(result); + } while (in->pos < in->size && out->pos < out->size); + return action == zstd::finish && in->size == 0 && out->pos == 0 ? zstd::stream_end : zstd::okay; +} + +void zstd_base::reset(bool compress, bool realloc) { + ZSTD_inBuffer* in = static_cast(in_); + ZSTD_outBuffer* out = static_cast(out_); + if (realloc) { + memset(in, 0, sizeof(*in)); + memset(out, 0, sizeof(*out)); + eof_ = 0; + + zstd_error::check BOOST_PREVENT_MACRO_SUBSTITUTION( + compress ? ZSTD_initCStream(static_cast(cstream_), level) + : ZSTD_initDStream(static_cast(dstream_))); + } +} + +void zstd_base::do_init(const zstd_params& p, bool compress, zstd::alloc_func, zstd::free_func, void*) { + ZSTD_inBuffer* in = static_cast(in_); + ZSTD_outBuffer* out = static_cast(out_); + + memset(in, 0, sizeof(*in)); + memset(out, 0, sizeof(*out)); + eof_ = 0; + + level = p.level; + zstd_error::check BOOST_PREVENT_MACRO_SUBSTITUTION( + compress ? ZSTD_initCStream(static_cast(cstream_), level) + : ZSTD_initDStream(static_cast(dstream_))); +} + +} // End namespace detail. 
+ +//----------------------------------------------------------------------------// + +} // namespace iostreams +} // namespace boost diff --git a/contrib/libb64/cdecode.c b/contrib/libb64/cdecode.c index 7148223625..5a833ab689 100644 --- a/contrib/libb64/cdecode.c +++ b/contrib/libb64/cdecode.c @@ -7,16 +7,16 @@ For details, see http://sourceforge.net/projects/libb64 #include "libb64/cdecode.h" -int base64_decode_value(char value_in) { - static const char decoding[] = { 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -2, -1, - -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, - 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 }; - static const char decoding_size = sizeof(decoding); +int base64_decode_value(int value_in) { + static const int decoding[] = { 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -2, -1, + -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 }; + static const int decoding_size = sizeof(decoding) / sizeof(decoding[0]); value_in -= 43; - if (value_in < 0 || value_in > decoding_size) + if (value_in < 0 || value_in >= decoding_size) return -1; - return decoding[(int)value_in]; + return decoding[value_in]; } void base64_init_decodestate(base64_decodestate* state_in) { @@ -27,7 +27,7 @@ void base64_init_decodestate(base64_decodestate* state_in) { int base64_decode_block(const char* code_in, const int length_in, char* plaintext_out, base64_decodestate* state_in) { const char* codechar = code_in; char* plainchar = plaintext_out; - char fragment; + int fragment = 0; *plainchar = state_in->plainchar; @@ -40,9 +40,9 @@ int base64_decode_block(const char* code_in, const int length_in, char* plaintex state_in->plainchar = *plainchar; return plainchar - plaintext_out; } - fragment = (char)base64_decode_value(*codechar++); + fragment = base64_decode_value(*codechar++); } while (fragment < 0); - *plainchar = (fragment & 0x03f) << 2; + *plainchar = (char)((fragment & 0x03f) << 2); case step_b: do { if (codechar == code_in + length_in) { @@ -50,10 +50,10 @@ int base64_decode_block(const char* code_in, const int length_in, char* plaintex state_in->plainchar = *plainchar; return plainchar - plaintext_out; } - fragment = (char)base64_decode_value(*codechar++); + fragment = base64_decode_value(*codechar++); } while (fragment < 0); - *plainchar++ |= (fragment & 0x030) >> 4; - *plainchar = (fragment & 0x00f) << 4; + *plainchar++ |= (char)((fragment & 0x030) >> 4); + *plainchar = (char)((fragment & 0x00f) << 4); case step_c: do { if (codechar == code_in + length_in) { @@ -61,10 +61,10 @@ int base64_decode_block(const char* code_in, const int length_in, char* plaintex state_in->plainchar = *plainchar; return plainchar - plaintext_out; } - fragment = (char)base64_decode_value(*codechar++); + fragment = base64_decode_value(*codechar++); } while (fragment < 0); - *plainchar++ |= (fragment & 0x03c) >> 2; - *plainchar = (fragment & 0x003) << 6; + *plainchar++ |= (char)((fragment & 0x03c) >> 2); + *plainchar = (char)((fragment & 0x003) << 6); case step_d: do { if (codechar == code_in + length_in) { @@ -72,9 +72,9 @@ int base64_decode_block(const char* code_in, const int length_in, char* plaintex state_in->plainchar = *plainchar; return 
plainchar - plaintext_out; } - fragment = (char)base64_decode_value(*codechar++); + fragment = base64_decode_value(*codechar++); } while (fragment < 0); - *plainchar++ |= (fragment & 0x03f); + *plainchar++ |= (char)((fragment & 0x03f)); } } /* control should not reach here */ diff --git a/contrib/libb64/include/libb64/cdecode.h b/contrib/libb64/include/libb64/cdecode.h index 26d5873f22..04655b3a95 100644 --- a/contrib/libb64/include/libb64/cdecode.h +++ b/contrib/libb64/include/libb64/cdecode.h @@ -17,7 +17,7 @@ typedef struct { void base64_init_decodestate(base64_decodestate* state_in); -int base64_decode_value(char value_in); +int base64_decode_value(int value_in); int base64_decode_block(const char* code_in, const int length_in, char* plaintext_out, base64_decodestate* state_in); diff --git a/contrib/observability_splunk_dashboard/details.xml b/contrib/observability_splunk_dashboard/details.xml new file mode 100644 index 0000000000..70ff15883b --- /dev/null +++ b/contrib/observability_splunk_dashboard/details.xml @@ -0,0 +1,431 @@ +
+ + Details for FoundationDB Cluster +
+ + + * + + + + * + + + + + -60m@m + now + + + + + Default + 5 seconds + 1 minute + 10 minutes + 1 hour + 1 day + bins=100 + bins=100 + + + + All + Storage Server + Transaction Log + Proxy + Resolver + Master + Cluster Controller + Log Router + Data Distributor + Ratekeeper + Tester + + + + + * + + + + * + +
+ + + + Storage Queue Size + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?<InputRate>.*) (?<InputRoughness>.*) (?<InputCounter>.*)" | rex field=BytesDurable "(?<DurableRate>.*) (?<DurableRoughness>.*) (?<DurableCounter>.*)" | eval QueueSize=InputCounter-DurableCounter | timechart $Span$ avg(QueueSize) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Storage Input Rate + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?<InputRate>.*) (?<InputRoughness>.*) (?<InputCounter>.*)" | timechart $Span$ avg(InputRate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Storage Bytes Queried + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesQueried "(?<Rate>.*) (?<Roughness>.*) (?<Counter>.*)" | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Average Process CPU by Role (capped at 2; beware kernel bug) + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ avg(Cpu) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Max Process CPU by Role (capped at 2; beware kernel bug) + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ max(Cpu) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Disk Busyness + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=ProcessMetrics TrackLatestType=Original | eval DiskBusyPercentage=(Elapsed-DiskIdleSeconds)/Elapsed | timechart $Span$ avg(DiskBusyPercentage) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Max Run Loop Busyness by Role (for <=6.1, S2Pri1) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics NOT TrackLatestType=Rolled | eval Busyness=if(isnull(PriorityStarvedBelow1), if(isnull(PriorityBusy1), S2Pri1, PriorityBusy1/Elapsed), PriorityStarvedBelow1/Elapsed) | timechart $Span$ max(Busyness) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Max Run Loop Busyness by Priority (6.2+ only) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics TrackLatestType=Original | foreach PriorityBusy* [eval Busyness<<MATCHSTR>>=PriorityBusy<<MATCHSTR>>/Elapsed] | timechart $Span$ max(Busyness*) + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + TLog Queue Size + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval QueueSize=SharedBytesInput-SharedBytesDurable | timechart $Span$ avg(QueueSize) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Connection Timeouts (counted on both sides of connection) + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) $Roles$ host=$Host$ | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr) | rex field=WithAddr "(?<OtherAddr>[^:]*:[^:]*).*" | eval Machine=Machine+","+OtherAddr | makemv delim="," Machine | search Machine=$Machine$ | eval Count=1+SuppressedEventCount | timechart 
sum(Count) by Machine useother=f + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Pairwise Connection Timeouts Between Datacenters + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) host=* Machine=* NOT TrackLatestType=Rolled +| eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr) +| rex field=host "(?<Datacenter>..).*" +| eval Datacenter=if(isnotnull(pie_work_unit), pie_work_unit, Datacenter) +| rex field=WithAddr "(?<OtherIP>[^:]*):.*" +| join OtherIP + [search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled + | rex field=Machine "(?<OtherIP>[^:]*):.*" + | rex field=host "(?<OtherDatacenter>..).*" + | eval OtherDatacenter=if(isnotnull(pie_work_unit), pie_work_unit, OtherDatacenter)] +| eval DC1=if(Datacenter>OtherDatacenter, Datacenter, OtherDatacenter), DC2=if(Datacenter>OtherDatacenter, OtherDatacenter, Datacenter) +| eval Connection=DC1+" <-> " + DC2 +| eval Count=1+SuppressedEventCount +| timechart count by Connection + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Pairwise Connection Timeouts Between Known Server Processes (Sorted by Count, descending) + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut OR Type=ProcessMetrics) $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr), Reason=if(Type=="ConnectionTimedOut", "Timed out trying to connect", "Established connection timed out") | rex field=Machine "(?<IP>[^:]*):.*" | rex field=host "(?<Datacenter>..).*" | rex field=WithAddr "(?<OtherIP>[^:]*):.*" | eventstats values(Roles) as Roles by IP | join OtherIP [search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled | rex field=Machine "(?<OtherIP>[^:]*):.*" | rex field=host "(?<OtherDatacenter>..).*" | stats values(Roles) as OtherRoles by OtherIP, OtherDatacenter | eval OtherRoles="("+mvjoin(OtherRoles,",")+")"] | eval Roles="("+mvjoin(Roles,",")+")" | eval IP=Datacenter+": "+IP+" "+Roles, OtherIP=OtherDatacenter+": "+OtherIP+" "+OtherRoles | eval Addr1=if(IP>OtherIP, IP, OtherIP), Addr2=if(IP>OtherIP, OtherIP, IP) | eval Connection=Addr1+" <-> " + Addr2 | eval Count=1+SuppressedEventCount | stats sum(Count) as Count, values(Reason) as Reasons by Connection | sort -Count + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
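The Storage Queue Size panel above recovers per-server queue depth from FoundationDB's counter-style trace fields: BytesInput and BytesDurable are each logged as a "rate roughness counter" triple, and the queue is the difference of the two running counters. A minimal offline sketch of the same arithmetic, assuming trace events have already been parsed into dictionaries (the field layout follows the rex patterns in the search above; the event shape and values are illustrative, not an exact trace schema):

# Sketch: per-storage-server queue size from StorageMetrics-style events.
# Assumes BytesInput / BytesDurable are "rate roughness counter" triples,
# as the rex extractions in the panel above expect.
def counter(field: str) -> float:
    """Return the running counter (third element) of a counter-style field."""
    return float(field.split()[2])

def queue_size(event: dict) -> float:
    """Queue size in bytes: input counter minus durable counter."""
    return counter(event["BytesInput"]) - counter(event["BytesDurable"])

sample = {  # illustrative event; values are made up
    "Machine": "10.0.0.1:4500",
    "BytesInput": "1048576.0 0.1 908374651.0",
    "BytesDurable": "1048000.0 0.1 908100000.0",
}
print(queue_size(sample))  # -> 274651.0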
+ + + + Lazy Deletion Rate (making space available for reuse) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=LazyDeletePages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time), as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Vacuuming Rate (shrinking file) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=VacuumedPages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time), as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Roles + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | makemv delim="," Roles | mvexpand Roles | timechart $Span$ distinct_count(Machine) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + + Slow Tasks (Sorted by Duration, Descending) + + index=$Index$ LogGroup=$LogGroup$ Type=SlowTask $Roles$ host=$Host$ Machine=$Machine$ | sort -Duration | table _time, Duration, Machine, TaskID, Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + +
+
+ + + Event Counts (Sorted by Severity and Count, Descending) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | stats count as Count by Type, Severity | sort -Severity, -Count + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
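The Event Counts panel above is a plain group-and-count over Type and Severity. The same tally can be reproduced offline with a Counter; a small sketch, assuming events are dictionaries with Type and Severity keys (the sample events are illustrative):

from collections import Counter

# Sketch: count trace events by (Severity, Type), highest severity first,
# mirroring "stats count by Type, Severity | sort -Severity, -Count".
def event_counts(events):
    counts = Counter((int(e.get("Severity", 10)), e["Type"]) for e in events)
    # Sort by severity descending, then by count descending.
    return sorted(counts.items(), key=lambda kv: (-kv[0][0], -kv[1]))

events = [  # illustrative events
    {"Type": "SlowTask", "Severity": "30"},
    {"Type": "SlowTask", "Severity": "30"},
    {"Type": "InternalError", "Severity": "40"},
]
for (severity, event_type), count in event_counts(events):
    print(severity, event_type, count)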
+ + + Errors + + index=$Index$ LogGroup=$LogGroup$ Severity=40 $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | table _time, Type, Machine, Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + +
+
+
+ + + + Recoveries (Ignores Filters) + + index=$Index$ LogGroup=$LogGroup$ Type=MasterRecoveryState TrackLatestType=Original (StatusCode=0 OR StatusCode=11) | eval RecoveryResetInterval=10 | sort _time | streamstats earliest(_time) as RecoveryStart, count as EventCount reset_after="(StatusCode=11)" | where StatusCode=11 | eval EventCount=if(EventCount==1, 2, EventCount), RecoveryStart=if(RecoveryStart==_time, _time-RecoveryDuration, RecoveryStart) | sort -_time | streamstats current=f global=f window=1 first(RecoveryStart) as NextRecoveryStart | eval RecoverySpan=NextRecoveryStart-_time, FailedRecoveries=EventCount-2, SuccessfulRecoveries=1 | eval AvailableSeconds=if(RecoverySpan<RecoveryResetInterval, RecoverySpan, 0) | sort _time | streamstats earliest(RecoveryStart) as RecoveryStart, sum(FailedRecoveries) as FailedRecoveryCount, sum(SuccessfulRecoveries) as SuccessfulRecoveryCount, sum(AvailableSeconds) as AvailableSeconds reset_after="(NOT RecoverySpan < RecoveryResetInterval)" | where NOT RecoverySpan < RecoveryResetInterval | eval Duration=_time-RecoveryStart, StartTime=strftime(RecoveryStart, "%F %X.%Q"), ShortLivedRecoveryCount=SuccessfulRecoveryCount-1 | table StartTime, Duration, FailedRecoveryCount, ShortLivedRecoveryCount, AvailableSeconds | sort -StartTime + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+ + + Process (Re)starts + + index=$Index$ LogGroup=$LogGroup$ Type=ProgramStart TrackLatestType=Original $Roles$ host=$Host$ Machine=$Machine$ | table _time, Machine | sort -_time + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+ + + Failure Detection (Machine Filter Only) + + index=$Index$ LogGroup=$LogGroup$ Type=FailureDetectionStatus System=$Machine$ | sort _time | eval Failed=if(Status=="Failed", 1, 0) | streamstats current=t global=f window=2 first(Failed) as PrevFailed by System | where PrevFailed=1 OR Failed=1 | eval Failed=PrevFailed + "," + Failed | makemv delim="," Failed | mvexpand Failed | timechart $Span$ max(Failed) by System + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + +
+ + + + Storage Server Space Usage (Sorted by Available Space Percentage, Ascending) + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBStored=BytesStored/1e9, Overhead=KvstoreBytesUsed/BytesStored, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeSpacePercent) as FreeSpacePercent, latest(GBStored) as GBStored, latest(GBUsed) as GBUsed, latest(Overhead) as OverheadFactor, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
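The Storage Server Space Usage table above reduces to a handful of ratios per server: available and free space as fractions of the total, GB used and stored, and a used/stored overhead factor. A hedged sketch of the same evals, assuming the latest StorageMetrics sample per machine is available as a dictionary of numeric fields (field names follow the search above; the sample values are illustrative):

# Sketch: space-usage ratios for one storage server, mirroring the panel's evals.
def space_usage(sample: dict) -> dict:
    total = sample["KvstoreBytesTotal"]
    return {
        "Machine": sample["Machine"],
        "AvailableSpacePercent": sample["KvstoreBytesAvailable"] / total,
        "FreeSpacePercent": sample["KvstoreBytesFree"] / total,
        "GBStored": sample["BytesStored"] / 1e9,
        "GBUsed": sample["KvstoreBytesUsed"] / 1e9,
        "OverheadFactor": sample["KvstoreBytesUsed"] / sample["BytesStored"],
        "GBTotalSpace": total / 1e9,
    }

samples = [  # illustrative sample; values are made up
    {"Machine": "10.0.0.1:4500", "KvstoreBytesTotal": 4e11, "KvstoreBytesAvailable": 1e11,
     "KvstoreBytesFree": 9e10, "KvstoreBytesUsed": 3e11, "BytesStored": 2.5e11},
]
# Sort ascending by available-space percentage, like the dashboard table.
for row in sorted(map(space_usage, samples), key=lambda r: r["AvailableSpacePercent"]):
    print(row)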
+ + + + TLog Server Space Usage (Sorted by Available Space Percentage, Ascending) + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics host=* Machine=* TrackLatestType=Original Roles=TL | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeDiskSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeDiskSpacePercent) as FreeDiskSpacePercent, latest(GBUsed) as GBUsed, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
+ + + + Data Movement by Type (Log Scale, Ignores Filters) + + index=$Index$ LogGroup=$LogGroup$ Type=MovingData TrackLatestType=Original | timechart avg(Priority*) as * + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Storage Server Max Bytes Stored by Host + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval GBStored=BytesStored/1e9 | timechart max(GBStored) by host limit=100 + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Master Failed Clients + + index=$Index$ LogGroup=$LogGroup$ Type=WaitFailureClient +| stats count by FailedEndpoint + $TimeRange.earliest$ + $TimeRange.latest$ + + +
+
+
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/performance_overview.xml b/contrib/observability_splunk_dashboard/performance_overview.xml new file mode 100644 index 0000000000..0719e2bbab --- /dev/null +++ b/contrib/observability_splunk_dashboard/performance_overview.xml @@ -0,0 +1,323 @@ +
+ +
+ + + * + + + + + + + + + -60m@m + now + + + + + Normal + Batch + + + + + 60s + +
+ + + Transaction Rate measured on Proxies + + Sum in $ChartBinSizeToken$ seconds + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " TxnThrottled +| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), TxnThrottledRate=mvindex(TxnThrottled, 0) +| timechart span=$ChartBinSizeToken$ sum(TxnRequestInRate) as StartedTxnBatchRate, sum(TxnRequestOutRate) as FinishedTxnBatchRate, sum(TxnStartInRate) as StartedTxnRate, sum(TxnStartOutRate) as FinishedTxnRate, sum(TxnThrottledRate) as ThrottledTxnRate + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Read Rate measured on Storage Servers + + Average in $ChartBinSizeToken$ seconds + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" +| rex field=BytesQueried "(?<RRate>.*) (?<RRoughness>.*) (?<RCounter>.*)" +| rex field=RowsQueried "(?<KRate>.*) (?<KRoughness>.*) (?<KCounter>.*)" +| rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" +| rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" +| timechart span=$ChartBinSizeToken$ avg(RRate) as BytesReadPerSecond, avg(KRate) as RowsReadPerSecond, avg(FRate) as DDReadPerSecond + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Write Rate measured on Proxies + + 1min Average + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " MutationBytes +| makemv delim=" " Mutations +| eval MutationBytesRate=mvindex(MutationBytes, 0), MutationsRate=mvindex(Mutations,0) +| bucket span=5s _time +| stats sum(MutationBytesRate) as MutationBytes, sum(MutationsRate) as Mutations by _time +|eval MutationMB=MutationBytes/1024/1024, MutationsK=Mutations/1000 +| timechart span=$ChartBinSizeToken$ avg(MutationMB) as MutationMB, avg(MutationsK) as MutationsK + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Write Rate measured on Storage Servers + + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" +| rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" +| rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" +| timechart span=$ChartBinSizeToken$ avg(WRate) as BytesPerSecond, avg(FRate) as DDBytesWrittenPerSecond + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + GRV Latency measured on all Proxies + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=GRVLatencyMetrics AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Commit Latency measured on all Proxies + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=CommitLatencyMetrics AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Read Latency measured on all Storage Servers + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=ReadLatencyMetrics AND 
TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + RateKeeper: ReleasedTPS vs LimitTPS + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time ReleasedTPS TPSLimit +| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + RateKeeper: Throttling Reason + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time Reason + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + + RateKeeper: Throttling Server + + Ratekeeper: Limit Reason: ReasonServerID (Most recent 10 records) + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate AND TrackLatestType="Original" +| streamstats count as numOfEvents +| where numOfEvents < 10 +| eval DateTime=strftime(Time, "%Y-%m-%dT%H:%M:%S") +| table DateTime, ReasonServerID + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + +
+
+
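The Transaction Rate and Write Rate panels above read the first element of each multivalue counter field (TxnRequestIn, TxnStartIn, MutationBytes, ...) as a per-second rate and sum it across proxies. A minimal sketch of that extraction, under the same "rate roughness counter" assumption about the field layout (the events and values are illustrative):

# Sketch: cluster-wide transaction start rate from proxy counter fields.
# The first element of a counter-style field is the per-second rate,
# matching the mvindex(..., 0) extraction in the panels above.
def rate(field: str) -> float:
    return float(field.split()[0])

def cluster_txn_start_rate(proxy_events) -> float:
    """Sum the TxnStartIn rate over all proxies in one time bucket."""
    return sum(rate(e["TxnStartIn"]) for e in proxy_events)

proxy_events = [  # illustrative ProxyMetrics/GrvProxyMetrics-style events
    {"Machine": "10.0.0.2:4500", "TxnStartIn": "1200.0 0.2 98301234.0"},
    {"Machine": "10.0.0.3:4500", "TxnStartIn": "1350.0 0.2 87233311.0"},
]
print(cluster_txn_start_rate(proxy_events))  # -> 2550.0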
+ + + Disk Overhead = Disk Usage / Logical KV Size + + Y-axis is capped at 10 + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type=StorageMetrics OR Type=DDTrackerStats) TrackLatestType=Original +| bucket _time span=5s +| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes, avg(TotalSizeBytes) as LogicalKVBytes by _time +| eval overhead=StorageDiskUsedBytes/LogicalKVBytes +| timechart avg(overhead) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + KV Data Size + + + index=$Index$ LogGroup=$LogGroup$ +Roles=*DD* host=* Machine=* Type=DDTrackerStats TrackLatestType=Original +| eval TotalKVGB=TotalSizeBytes/1024/1024/1024, SystemKVGB=SystemSizeBytes/1024/1024/1024 +|timechart avg(TotalKVGB), avg(SystemKVGB), avg(Shards) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Disk Usage + + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* Type=StorageMetrics TrackLatestType=Original +| bucket _time span=5s +| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes by _time +|eval StorageDiskTotalMB = StorageDiskTotalBytes/1024/1024, StorageDiskUsedMB=StorageDiskUsedBytes/1024/1024 +| timechart avg(StorageDiskTotalMB) as StorageDiskTotalMB, avg(StorageDiskUsedMB) as StorageDiskUsedMB + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Cluster Roles + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics TrackLatestType="Original" +| rex field=host "(?<HostDC>..).*-..(?<HostConfig>..).*" +| eval HostDC=if(isnotnull(pie_work_unit), pie_work_unit, HostDC) +| makemv delim="," Roles +| stats dc(Machine) as MachineCount by Roles, HostDC +| stats list(HostDC), list(MachineCount) by Roles +| sort Roles + $TimeSpan.earliest$ + $TimeSpan.latest$ + + +
+
+
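The Disk Overhead panel above defines overhead as total storage disk bytes used divided by the logical key-value size reported by data distribution. A small sketch of that ratio, assuming one aggregated set of samples per time bucket (field names follow the search above; values are illustrative):

# Sketch: disk overhead = sum of KvstoreBytesUsed over storage servers
#         divided by the logical KV size from DDTrackerStats.
def disk_overhead(storage_samples, logical_kv_bytes):
    disk_used = sum(s["KvstoreBytesUsed"] for s in storage_samples)
    return disk_used / logical_kv_bytes

storage_samples = [  # illustrative per-server samples
    {"KvstoreBytesUsed": 3.0e11},
    {"KvstoreBytesUsed": 2.8e11},
]
print(disk_overhead(storage_samples, logical_kv_bytes=2.0e11))  # -> 2.9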
+ + + Storage Engine + + + index=$Index$ LogGroup=$LogGroup$ Type=Role Origination=Recruited As=StorageServer | table StorageEngine, OriginalDateTime, DateTime |head 2 + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + +
+
+ + Cluster Generations + + Indicate FDB recoveries + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics |timechart max(Generation) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + +
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/ratekeeper.xml b/contrib/observability_splunk_dashboard/ratekeeper.xml new file mode 100644 index 0000000000..c4a31a8fbc --- /dev/null +++ b/contrib/observability_splunk_dashboard/ratekeeper.xml @@ -0,0 +1,928 @@ +
+ +
+ + + * + + + + + + + + + -60m@m + now + + + + + Normal + Batch + + + + + 30s + + + + Yes + No + + + + + MasterServer + MasterProxyServer + StorageServer + TLog + Resolver + GrvProxyServer + CommitProxyServer + + + + MasterServer + MasterProxyServer + Resolver + TLog + StorageServer + GrvProxyServer + CommitProxyServer + + + + MasterServer + MasterProxyServer + Resolver + TLog + StorageServer + GrvProxyServer + CommitProxyServer + +
+ + + Aggregated Storage Server Bandwidth + + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" + | rex field=BytesQueried "(?<RRate>.*) (?<RRoughness>.*) (?<RCounter>.*)" + | rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" + | rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" + | bin span=5s _time + | stats sum(RRate) as ReadSum, sum(WRate) as WriteSum, sum(FRate) as FetchedKeyRate by _time + | eval ReadSpeedMB=ReadSum/1024/1024, WriteSpeedMB=WriteSum/1024/1024, FetchedKeyRateMB=FetchedKeyRate/1024/1024 + |timechart avg(ReadSpeedMB), avg(WriteSpeedMB), avg(FetchedKeyRateMB) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Aggregated Proxy Bandwidth + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " MutationBytes +| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), MutationBytesRate=mvindex(MutationBytes, 0) +| bin span=60s _time +| stats avg(TxnRequestInRate) as TxnRequestInRatePerHost, avg(TxnRequestOutRate) as TxnRequestOutRatePerHost, avg(TxnStartInRate) as TxnStartInRatePerHost, avg(TxnStartOutRate) as TxnStartOutRatePerHost, avg(MutationBytesRate) as MutationBytesRatePerHost by Machine,_time +| eval WriteThroughputKB=sum(MutationBytesRatePerHost)/1000 +| timechart span=1m sum(TxnRequestInRatePerHost), sum(TxnRequestOutRatePerHost), sum(TxnStartInRatePerHost), sum(TxnStartOutRatePerHost), sum(WriteThroughputKB) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 1: Overview - GRV Arrivals and Leaves per Second Seen by Proxies + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| eval TxnRequestIn=mvindex(TxnRequestIn, 0), TxnRequestOut=mvindex(TxnRequestOut, 0), TxnStartIn=mvindex(TxnStartIn, 0), TxnStartOut=mvindex(TxnStartOut, 0) +| timechart span=30s avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) by Machine + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + Chart 2: RKOverview - Input ReleasedTPS and Output TPSLimit + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time ReleasedTPS TPSLimit +| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 3: RKOverview - RKLimitReason + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time Reason + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + + + Chart 4: Don't Process Transactions - RkSSListFetchTimeout (TpsLimit = 0) + + + index=$Index$ LogGroup=$LogGroup$ +Type="RkSSListFetchTimeout" +| timechart span=1s count + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 5: Don't Process Transactions - RkTlogMinFreeSpaceZero (TpsLimit = 0) + + + index=$Index$ LogGroup=$LogGroup$ +Type="RkTlogMinFreeSpaceZero" +| timechart span=1s count + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 6: Don't Process Transactions - ProxyGRVThresholdExceeded + + + 
index=$Index$ LogGroup=$LogGroup$ (Type="ProxyGRVThresholdExceeded*") AND TrackLatestType="Original" +| timechart span=1s count by Type + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 7: RKLimitReasonCandidate - LimitingStorageServerDurabilityLag (MVCCVersionInMemory) + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerDurabilityLag) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 8: RKLimitReasonCandidate - LimitingStorageServerVersionLag (TLogVer-SSVer) + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerVersionLag) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 9: RKLimitReasonCandidate - LimitingStorageServerQueue + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerQueue) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 10: Runtime Monitoring - StorageServer MVCCVersionInMemory (storage_server_durability_lag) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval NonDurableVersions=Version-DurableVersion +| timechart span=$ChartBinSizeToken$ limit=0 avg(NonDurableVersions) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 11: Runtime Monitoring - StorageServer LocalRate (higher MVCCVersionInMemory -> lower LocalRate) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" +| timechart limit=0 avg(LocalRate) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 12: Runtime Monitoring - StorageServer ReadsRejected (lower LocalRate -> higher probability of rejecting read)) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" +| timechart limit=0 avg(ReadsRejected) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 13: Runtime Monitoring - Version Lag between StorageServer and Tlog (storage_server_readable_behind) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval SSFallBehindVersions=VersionLag +| timechart span=$ChartBinSizeToken$ limit=0 avg(SSFallBehindVersions) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 14: Runtime Monitoring - StorageServerBytes (storage_server_write_queue_size) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| makemv delim=" " BytesInput | makemv delim=" " BytesDurable | makemv delim=" " BytesFetched | makemv delim=" " MutationBytes +| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesFetched=mvindex(BytesFetched, 2), MutationBytes=mvindex(MutationBytes, 2), BytesInMemoryQueue=BytesInput-BytesDurable +| timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 15: Runtime Monitoring - StorageServer KVStore Free Space Ratio (storage_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal +| 
timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 16: Runtime Monitoring - TLog Queue Free Space Ratio (log_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval QueueBytesFreeRatio=QueueDiskBytesFree/QueueDiskBytesTotal +| timechart span=$ChartBinSizeToken$ limit=0 avg(QueueBytesFreeRatio) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 17: Runtime Monitoring - TLog KVStore Free Space Ratio (log_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal +| timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 18: Runtime Monitoring - TLogBytes (log_server_write_queue) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| makemv delim=" " BytesInput +| makemv delim=" " BytesDurable +| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesInMemoryQueue=BytesInput-BytesDurable | timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 19: Runtime Monitoring - Proxy Throughput + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ limit=0 avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) avg(TxnStartBatch) avg(TxnStartErrors) avg(TxnCommitIn) avg(TxnCommitVersionAssigned) avg(TxnCommitResolving) avg(TxnCommitResolved) avg(TxnCommitOut) avg(TxnCommitOutSuccess) avg(TxnCommitErrors) avg(TxnThrottled) avg(TxnConflicts) avg(CommitBatchIn) avg(CommitBatchOut) avg(TxnRejectedForQueuedTooLong) avg(Mutations) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 20: Runtime Monitoring - Proxy Queue Length + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" | timechart span=$ChartBinSizeToken$ limit=0 avg(*QueueSize*) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 21: Runtime Monitoring - TLog UnpoppedVersion + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval UnpoppedVersion=PersistentDataDurableVersion-QueuePoppedVersion +| timechart span=$ChartBinSizeToken$ limit=0 avg(UnpoppedVersion) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 22: Runtime Monitoring - Storage Server Disk (AIODiskStall) + + + index=$Index$ LogGroup=$LogGroup$ Type="ProcessMetrics" +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As="StorageServer" + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ limit=0 avg(AIODiskStall) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 23: Runtime Monitoring - StorageServer Query Queue Length + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| makemv QueryQueue | eval QueryQueue=mvindex(QueryQueue, 1) | table _time QueryQueue Machine +| timechart span=$ChartBinSizeToken$ 
limit=0 avg(QueryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 24: Transaction Trace Stats - GRV Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (*ProxyServer.masterProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After) +| table Time Type ID Location Machine Roles +| append + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before) + | rename ID as ParentID + | table Time Type ParentID Location Machine Roles + | join ParentID + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID" + | rename ID as ParentID + | rename To as ID + | table ParentID ID] + | table Time Type ID Location Machine Roles] +| table Time Type ID Location Machine Roles +| sort 0 Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=TBegin +| bin bins=20 span=$StatsGRVSpanToken$ TimeSpan +| chart limit=0 count by TimeSpan $GRVByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 25: Transaction Trace Stats - GetValue Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + (storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After) +| table Machine Location Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Time Order +| stats list(*) by ID +| rename list(*) as * +| table Machine Location Time Roles ID Type +| eval count = mvcount(Location) +| search count>2 +| eval TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin +| table _time ID TimeSpan Machine Location Time +| bin bins=20 span=$StatsReadSpanToken$ TimeSpan +| chart limit=0 count by TimeSpan $GetValueByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 26: Transaction Trace Stats - Commit Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + Machine + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR *ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution) +| table Time Type ID Location Machine Roles +| sort 0 Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| 
rename list(*) as * +| eval Count=mvcount(Location) +| search Count>=2 +| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=T1 +| table _time TimeSpan Machine +| bin bins=20 span=$StatsCommitSpanToken$ TimeSpan +| chart limit=0 count by TimeSpan $CommitByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 27: Transaction Tracing - GRV Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (*ProxyServer.*ProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After) +| table Time Type ID Location Machine Roles +| append + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before) + | rename ID as ParentID + | table Time Type ParentID Location Machine Roles + | join ParentID + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID" + | rename ID as ParentID + | rename To as ID + | table ParentID ID] + | table Time Type ID Location Machine Roles] +| table Time Type ID Location Machine Roles +| eval Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location="MasterProxyServer.masterProxyServerCore.Broadcast", 2, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6) +| table Time Order Type ID Location Machine Roles +| sort 0 Order Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), TimeInQueue = T2-T1, TimeGetVersionFromProxies = if(mvcount==4, T3-T2, -0.0000001), TimeConfirmLivenessFromTLogs = if(mvcount==4, T4-T3, T3-T2), TimeSpan=if(mvcount==4,T4-T1,T3-T1), _time=T1 +| table _time TimeSpan TimeInQueue TimeGetVersionFromProxies TimeConfirmLivenessFromTLogs Machine +| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeInQueue), avg(TimeGetVersionFromProxies), avg(TimeConfirmLivenessFromTLogs) $GRVLatencyByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 28: Transaction Tracing - GetValue Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After) +| table Machine Location Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Time Order +| stats list(*) by ID +| rename list(*) as * +| table Machine Location Time Roles ID Type +| eval count = mvcount(Location) +| search count>2 +| eval 
TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin +| table _time TimeSpan +| timechart span=30s limit=0 avg(TimeSpan) $GetValueLatencyByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 29: Transaction Tracing - Commit Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR *ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution) +| table Time Type ID Location Machine Roles +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16) +| table Time Order Type ID Location Machine Roles +| sort 0 Time Order +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval Count=mvcount(Location) +| search Count=7 +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), T5=mvindex(Time, 4), T6=mvindex(Time, 5), T7=mvindex(Time, 6), TimeSpan=T7-T1, TimeResolution=T4-T3, TimePostResolution=T5-T4, TimeProcessingMutation=T6-T5, TimeTLogPush=T7-T6, _time=T1 +| table _time TimeSpan TimeResolution TimePostResolution TimeProcessingMutation TimeTLogPush Machine +| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeResolution), avg(TimePostResolution), avg(TimeProcessingMutation), avg(TimeTLogPush) $CommitByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 30: Transaction Tracing - Commit - TLogPush and Resolver Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + Step + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (Resolver.resolveBatch.Before OR Resolver.resolveBatch.AfterQueueSizeCheck OR Resolver.resolveBatch.AfterOrderer OR Resolver.resolveBatch.After OR TLog.tLogCommit.BeforeWaitForVersion OR TLog.tLogCommit.Before OR TLog.tLogCommit.AfterTLogCommit OR TLog.tLogCommit.After) +| table Time Type ID Location Machine Roles +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location=="MasterProxyServer.batcher", 1, Location=="MasterProxyServer.commitBatch.Before", 2, Location=="MasterProxyServer.commitBatch.GettingCommitVersion", 3, Location=="MasterProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 
5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location=="MasterProxyServer.commitBatch.AfterResolution", 8.5, Location=="MasterProxyServer.commitBatch.ProcessingMutations", 9, Location=="MasterProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location=="MasterProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16) +| table Time Order Type ID Location Machine Roles +| sort 0 Time Order +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval Count=mvcount(Location), Step=case(Count=4 and (mvindex(Location, 0) like "TLog%"), "TimeTLogCommit", Count=4 and (mvindex(Location, 0) like "Resolver%"), "TimeResolver", Count=10, "TimeSpan"), BeginTime=mvindex(Time, 0), EndTime=mvindex(Time, -1), Duration=EndTime-BeginTime, _time=BeginTime +| search Count=4 +| eval Machinei=mvindex(Machine, 0), MachineStep = Step."-".Machinei +| table _time Step Duration Machinei Location Machine MachineStep +| timechart span=$ChartBinSizeToken$ limit=0 avg(Duration) by $TLogResolverByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 31: Machine Performance - CPU Utilization (CPU Time divided by Elapsed) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory Elapsed +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| eval Utilization=CPUSeconds/Elapsed +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 32: Machine Performance - Memory Utilization (ResidentMemory divided by Memory) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| eval Utilization = ResidentMemory/Memory +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 33: Machine Performance - Disk Utilization ((DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table 
Machine] +| eval Utilization = (DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 34: Machine Performance - Network (Mbps Received and Mbps Sent) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ avg(MbpsReceived) avg(MbpsSent) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 35: Machine Performance - Disk (Reads Count and Writes Count) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ avg(DiskReadsCount) avg(DiskWritesCount) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 36: Network Performance - Timeout + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($SourcePerfConnectionToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($DestinationPerfConnectionToken$)) + | dedup ID + | rename Machine as PeerAddr] +| eval Connection=Machine."-".PeerAddr +| timechart useother=0 span=$ChartBinSizeToken$ count $TimeoutByConnectionToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 37: Network Performance - PingLatency + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=PingLatency) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($SourcePerfConnectionToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($DestinationPerfConnectionToken$)) + | dedup ID + | rename Machine as PeerAddr] +| eval Connection=Machine."-".PeerAddr +| timechart useother=0 span=$ChartBinSizeToken$ avg(MeanLatency) avg(MaxLatency) $PingLatencyByConnectionToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + +
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/recovery.xml b/contrib/observability_splunk_dashboard/recovery.xml new file mode 100644 index 0000000000..6ba6b9a63b --- /dev/null +++ b/contrib/observability_splunk_dashboard/recovery.xml @@ -0,0 +1,873 @@ +
+ +
+ + + Table 1: Find long recovery (Input Index and LogGroup and Select a time span). + + + * + + + + + + + + + -0s + now + + + + + index=$IndexForOverview$ LogGroup=$LogGroupForOverview$ + ((Type="MasterRecoveryState" AND (Status="reading_coordinated_state" OR Status="fully_recovered" OR Status="accepting_commits")) OR (Type="Role" AND As="MasterServer" AND ("Transition"="Begin" OR "Transition"="End")) OR Type="MasterTerminated") AND (NOT TrackLatestType="Rolled") | eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID Machine Type Transition As Status DateTime Time ErrorDescription LogGroup +| search NOT ErrorDescription="Success" +| eval EventType=case(Transition="Begin" AND As="MasterServer" AND Type="Role", "MasterStart", Type="MasterRecoveryState" AND Status="fully_recovered", "FullRecovery", Type="MasterRecoveryState" AND Status="reading_coordinated_state", "StartRecoveryAttempt", Transition="End" AND As="MasterServer" AND Type="Role", "MasterTerminated", Type="MasterTerminated", "MasterTerminated", Type="MasterRecoveryState" AND Status="accepting_commits", "AcceptingCommits") +| table ID Machine EventType DateTime Time ErrorDescription LogGroup +| fillnull value="-" +| sort -Time +| eval ifMasterTerminatedEvent=if(EventType="MasterTerminated", 1, 0) +| stats list(*) by ID Machine ifMasterTerminatedEvent +| rename list(*) as * +| table ID Machine EventType DateTime Time ErrorDescription LogGroup +| sort -Time +| eval LastTime=mvindex(Time, 0), FirstTime=mvindex(Time, -1), Duration=LastTime-FirstTime +| table ID Machine Duration EventType DateTime Time ErrorDescription LogGroup + $time_token_for_recoveryhistorytable.earliest$ + $time_token_for_recoveryhistorytable.latest$ + + + + +
+
+
+ + + Table 2: Select timespan containing the long recovery and see all recovery attempts in the time span (The input Index and LogGroup and Timespan are for all following tables and charts) + + + * + + + + + + + + -0s@s + now + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="MasterRecoveryState" OR (Type="MasterTerminated") OR (Type="Role" AND As="MasterServer" AND "Transition"="End") OR Type="RecoveryInternal" OR Type="ProxyReplies" OR Type="CommitProxyReplies" OR Type="ResolverReplies" OR Type="MasterRecruitedInitialStorageServers") AND (NOT TrackLatestType="Rolled") +| rename ID as MasterID +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table MasterID Machine Status Step Type DateTime Time StatusCode MyRecoveryCount ErrorDescription Reason ErrorCode +| fillnull value="-" ErrorDescription Reason ErrorCode +| eval Status=case(Type=="MasterRecoveryState", Status, Type=="Role", "RoleEnd", Type=="MasterTerminated", "MasterTerminated", Type=="RecoveryInternal", Status."/".Step, Type=="ProxyReplies" OR Type=="CommitProxyReplies", "initializing_transaction_servers/ProxyReplies", Type="ResolverReplies", "initializing_transaction_servers/ResolverReplies", Type=="MasterRecruitedInitialStorageServers", "initializing_transaction_servers/MasterRecruitedInitialStorageServers"), StatusCode=case(Type=="ProxyReplies" OR Type=="CommitProxyReplies" OR Type=="ResolverReplies" OR Type=="MasterRecruitedInitialStorageServers", "8", Type!="ProxyReplies" AND Type!="CommitProxyReplies" AND Type!="ResolverReplies" AND Type!="MasterRecruitedInitialStorageServers", StatusCode) +| fillnull value="-" StatusCode +| sort 0 -Time -StatusCode +| stats list(*) by MasterID Machine +| rename list(*) as * +| eval FirstTime=mvindex(Time, -1), LastTime=mvindex(Time, 0), Duration=LastTime-FirstTime +| table MasterID Machine MyRecoveryCount Duration ErrorDescription Reason ErrorCode StatusCode Status DateTime Time +| sort -MyRecoveryCount +| fillnull value="-" MyRecoveryCount + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+
+ + + Table 3: Why recovery is triggered? Using WaitFailureClient event. Machine A detects Machine B's failure. First column is the time when WaitFailureClient happens. Columns of 2,3,4,5 are for A. Columns of 6,7 are for B. + + + index=$Index$ LogGroup=$LogGroup$ + Type="WaitFailureClient" +| table Type Time Machine FailedEndpoint +| replace *:tls with * in FailedEndpoint +| join Machine type=left + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND Transition="End" + | eval EndTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | rename As as Role + | table ID EndTime Machine Role] +| join FailedEndpoint type=left + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" + | stats latest(*) by ID | rename latest(*) as * + | rename Machine as FailedEndpoint + | eval FailedEndpointLatestRoleEventInfo=As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(*) by FailedEndpoint + | rename list(*) as * + | table FailedEndpoint FailedEndpointLatestRoleEventInfo] +| eval FailureDetectedTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| makemv delim=" " FailedEndpointLatestRoleEventInfo +| table FailureDetectedTime Machine ID Role EndTime FailedEndpoint FailedEndpointLatestRoleEventInfo + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 4: New Recruitment Configuration (using MasterRecoveredConfig event) + + + index=$Index$ LogGroup=$LogGroup$ + Type="MasterRecoveredConfig" AND TrackLatestType="Original" +| eval Configuration=replace(Conf, "&quot;", "\"") +| rename Configuration as _raw + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + Table 5: Data Centers (using ProcessMetrics event) + + + index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics +| dedup DCID +| rename DCID as DataCenterID +| table DataCenterID pie_work_unit +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+ + Table 6: New Role (using Role event joined by ProcessMetrics event) + + + index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ((As="ClusterController") OR (As="MasterServer") OR (As="TLog") OR (As="Resolver") OR (As="MasterProxyServer") OR (As="CommitProxyServer") OR (As="GrvProxyServer") OR (As="LogRouter")) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh")) +| eventstats count by ID +| rename As as Role +| search count=1 AND Transition="Begin" +| table ID Role Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Role Machine DataCenter +| fillnull value="null" DataCenter +| stats count by Role DataCenter + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 7: Role Details + + + MasterServer + TLog + Resolver + MasterProxyServer (for <7.0) + LogRouter + CommitProxyServer (for 7.0+) + GrvProxyServer (for 7.0+) + As=" + " + OR + + + + Begin + End + Begin->End + count=1 AND Transition="Begin" + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($RolesToken$) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh")) +| eventstats count by ID +| rename As as Role +| search $RoleDetailTableWhichRoleToken$ +| table ID Role Machine Time +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Role Machine DataCenter Time +| fillnull value="null" DataCenter +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID Role Machine DataCenter DateTime +| sort 0 -DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 8: CC Recruitment SevWarn OR SevError (use events in clusterRecruitFromConfiguration and clusterRecruitRemoteFromConfiguration) + + + index=$Index$ LogGroup=$LogGroup$ + Type="RecruitFromConfigurationNotAvailable" OR Type="RecruitFromConfigurationRetry" OR Type="RecruitFromConfigurationError" OR Type="RecruitRemoteFromConfigurationNotAvailable" OR Type="RecruitRemoteFromConfigurationRetry" OR Type="RecruitRemoteFromConfigurationError" + | eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)"), GoodRecruitmentTimeReady=case(Type=="RecruitFromConfigurationNotAvailable" OR Type=="RecruitRemoteFromConfigurationNotAvailable", "True", Type=="RecruitFromConfigurationRetry" OR Type=="RecruitRemoteFromConfigurationRetry", GoodRecruitmentTimeReady, Type=="RecruitFromConfigurationError" OR Type=="RecruitRemoteFromConfigurationError", "-") + | table Type GoodRecruitmentTimeReady Time DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + +
+
+
+ + + Table 9: RecoveryCount of the selected TLog (in Table 11) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogStart") OR (LogId=$row.TLogID$ AND Type="TLogPersistentStateRestore") +| eval ID=if(Type="TLogStart", ID, LogId), DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID RecoveryCount Type DateTime | fillnull value="Not found. The fdb version is somewhat old." + -7d@h + now + + + +
+
+ + Table 10: Which roles the selected TLog (in Table 11) talks to + + + index=$Index$ LogGroup=$LogGroup$ + ((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled") +| sort -Time +| eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." ".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."Null") +| stats list(*) by TLogID +| rename list(*) As * +| table TLogID TLogEvents +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| table TLogID TLogEvents +| mvexpand TLogEvents +| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), MasterID=mvindex(temp,2) +| fields - temp - TLogEvents +| sort 0 -Time +| search NOT MasterID="NULL" +| dedup MasterID +| rename MasterID as ID +| join type=left ID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role") + | sort 0 -Time + | dedup ID + | table ID Machine As] +| table ID Machine As | fillnull value="null" Machine As + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 11: TLog Events (Collecting all TLogs that produce interesting events during the time span) + + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR + ((Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2")) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled") AND $SeeLogEventDetailTableToken$ +| sort -Time +| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."null", (Type="TLogReady"), Time." ".Type." "."null", (Type="TLogStart"), Time." ".Type." "."null", (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."null") +| stats list(TLogEvents) by TLogID +| rename list(TLogEvents) As TLogEvents +| eval EarliestEvent=mvindex(TLogEvents, -1) , LatestEvent=mvindex(TLogEvents, 0) +| table TLogID TLogEvents EarliestEvent LatestEvent +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND As="TLog") + | sort 0 -Time + | dedup ID + | rename ID as TLogID + | table TLogID host LogGroup Machine] +| table TLogID Machine LogGroup host EarliestEvent LatestEvent +| fillnull value="null" Machine host LogGroup +| eval temp=split(LatestEvent," "), LatestTime=mvindex(temp,0), LatestEvent=mvindex(temp,1), temp2=split(EarliestEvent," "), EarliestTime=mvindex(temp2,0), EarliestEvent=mvindex(temp2,1), Duration=LatestTime-EarliestTime +| table TLogID Machine EarliestTime Duration LogGroup host +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| fillnull value="null" DataCenter +| table TLogID Machine DataCenter EarliestTime Duration host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + ((Type="TLogRejoining") OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow")) OR ((Type="TLogLockStarted" OR Type="TLogLocked")) OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh")) AND (NOT TrackLatestType="Rolled") + | sort -Time + | eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." ".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." 
"."Null") + | stats list(*) by TLogID + | rename list(*) As * + | table TLogID TLogEvents + | eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) + | search ignore=0 + | sort TLogID + | table TLogID TLogEvents + | mvexpand TLogEvents + | eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), RoleID=mvindex(temp,2) + | fields - temp - TLogEvents + | sort 0 -Time + | search NOT RoleID="NULL" + | table TLogID RoleID MasterMachine + | stats list(*) by TLogID + | rename list(*) as * + | streamstats count + | mvexpand RoleID + | dedup count RoleID + | fields - count + | stats count by TLogID + | rename count as Roles + | table TLogID Roles] +| table TLogID Machine DataCenter Roles EarliestTime Duration host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR + ((Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled")) + | sort -Time + | eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=if(Type="Role", Type.Transition, Type) + | sort 0 TLogEvents + | stats list(TLogEvents) by TLogID + | rename list(TLogEvents) As TLogEvents + | table TLogID TLogEvents + | eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) + | search ignore=0 + | mvcombine delim=" " TLogEvents + | table TLogID TLogEvents] +| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestTime host LogGroup +| eval EarliestDateTime=strftime(EarliestTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TLogStart") OR (Type="TLogPersistentStateRestore") + | eval TLogID=if(Type="TLogStart", ID, LogId) + | table TLogID RecoveryCount] +| table TLogID RecoveryCount Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup +| fillnull value="TLog too old, click and see details" RecoveryCount + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + $click.value$ + +
+
+ + Table 12: Event Details (Including rejoining events) of the selected TLog (in Table 11) + + + index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover" AND LogId=$row.TLogID$) OR (Type="TLogReady" AND ID=$row.TLogID$) OR (Type="TLogStart" AND ID=$row.TLogID$) OR + ((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled") +| sort -Time +| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."-"." "."-", (Type="TLogReady"), Time." ".Type." "."-"." "."-", (Type="TLogStart"), Time." ".Type." "."-"." "."-", (Type="TLogRejoining"), Time." ".Type." ".Master." "."-", (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."-", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."-"." "."-", (Type="Role" AND As="TLog" AND Transition="Begin" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." ".Origination, (Type="Role" AND As="TLog" AND Transition="End" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." "."-") +| stats list(*) by TLogID +| rename list(*) As * +| table TLogID TLogEvents +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ (Type="Role" AND As="TLog" AND ID=$row.TLogID$) + | dedup ID + | rename ID as TLogID + | table TLogID Machine] +| table TLogID Machine TLogEvents +| fillnull value="-" Machine +| mvexpand TLogEvents +| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), ToID=mvindex(temp,2), Origination= mvindex(temp,3) +| fields - temp - TLogEvents +| join type=left + [ search index=$Index$ LogGroup=$LogGroup$ (Type="Role") + | dedup ID + | rename ID as ToID + | rename As as ToRole + | rename Machine as ToMachine + | table ToID ToRole ToMachine] +| sort 0 -Time +| fillnull value="-" ToRole ToMachine +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TLogID Machine Event DateTime ToID ToRole ToMachine Time DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+
+ + + Table 13: All Tags of the selected TLog (in Table 11) that have been popped by SSes (using TLogPoppedTag event) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogPoppedTag") +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| rename ID as TLogID +| rename Tags as UnpoppedRecoveredTagCount +| rename Tag as TagPopped +| rename DurableKCVer as DurableKnownCommittedVersion +| search TagPopped!="-1:2" +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt +| sort 0 -UnpoppedRecoveredTagCount +| join TagPopped type=left + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") + | stats latest(*) by Machine + | rename latest(*) as * + | rename Tag as TagPopped + | table TagPopped ID Machine] +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| rename ID as SSID +| rename Machine as SSMachine +| rename DataCenter as SSDataCenter +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped SSID SSMachine SSDataCenter DurableKnownCommittedVersion RecoveredAt +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+ + Table 14: All Tags of the selected TLog (in Table 11) to be popped by SSes (using TLogReady event) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogReady") +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| rename ID as TLogID +| table TLogID Type AllTags Locality +| makemv delim="," AllTags +| mvexpand AllTags +| rename AllTags as Tag | sort 0 Tag +| join Tag type=left + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") + | stats latest(*) by Machine + | rename latest(*) as * + | table Tag ID Machine] +| table TLogID Tag ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| fillnull value="-" +| table TLogID Tag ID Machine DataCenter +| rename ID as SSID | rename Machine as SSMachine | rename DataCenter as SSDataCenter +| search Tag!="-1:2" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 15: The Tags of the selected TLog (in Table 11) that are not popped by SSes (using set diff tags in Table 13 and Table 14) (if result contains "...", the result of Table 15 is wrong) + + + | set diff + [ search index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogReady") + | table AllTags + | makemv delim="," AllTags + | mvexpand AllTags + | rename AllTags as Tag + | table Tag] + [ search index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogPoppedTag") + | table Tag] + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+ + Table 16: All Current Storage Servers (assume each machine has at most one SS) + + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") AND $TriggerSSTableToken$ +| stats latest(*) by Machine +| rename latest(*) as * +| table Tag ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Machine DataCenter Tag +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ((As="StorageServer")) AND (NOT TrackLatestType="Rolled")) + | stats latest(*) by Machine + | rename latest(*) as * + | rename As as Role + | table ID Role Machine + | join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] + | table ID Role Machine DataCenter + | fillnull value="null" DataCenter] +| sort 0 DataCenter +| table Tag ID Machine DataCenter | sort 0 Tag + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Chart 1: Timeout/TimedOut event distribution grouped by source (Machine) + + + 5s + + + + TLog + MasterServer + MasterProxyServer (for version < 7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for ver 7+) + As=" + " + OR + + + + TLog + MasterServer + MasterProxyServer (for version <7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for version 7+) + As=" + " + OR + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | dedup ID + | rename Machine as PeerAddr] +| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by Machine + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Chart 2: Timeout/TimedOut event distribution grouped by destination (PeerAddr) + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | dedup ID + | rename Machine as PeerAddr] +| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by PeerAddr + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Table 17: Check Type=ConnectionTimedOut OR Type=ConnectionTimeout events between transaction roles in the recovery (including the role that refresh/begin/end in the timespan) + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| stats count as TotalTimeouts by Machine PeerAddr +| table Machine PeerAddr TotalTimeouts +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | stats latest(*) by ID + | rename latest(*) as * + | eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(Role) AS MachineRoleLatestEvent BY Machine + ] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | stats latest(*) by ID + | rename latest(*) as * + | eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(Role) AS PeerRoleLatestEvent BY Machine + | rename Machine AS PeerAddr + ] +| table Machine PeerAddr TotalTimeouts MachineRoleLatestEvent PeerRoleLatestEvent + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 18: Proxy 0 + + + index=$Index$ LogGroup=$LogGroup$ + (Type="ProxyReplies" OR Type="CommitProxyReplies") AND FirstProxy="True" +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table WorkerID LogGroup FirstProxy Time DateTime +| sort 0 -Time +| join type=left WorkerID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND As="Worker" AND Transition="Refresh" + | dedup ID + | rename ID as WorkerID + | stats list(*) by WorkerID + | rename list(*) as * + | table WorkerID Machine Roles] +| table WorkerID Machine Roles LogGroup FirstProxy Time DateTime +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND (As="MasterProxyServer" OR As="CommitProxyServer") AND Transition="Refresh" + | dedup ID + | rename ID as ProxyID + | table Machine ProxyID] +| table ProxyID Machine LogGroup FirstProxy + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 19: Latest Role Events on the input Machine (Input Machine, like 172.27.113.121:4500) + + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND Machine=$SearchMachineToken$ +| stats latest(*) by ID Transition +| rename latest(*) as * +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table DateTime Machine ID Transition As Roles LogGroup Error ErrorDescription Reason +| sort 0 -DateTime +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Chart 3: severity>=20 event distribution (including roles that refresh/begin/end in the timespan) + + + * + + + + TLog + MasterServer + MasterProxyServer (for version <7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for version 7+) + As=" + " + OR + + + + EventType + Machine + Severity + Type + + + + 5s + + + + index=$Index$ LogGroup=$LogGroup$ + Severity>10 AND $BadEvents$ +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND ($BadEventRoleToken$) + | dedup ID | table Machine] +| table Machine Type Severity _time +| timechart useother=0 span=$BadEventChartTimeSpanToken$ count by $BadEventChartBy$ + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Table 20: Check severity>20 events of roles in the recovery (including the role that refresh/begin/end in the timespan) + + + index=$Index$ LogGroup=$LogGroup$ + Severity>10 +| stats count by Machine Type +| rename count as Count +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND ($BadEventRoleToken$) + | dedup ID + | eval Role=As."-".ID + | stats list(Role) by Machine + | rename list(Role) as Roles + | table Machine Roles] +| table Type Count Roles Machine +| sort -Count + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/transaction_latency.xml b/contrib/observability_splunk_dashboard/transaction_latency.xml new file mode 100644 index 0000000000..99b551f2c9 --- /dev/null +++ b/contrib/observability_splunk_dashboard/transaction_latency.xml @@ -0,0 +1,247 @@ +
+ + Design for ClusterController-issued transactions.
+ + + + + + + * + + + + * + + + + + @d + now + + +
+ + + All Transactions (Currently, this table also does not cover getrange operation and the operation which not do commit). + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ ID=$transactionID$ + (Type="TransactionAttachID" OR Type="GetValueAttachID" OR Type="CommitAttachID") +| eval To=case(Type=="TransactionAttachID", "0"."-".To, Type="GetValueAttachID", "1"."-".To, Type=="CommitAttachID", "2"."-".To) +| stats list(To) by ID +| rename list(To) as ToList +| table ID ToList +| eval Count = mvcount(ToList) +| search Count=3 +| eval To0=mvindex(ToList,0), To1=mvindex(ToList,1), To2=mvindex(ToList,2), To0=split(To0,"-"), To1=split(To1,"-"), To2=split(To2,"-"), GrvID=case(mvindex(To0, 0)=="0", mvindex(To0, 1), mvindex(To1, 0)=="0", mvindex(To1, 1), mvindex(To2, 0)=="0", mvindex(To2, 1)), ReadID=case(mvindex(To0, 0)=="1", mvindex(To0, 1), mvindex(To1, 0)=="1", mvindex(To1, 1), mvindex(To2, 0)=="1", mvindex(To2, 1)), CommitID=case(mvindex(To0, 0)=="2", mvindex(To0, 1), mvindex(To1, 0)=="2", mvindex(To1, 1), mvindex(To2, 0)=="2", mvindex(To2, 1)) +| table ID GrvID ReadID CommitID +| join GrvID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.Before") + | rename ID as GrvID + | rename Time as BeginTime + | table GrvID BeginTime + ] +| join GrvID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.After") + | rename ID as GrvID + | rename Time as GRVDoneTime + | table GrvID GRVDoneTime + ] +| join ReadID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="GetValueDebug" AND Location="NativeAPI.getValue.After") + | rename ID as ReadID + | rename Time as ReadDoneTime + | table ReadID ReadDoneTime + ] +| join CommitID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="CommitDebug" AND Location="NativeAPI.commit.After") + | rename ID as CommitID + | rename Time as CommitDoneTime + | table CommitID CommitDoneTime + ] +| rename ID as TransactionID +| eval BeginToGRVDone = GRVDoneTime-BeginTime, GRVDoneToReadDone = ReadDoneTime-GRVDoneTime, ReadDoneToCommitDone = CommitDoneTime-ReadDoneTime, Duration=CommitDoneTime-BeginTime, BeginTimeScope=BeginTime-1, EndTimeScope=CommitDoneTime+1, BeginDateTime=strftime(BeginTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TransactionID Duration BeginDateTime BeginToGRVDone GRVDoneToReadDone ReadDoneToCommitDone Duration GrvID ReadID CommitID BeginTimeScope EndTimeScope | sort -Duration + $time_token.earliest$ + $time_token.latest$ + + + + $row.BeginTimeScope$ + $row.EndTimeScope$ + $row.ReadID$ + $row.GrvID$ + $row.CommitID$ + +
+
+
+ + + Step1: GRV + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (NOT MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion) +AND (ID=$GrvID$ OR ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="TransactionAttachID" AND ID=$GrvID$ + | return $To]) +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time - MinTime, Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location=="MasterProxyServer.masterProxyServerCore.Broadcast", 2, Location=="GrvProxyServer.transactionStarter.AskLiveCommittedVersionFromMaster", 2.1, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location=="MasterServer.serveLiveCommittedVersion.GetRawCommittedVersion", 4, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6) +| table Time Delta Order Type ID Location Machine Roles +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+ + Step1: (Only for FDB v6.3): GRV --- Get Committed Version (MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion Events) + + only for FDB 6.3 + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND Location="MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion" + AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="TransactionAttachID" AND ID=$GrvID$ + | return $To] +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time - MinTime +| sort 0 -Time +| table Machine Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+
+ + + Step2: GetValue + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ Type="GetValueDebug" AND ID=$ReadID$ +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| table Machine Location Delta Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $time_token.earliest$ + $time_token.latest$ + + +
+
+
+ + + Step3: Commit + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (ID=$CommitID$ OR ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To]) + +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| table Machine Location Delta Time Roles ID Type +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLogServer.tLogCommit.BeforeWaitForVersion", 11, Location=="TLogServer.tLogCommit.Before", 12, Location=="TLogServer.tLogCommit.AfterTLogCommit", 13, Location=="TLogServer.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16) +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+
+ + + Step3: Commit --- Resolver + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + (Location="Resolver*") +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To] + | rename To as ID + | table ID] +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| eval Order=case(Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8) +| sort 0 Time Order +| stats list(*) by Type ID Machine Roles +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration +| table Machine Roles Duration Location Delta Time +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table Machine DataCenter Roles Duration Location Delta Time + $time_token.earliest$ + $time_token.latest$ + + +
+
+
+ + + Step3: Commit --- Commit to TLogs (CommitDebug Events), grouped by Machine and sorted by Duration + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + (Location="TLog*") +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To] + | rename To as ID + | table ID] +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| sort 0 Time +| stats list(*) by Type ID Machine Roles +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration +| table Machine Roles Duration Location Delta Time + $BeginTime$ + $EndTime$ + + + +
+
+
+
\ No newline at end of file diff --git a/design/backup-dataFormat.md b/design/backup-dataFormat.md index 73942e41ef..f6f9a0338c 100644 --- a/design/backup-dataFormat.md +++ b/design/backup-dataFormat.md @@ -54,7 +54,7 @@ NOTE: All blocks except for the final block will have one last value which will The code related to how a range file is written is in the `struct RangeFileWriter` in `namespace fileBackup`. -The code that decodes a range block is in `ACTOR Future>> decodeRangeFileBlock(Reference file, int64_t offset, int len)`. +The code that decodes a range block is in `ACTOR Future>> decodeRangeFileBlock(Reference file, int64_t offset, int len, Database cx)`. ### Data format in a log file diff --git a/design/dynamic-knobs.md b/design/dynamic-knobs.md new file mode 100644 index 0000000000..00fe39e725 --- /dev/null +++ b/design/dynamic-knobs.md @@ -0,0 +1,420 @@ +# Dynamic Knobs + +This document is largely adapted from original design documents by Markus +Pilman and Trevor Clinkenbeard. + +## Background + +FoundationDB parameters control the behavior of the database, including whether +certain features are available and the value of internal constants. Parameters +will be referred to as knobs for the remainder of this document. Currently, +these knobs are configured through arguments passed to `fdbserver` processes, +often controlled by `fdbmonitor`. This has a number of problems: + +1. Updating knobs involves updating `foundationdb.conf` files on each host in a + cluster. This has a lot of overhead and typically requires external tooling + for large scale changes. +2. All knob changes require a process restart. +3. We can't easily track the history of knob changes. + +## Overview + +The dynamic knobs project creates a strictly serializable quorum-based +configuration database stored on the coordinators. Each `fdbserver` process +specifies a configuration path and applies knob overrides from the +configuration database for its specified classes. + +### Caveats + +The configuration database explicitly does not support the following: + +1. A high load. The update rate, while not specified, should be relatively low. +2. A large amount of data. The database is meant to be relatively small (under + one megabyte). Data is not sharded and every coordinator stores a complete + copy. +3. Concurrent writes. At most one write can succeed at a time, and clients must + retry their failed writes. + +## Design + +### Configuration Path + +Each `fdbserver` process can now include a `--config_path` argument specifying +its configuration path. A configuration path is a hierarchical list of +configuration classes specifying which knob overrides the `fdbserver` process +should apply from the configuration database. For example: + +```bash +$ fdbserver --config_path classA/classB/classC ... +``` + +Knob overrides follow descending priority: + +1. Manually specified command line knobs. +2. Individual configuration class overrides. + * Subdirectories override parent directories. For example, if the + configuration path is `az-1/storage/gp3`, the `gp3` configuration takes + priority over the `storage` configuration, which takes priority over the + `az-1` configuration. +3. Global configuration knobs. +4. Default knob values. 
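To make the precedence rules above concrete, here is a minimal Python sketch of how a single knob could be resolved for one process. This is an editorial illustration, not FoundationDB code: the function, its argument layout, and the compiled defaults in the driver are hypothetical; only the ordering of the four priority levels is taken from the list above.

```python
def resolve_knob(knob, config_path, manual_knobs, db_overrides, defaults):
    """Resolve the effective value of `knob` for one fdbserver process.

    manual_knobs -- knobs passed on the command line (priority 1)
    db_overrides -- {config_class_or_None: {knob: value}} from the
                    configuration database; the None key holds global overrides
    config_path  -- configuration classes, e.g. ["az-1", "storage", "gp3"]
    defaults     -- compiled-in default values (priority 4)
    """
    # 1. Manually specified command line knobs win outright.
    if knob in manual_knobs:
        return manual_knobs[knob]
    # 2. Configuration class overrides; later (more specific) path elements
    #    override earlier (more general) ones.
    for config_class in reversed(config_path):
        value = db_overrides.get(config_class, {}).get(knob)
        if value is not None:
            return value
    # 3. Global configuration knobs.
    value = db_overrides.get(None, {}).get(knob)
    if value is not None:
        return value
    # 4. Default knob values.
    return defaults[knob]


# Mirrors the example in the next section: config path az-1/storage/gp3 and
# --knob_disable_asserts false on the command line. The default values below
# are made up for illustration.
db_overrides = {
    "az-2": {"page_cache_4k": "8e9"},
    "storage": {"min_trace_severity": "20", "compaction_interval": "350"},
    "az-1": {"compaction_interval": "280", "disable_asserts": "true"},
    "gp3": {"max_metric_size": "1000"},
    None: {"max_metric_size": "5000"},
}
manual_knobs = {"disable_asserts": "false"}
defaults = {"page_cache_4k": "2e9", "min_trace_severity": "10",
            "compaction_interval": "300", "disable_asserts": "false",
            "max_metric_size": "0"}

path = ["az-1", "storage", "gp3"]
for k in sorted(defaults):
    print(k, "=", resolve_knob(k, path, manual_knobs, db_overrides, defaults))
```

Running the sketch reproduces the resolution shown in the example that follows: `disable_asserts` comes from the command line, `compaction_interval` from the `storage` class rather than `az-1`, `max_metric_size` from `gp3` rather than the global override, and `page_cache_4k` falls back to its compiled default.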
+ +#### Example + +For example, imagine an `fdbserver` process run as follows: + +```bash +$ fdbserver --datadir /mnt/fdb/storage/4500 --logdir /var/log/foundationdb --public_address auto:4500 --config_path az-1/storage/gp3 --knob_disable_asserts false +``` + +And the configuration database contains: + +| ConfigClass | KnobName | KnobValue | +|-------------|---------------------|-----------| +| az-2 | page_cache_4k | 8e9 | +| storage | min_trace_severity | 20 | +| az-1 | compaction_interval | 280 | +| storage | compaction_interval | 350 | +| az-1 | disable_asserts | true | +| \ | max_metric_size | 5000 | +| gp3 | max_metric_size | 1000 | + +The final configuration for the process will be: + +| KnobName | KnobValue | Explanation | +|---------------------|-------------|-------------| +| page_cache_4k | \ | The configuration database knob override for `az-2` is ignored, so the compiled default is used | +| min_trace_severity | 20 | Because the `storage` configuration class is part of the process’s configuration path, the corresponding knob override is applied from the configuration database | +| compaction_interval | 350 | The `storage` knob override takes precedence over the `az-1` knob override | +| disable_asserts | false | This knob is manually overridden, so all other overrides are ignored | +| max_metric_size | 1000 | Knob overrides for specific configuration classes take precedence over global knob overrides, so the global override is ignored | + +### Clients + +Clients can write to the configuration database using transactions. +Configuration database transactions are differentiated from regular +transactions through specification of the `USE_CONFIG_DATABASE` database +option. + +In configuration transactions, the client uses the tuple layer to interact with +the configuration database. Keys are tuples of size two, where the first item +is the configuration class being written, and the second item is the knob name. +The value should be specified as a string. It will be converted to the +appropriate type based on the declared type of the knob being set. + +Below is a sample Python script to write to the configuration database. + +```python +import fdb + +fdb.api_version(720) + +@fdb.transactional +def set_knob(tr, knob_name, knob_value, config_class, description): + tr['\xff\xff/description'] = description + tr[fdb.tuple.pack((config_class, knob_name,))] = knob_value + +# This function performs two knob changes transactionally. +@fdb.transactional +def set_multiple_knobs(tr): + tr['\xff\xff/description'] = 'description' + tr[fdb.tuple.pack((None, 'min_trace_severity',))] = '10' + tr[fdb.tuple.pack(('az-1', 'min_trace_severity',))] = '20' + +db = fdb.open() +db.options.set_use_config_database() + +set_knob(db, 'min_trace_severity', '10', None, 'description') +set_knob(db, 'min_trace_severity', '20', 'az-1', 'description') +``` + +### Disable the Configuration Database + +The configuration database includes both client and server changes and is +enabled by default. Thus, to disable the configuration database, changes must +be made to both. + +#### Server + +The configuration database can be disabled by specifying the ``fdbserver`` +command line option ``--no-config-db``. Note that this option must be specified +for *every* ``fdbserver`` process. + +#### Client + +The only client change from the configuration database is as part of the change +coordinators command. 
The change coordinators command is not considered +successful until the configuration database is readable on the new +coordinators. This will cause the change coordinators command to hang if run +against a database with dynamic knobs disabled. To disable the client side +configuration database liveness check, specify the ``--no-config-db`` flag when +changing coordinators. For example: + +``` +fdbcli> coordinators auto --no-config-db +``` + +## Status + +The current state of the configuration database is output as part of `status +json`. The configuration path for each process can be determined from the +``command_line`` key associated with each process. + +Sample from ``status json``: + +``` +"configuration_database" : { + "commits" : [ + { + "description" : "set some knobs", + "timestamp" : 1659570000, + "version" : 1 + }, + { + "description" : "make some other changes", + "timestamp" : 1659570000, + "version" : 2 + } + ], + "last_compacted_version" : 0, + "most_recent_version" : 2, + "mutations" : [ + { + "config_class" : "", + "knob_name" : "min_trace_severity", + "knob_value" : "int:5", + "type" : "set", + "version" : 1 + }, + { + "config_class" : "", + "knob_name" : "compaction_interval", + "knob_value" : "double:30.000000", + "type" : "set", + "version" : 1 + }, + { + "config_class" : "az-1", + "knob_name" : "compaction_interval", + "knob_value" : "double:60.000000", + "type" : "set", + "version" : 1 + }, + { + "config_class" : "", + "knob_name" : "compaction_interval", + "type" : "clear", + "version" : 2 + }, + { + "config_class" : "", + "knob_name" : "update_node_timeout", + "knob_value" : "double:4.000000", + "type" : "set", + "version" : 2 + } + ], + "snapshot" : { + "" : { + "min_trace_severity" : "int:5", + "update_node_timeout" : "double:4.000000" + }, + "az-1" : { + "compaction_interval" : "double:60.000000" + } + } +} +``` + +After compaction, ``status json`` would show: + +``` +"configuration_database" : { + "commits" : [ + ], + "last_compacted_version" : 2, + "most_recent_version" : 2, + "mutations" : [ + ], + "snapshot" : { + "" : { + "min_trace_severity" : "int:5", + "update_node_timeout" : "double:4.000000" + }, + "az-1" : { + "compaction_interval" : "double:60.000000" + } + } +} +``` + +## Detailed Implementation + +The configuration database is implemented as a replicated state machine living +on the coordinators. This allows configuration database transactions to +continue to function in the event of a catastrophic loss of the transaction +subsystem. + +To commit a transaction, clients run the two phase Paxos protocol. First, the +client asks for a live version from a quorum of coordinators. When a +coordinator receives a request for its live version, it increments its local +live version by one and returns it to the client. Then, the client submits its +writes at the live version it received in the previous step. A coordinator will +accept the commit if it is still on the same live version. If a majority of +coordinators accept the commit, it is considered committed. + +### Coordinator + +Each coordinator runs a ``ConfigNode`` which serves as a replica storing one +full copy of the configuration database. Coordinators never communicate with +other coordinators while processing configuration database transactions. +Instead, the client runs the transaction and determines when it has quorum +agreement. + +Coordinators serve the following ``ConfigTransactionInterface`` to allow +clients to read from and write to the configuration database. 
+ +#### ``ConfigTransactionInterface`` +| Request | Request fields | Reply fields | Explanation | +|------------------|----------------------------------------------------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------| +| GetGeneration | (coordinatorsHash) | (generation) or (coordinators_changed error) | Get a new read version. This read version is used for all future requests in the transaction | +| Get | (configuration class, knob name, coordinatorsHash, generation) | (knob value or empty) or (coordinators_changed error) or (transaction_too_old error) | Returns the current value stored at the specified configuration class and knob name, or empty if no value exists | +| GetConfigClasses | (coordinatorsHash, generation) | (configuration classes) or (coordinators_changed error) or (transaction_too_old error) | Returns a list of all configuration classes stored in the configuration database | +| GetKnobs | (configuration class, coordinatorsHash, generation) | (knob names) or (coordinators_changed error) or (transaction_too_old error) | Returns a list of all knob names stored for the provided configuration class | +| Commit | (mutation list, coordinatorsHash, generation) | ack or (coordinators_changed error) or (commit_unknown_result error) or (not_committed error) | Commit mutations set by the transaction | + +Coordinators also serve the following ``ConfigFollowerInterface`` to provide +access to (and modification of) their current state. Most interaction through +this interface is done by the cluster controller through its +``IConfigConsumer`` implementation living on the ``ConfigBroadcaster``. + +#### ``ConfigFollowerInterface`` +| Request | Request fields | Reply fields | Explanation | +|-----------------------|----------------------------------------------------------------------|-----------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------| +| GetChanges | (lastSeenVersion, mostRecentVersion) | (mutation list, version) or (version_already_compacted error) or (process_behind error) | Request changes since the last seen version, receive a new most recent version, as well as recent mutations | +| GetSnapshotAndChanges | (mostRecentVersion) | (snapshot, snapshotVersion, changes) | Request the full configuration database, in the form of a base snapshot and changes to apply on top of the snapshot | +| Compact | (version) | ack | Compact mutations up to the provided version | +| Rollforward | (rollbackTo, lastKnownCommitted, target, changes, specialZeroQuorum) | ack or (version_already_compacted error) or (transaction_too_old error) | Rollback/rollforward mutations on a node to catch it up with the majority | +| GetCommittedVersion | () | (registered, lastCompacted, lastLive, lastCommitted) | Request version information from a ``ConfigNode`` | +| Lock | (coordinatorsHash) | ack | Lock a ``ConfigNode`` to prevent it from serving requests during a coordinator change | + +### Cluster Controller + +The cluster controller runs a singleton ``ConfigBroadcaster`` which is +responsible for periodically polling the ``ConfigNode``s for updates, then +broadcasting these updates to workers through the ``ConfigBroadcastInterface``. 
+When workers join the cluster, they register themselves and their +``ConfigBroadcastInterface`` with the broadcaster. The broadcaster then pushes +new updates to registered workers. + +The ``ConfigBroadcastInterface`` is also used by ``ConfigNode``s to register +with the ``ConfigBroadcaster``. ``ConfigNode``s need to register with the +broadcaster because the broadcaster decides when the ``ConfigNode`` may begin +serving requests, based on global information about status of other +``ConfigNode``s. For example, if a system with three ``ConfigNode``s suffers a +fault where one ``ConfigNode`` loses data, the faulty ``ConfigNode`` should +not be allowed to begin serving requests again until it has been rolled forward +and is up to date with the latest state of the configuration database. + +#### ``ConfigBroadcastInterface`` + +| Request | Request fields | Reply fields | Explanation | +|------------|------------------------------------------------------------|-------------------------------|---------------------------------------------------------------------------------------------| +| Snapshot | (snapshot, version, restartDelay) | ack | A snapshot of the configuration database sent by the broadcaster to workers | +| Changes | (changes, mostRecentVersion, restartDelay) | ack | A list of changes up to and including mostRecentVersion, sent by the broadcaster to workers | +| Registered | () | (registered, lastSeenVersion) | Sent by the broadcaster to new ``ConfigNode``s to determine their registration status | +| Ready | (snapshot, snapshotVersion, liveVersion, coordinatorsHash) | ack | Sent by the broadcaster to new ``ConfigNode``s to allow them to start serving requests | + +### Worker + +Each worker runs a ``LocalConfiguration`` instance which receives and applies +knob updates from the ``ConfigBroadcaster``. The local configuration maintains +a durable ``KeyValueStoreMemory`` containing the following: + +* The latest known configuration version +* The most recently used configuration path +* All knob overrides corresponding to the configuration path at the latest known version + +Once a worker starts, it will: + +* Apply manually set knobs +* Read its local configuration file + * If the stored configuration path does not match the configuration path + specified on the command line, delete the local configuration file + * Otherwise, apply knob updates from the local configuration file. Manually + specified knobs will not be overridden + * Register with the broadcaster to receive new updates for its configuration + classes + * Persist these updates when received and restart if necessary + +### Knob Atomicity + +All knobs are classified as either atomic or non-atomic. Atomic knobs require a +process restart when changed, while non-atomic knobs do not. + +### Compaction + +``ConfigNode``s store individual mutations in order to be able to update other, +out of date ``ConfigNode``s without needing to send a full snapshot. Each +configuration database commit also contains additional metadata such as a +timestamp and a text description of the changes being made. To keep the size of +the configuration database manageable, a compaction process runs periodically +(defaulting to every five minutes) which compacts individual mutations into a +simplified snapshot of key-value pairs. Compaction is controlled by the +``ConfigBroadcaster``, using information it peridiodically requests from +``ConfigNode``s. Compaction will only compact up to the minimum known version +across *all* ``ConfigNode``s. 
This means that if one ``ConfigNode`` is +permanently partitioned from the ``ConfigBroadcaster`` or from clients, no +compaction will ever take place. + +### Rollback / Rollforward + +It is necessary to be able to roll ``ConfigNode``s backward and forward with +respect to their committed versions due to the nature of quorum logic and +unreliable networks. + +Consider a case where a client commit gets persisted durably on one out of +three ``ConfigNode``s (assume commit messages to the other two nodes are lost). +Since the value is not committed on a majority of ``ConfigNode``s, it cannot be +considered committed. But it is also incorrect to have the value persist on one +out of three nodes as future commits are made. In this case, the most common +result is that the ``ConfigNode`` will be rolled back when the next commit from +a different client is made, and then rolled forward to contain the data from +the commit. ``PaxosConfigConsumer`` contains logic to recognize ``ConfigNode`` +minorities and update them to match the quorum. + +### Changing Coordinators + +Since the configuration database lives on the coordinators and the +[coordinators can be +changed](https://apple.github.io/foundationdb/configuration.html#configuration-changing-coordination-servers), +it is necessary to copy the configuration database from the old to the new +coordinators during such an event. A coordinator change performs the following +steps in regards to the configuration database: + +1. Write ``\xff/coordinatorsKey`` with the new coordinators string. The key + ``\xff/previousCoordinators`` contains the current (old) set of + coordinators. +2. Lock the old ``ConfigNode``s so they can no longer serve client requests. +3. Start a recovery, causing a new cluster controller (and therefore + ``ConfigBroadcaster``) to be selected. +4. Read ``\xff/previousCoordinators`` on the ``ConfigBroadcaster`` and, if + present, read an up-to-date snapshot of the configuration database on the + old coordinators. +5. Determine if each registering ``ConfigNode`` needs an up-to-date snapshot of + the configuration database sent to it, based on its reported version and the + snapshot version of the database received from the old coordinators. + * Some new coordinators which were also coordinators in the previous + configuration may not need a snapshot. +6. Send ready requests to new ``ConfigNode``s, including an up-to-date snapshot + if necessary. This allows the new coordinators to begin serving + configuration database requests from clients. + +## Testing + +The ``ConfigDatabaseUnitTests`` class unit test a number of different +configuration database dimensions. + +The ``ConfigIncrement`` workload tests contention between clients attempting to +write to the configuration database, paired with machine failure and +coordinator changes. diff --git a/design/global-tag-throttling.md b/design/global-tag-throttling.md index fd2a2d9390..ad7750a3cc 100644 --- a/design/global-tag-throttling.md +++ b/design/global-tag-throttling.md @@ -3,44 +3,48 @@ When the `GLOBAL_TAG_THROTTLING` knob is enabled, the ratekeeper will use the [transaction tagging feature](https://apple.github.io/foundationdb/transaction-tagging.html) to throttle tags according to the global tag throttling algorithm. This page describes the implementation of this algorithm. ### Tag Quotas -The global tag throttler bases throttling decisions on "quotas" provided by clients through the tag quota API. 
Each tag quota has four different components: +The global tag throttler bases throttling decisions on "quotas" provided by clients through the tag quota API. Each tag quota has two components: -* Reserved read quota -* Reserved write quota -* Total read quota -* Total write quota +* Reserved quota +* Total quota -The global tag throttler can not throttle tags to a throughput below the reserved quotas, and it cannot allow throughput to exceed the total quotas. +The global tag throttler cannot throttle tags to a throughput below the reserved quota, and it cannot allow throughput to exceed the total quota. ### Cost -The units for these quotas are computed as follows. The "cost" of a read operation is computed as: +Internally, the units for these quotas are "page costs", computed as follows. The "page cost" of a read operation is computed as: ``` -readCost = bytesRead / SERVER_KNOBS->READ_COST_BYTE_FACTOR + 1; +readCost = ceiling(bytesRead / CLIENT_KNOBS->READ_COST_BYTE_FACTOR); ``` -The "cost" of a write operation is computed as: +The "page cost" of a write operation is computed as: ``` -writeCost = bytesWritten / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR + 1; +writeCost = SERVER_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * ceiling(bytesWritten / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR); ``` Here `bytesWritten` includes cleared bytes. The size of range clears is estimated at commit time. ### Tuple Layer -Tag quotas are stored inside of the system keyspace (with prefix `\xff/tagQuota/`). They are stored using the tuple layer, in a tuple of form: `(reservedReadQuota, totalReadQuota, reservedWriteQuota, totalWriteQuota)`. There is currently no custom code in the bindings for manipulating these system keys. However, in any language for which bindings are available, it is possible to use the tuple layer to manipulate tag quotas. +Tag quotas are stored inside of the system keyspace (with prefix `\xff/tagQuota/`). They are stored using the tuple layer, in a tuple of form: `(reservedQuota, totalQuota)`. There is currently no custom code in the bindings for manipulating these system keys. However, in any language for which bindings are available, it is possible to use the tuple layer to manipulate tag quotas. ### fdbcli -The easiest way for an external client to interact with tag quotas is through `fdbcli`. To get the quota of a particular tag, run the following command: +The easiest way for an external client to interact with tag quotas is through `fdbcli`. To get the quota (in bytes/second) of a particular tag, run the following command: ``` -fdbcli> get [reserved|total] [read|write] +fdbcli> quota get [reserved_throughput|total_throughput] ``` To set the quota through `fdbcli`, run: ``` -fdbcli> set [reserved|total] [read|write] +fdbcli> quota set [reserved_throughput|total_throughput] +``` + +Note that the quotas are specified in terms of bytes/second, and internally converted to page costs: + +``` +page_cost_quota = ceiling(byte_quota / CLIENT_KNOBS->READ_COST_BYTE_FACTOR) ``` ### Limit Calculation @@ -125,20 +129,3 @@ In each test, the `GlobalTagThrottlerTesting::monitor` function is used to perio On the ratekeeper, every `SERVER_KNOBS->TAG_THROTTLE_PUSH_INTERVAL` seconds, the ratekeeper will call `GlobalTagThrottler::getClientRates`. At the end of the rate calculation for each tag, a trace event of type `GlobalTagThrottler_GotClientRate` is produced. This trace event reports the relevant inputs that went in to the rate calculation, and can be used for debugging. 
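To make the page-cost arithmetic concrete, the following minimal Python sketch mirrors the read cost, write cost, and bytes-to-pages quota conversion formulas given earlier on this page. The knob values used here are illustrative placeholders, not authoritative defaults; the real values come from the client and server knob definitions.

```
import math

# Placeholder knob values for illustration only.
READ_COST_BYTE_FACTOR = 16384
WRITE_COST_BYTE_FACTOR = 16384
GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO = 4.0

def read_page_cost(bytes_read: int) -> int:
    # readCost = ceiling(bytesRead / CLIENT_KNOBS->READ_COST_BYTE_FACTOR)
    return math.ceil(bytes_read / READ_COST_BYTE_FACTOR)

def write_page_cost(bytes_written: int) -> float:
    # writeCost = RW_FUNGIBILITY_RATIO * ceiling(bytesWritten / WRITE_COST_BYTE_FACTOR)
    return GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * math.ceil(
        bytes_written / WRITE_COST_BYTE_FACTOR)

def page_cost_quota(byte_quota: int) -> int:
    # How a bytes/second quota set through fdbcli is converted internally.
    return math.ceil(byte_quota / READ_COST_BYTE_FACTOR)

print(read_page_cost(100_000), write_page_cost(100_000), page_cost_quota(1_000_000))
```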
On storage servers, every `SERVER_KNOBS->TAG_MEASUREMENT_INTERVAL` seconds, there are `BusyReadTag` events for every tag that has sufficient read cost to be reported to the ratekeeper. Both cost and fractional busyness are reported. - -### Status -For each storage server, the busiest read tag is reported in the full status output, along with its cost and fractional busyness. - -At path `.cluster.qos.global_tag_throttler`, throttling limitations for each tag are reported: - -``` -{ - "": { - "desired_tps": , - "reserved_tps": , - "limiting_tps": [|"unset"], - "target_tps": - }, - ... -} -``` diff --git a/design/special-key-space.md b/design/special-key-space.md index 7cdcfe460d..be104915fe 100644 --- a/design/special-key-space.md +++ b/design/special-key-space.md @@ -32,10 +32,10 @@ public: explicit SKRExampleImpl(KeyRangeRef kr): SpecialKeyRangeReadImpl(kr) { // Our implementation is quite simple here, the key-value pairs are formatted as: // \xff\xff/example/ : - CountryToCapitalCity[LiteralStringRef("USA")] = LiteralStringRef("Washington, D.C."); - CountryToCapitalCity[LiteralStringRef("UK")] = LiteralStringRef("London"); - CountryToCapitalCity[LiteralStringRef("Japan")] = LiteralStringRef("Tokyo"); - CountryToCapitalCity[LiteralStringRef("China")] = LiteralStringRef("Beijing"); + CountryToCapitalCity["USA"_sr] = "Washington, D.C."_sr; + CountryToCapitalCity["UK"_sr] = "London"_sr; + CountryToCapitalCity["Japan"_sr] = "Tokyo"_sr; + CountryToCapitalCity["China"_sr] = "Beijing"_sr; } // Implement the getRange interface Future getRange(ReadYourWritesTransaction* ryw, @@ -58,7 +58,7 @@ private: }; // Instantiate the function object // In development, you should have a function object pointer in DatabaseContext(DatabaseContext.h) and initialize in DatabaseContext's constructor(NativeAPI.actor.cpp) -const KeyRangeRef exampleRange(LiteralStringRef("\xff\xff/example/"), LiteralStringRef("\xff\xff/example/\xff")); +const KeyRangeRef exampleRange("\xff\xff/example/"_sr, "\xff\xff/example/\xff"_sr); SKRExampleImpl exampleImpl(exampleRange); // Assuming the database handler is `cx`, register to special-key-space // In development, you should register all function objects in the constructor of DatabaseContext(NativeAPI.actor.cpp) @@ -67,16 +67,16 @@ cx->specialKeySpace->registerKeyRange(exampleRange, &exampleImpl); state ReadYourWritesTransaction tr(cx); // get Optional res1 = wait(tr.get("\xff\xff/example/Japan")); -ASSERT(res1.present() && res.getValue() == LiteralStringRef("Tokyo")); +ASSERT(res1.present() && res.getValue() == "Tokyo"_sr); // getRange // Note: for getRange(key1, key2), both key1 and key2 should prefixed with \xff\xff // something like getRange("normal_key", "\xff\xff/...") is not supported yet -RangeResult res2 = wait(tr.getRange(LiteralStringRef("\xff\xff/example/U"), LiteralStringRef("\xff\xff/example/U\xff"))); +RangeResult res2 = wait(tr.getRange("\xff\xff/example/U"_sr, "\xff\xff/example/U\xff"_sr)); // res2 should contain USA and UK ASSERT( res2.size() == 2 && - res2[0].value == LiteralStringRef("London") && - res2[1].value == LiteralStringRef("Washington, D.C.") + res2[0].value == "London"_sr && + res2[1].value == "Washington, D.C."_sr ); ``` diff --git a/documentation/sphinx/conf.py b/documentation/sphinx/conf.py index 031c7d6f67..04ed43d87b 100644 --- a/documentation/sphinx/conf.py +++ b/documentation/sphinx/conf.py @@ -69,7 +69,7 @@ release = root.find(".//{http://schemas.microsoft.com/developer/msbuild/2003}Ver # The language for content autogenerated by Sphinx. 
Refer to documentation # for a list of supported languages. -language = None +language = 'en' # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: @@ -185,7 +185,7 @@ html_show_copyright = True htmlhelp_basename = 'FoundationDB' # Disable permalinks -html_add_permalinks = "" +html_permalinks = False # -- Options for LaTeX output -------------------------------------------------- diff --git a/documentation/sphinx/extensions/rubydomain.py b/documentation/sphinx/extensions/rubydomain.py index 1e5fb0bce4..e6c849cc41 100755 --- a/documentation/sphinx/extensions/rubydomain.py +++ b/documentation/sphinx/extensions/rubydomain.py @@ -42,7 +42,7 @@ from docutils.parsers.rst import directives, Directive from sphinx import addnodes from sphinx.roles import XRefRole -from sphinx.locale import l_, _ +from sphinx.locale import _ from sphinx.domains import Domain, ObjType, Index from sphinx.directives import ObjectDescription from sphinx.util.nodes import make_refnode @@ -83,18 +83,18 @@ class RubyObject(ObjectDescription): } doc_field_types = [ - TypedField('parameter', label=l_('Parameters'), + TypedField('parameter', label=_('Parameters'), names=('param', 'parameter', 'arg', 'argument'), typerolename='obj', typenames=('paramtype', 'type')), - TypedField('variable', label=l_('Variables'), rolename='obj', + TypedField('variable', label=_('Variables'), rolename='obj', names=('var', 'ivar', 'cvar'), typerolename='obj', typenames=('vartype',)), - GroupedField('exceptions', label=l_('Raises'), rolename='exc', + GroupedField('exceptions', label=_('Raises'), rolename='exc', names=('raises', 'raise', 'exception', 'except'), can_collapse=True), - Field('returnvalue', label=l_('Returns'), has_arg=False, + Field('returnvalue', label=_('Returns'), has_arg=False, names=('returns', 'return')), - Field('returntype', label=l_('Return type'), has_arg=False, + Field('returntype', label=_('Return type'), has_arg=False, names=('rtype',)), ] @@ -493,8 +493,8 @@ class RubyModuleIndex(Index): """ name = 'modindex' - localname = l_('Ruby Module Index') - shortname = l_('modules') + localname = _('Ruby Module Index') + shortname = _('modules') def generate(self, docnames=None): content = {} @@ -561,17 +561,17 @@ class RubyDomain(Domain): name = 'rb' label = 'Ruby' object_types = { - 'function': ObjType(l_('function'), 'func', 'obj'), - 'global': ObjType(l_('global variable'), 'global', 'obj'), - 'method': ObjType(l_('method'), 'meth', 'obj'), - 'class': ObjType(l_('class'), 'class', 'obj'), - 'exception': ObjType(l_('exception'), 'exc', 'obj'), - 'classmethod': ObjType(l_('class method'), 'meth', 'obj'), - 'attr_reader': ObjType(l_('attribute'), 'attr', 'obj'), - 'attr_writer': ObjType(l_('attribute'), 'attr', 'obj'), - 'attr_accessor': ObjType(l_('attribute'), 'attr', 'obj'), - 'const': ObjType(l_('const'), 'const', 'obj'), - 'module': ObjType(l_('module'), 'mod', 'obj'), + 'function': ObjType(_('function'), 'func', 'obj'), + 'global': ObjType(_('global variable'), 'global', 'obj'), + 'method': ObjType(_('method'), 'meth', 'obj'), + 'class': ObjType(_('class'), 'class', 'obj'), + 'exception': ObjType(_('exception'), 'exc', 'obj'), + 'classmethod': ObjType(_('class method'), 'meth', 'obj'), + 'attr_reader': ObjType(_('attribute'), 'attr', 'obj'), + 'attr_writer': ObjType(_('attribute'), 'attr', 'obj'), + 'attr_accessor': ObjType(_('attribute'), 'attr', 'obj'), + 'const': ObjType(_('const'), 'const', 'obj'), + 'module': ObjType(_('module'), 'mod', 'obj'), } directives = 
{ diff --git a/documentation/sphinx/requirements.txt b/documentation/sphinx/requirements.txt index 06e23ea6d3..8e33b564f2 100644 --- a/documentation/sphinx/requirements.txt +++ b/documentation/sphinx/requirements.txt @@ -1,6 +1,6 @@ --index-url https://pypi.python.org/simple -setuptools>=20.10.0,<=57.4.0 -sphinx==1.5.6 -sphinx-bootstrap-theme==0.4.8 -docutils==0.16 -Jinja2==3.0.3 +setuptools==65.3.0 +sphinx==5.1.1 +sphinx-bootstrap-theme==0.8.1 +docutils==0.19 +Jinja2==3.1.2 diff --git a/documentation/sphinx/source/administration.rst b/documentation/sphinx/source/administration.rst index 424d9a6f2a..018ffd66c5 100644 --- a/documentation/sphinx/source/administration.rst +++ b/documentation/sphinx/source/administration.rst @@ -12,6 +12,7 @@ Administration configuration moving-a-cluster tls + authorization This document covers the administration of an existing FoundationDB cluster. We recommend you read this document before setting up a cluster for performance testing or production use. diff --git a/documentation/sphinx/source/api-c.rst b/documentation/sphinx/source/api-c.rst index 7e14d5c675..3ec3fca9bf 100644 --- a/documentation/sphinx/source/api-c.rst +++ b/documentation/sphinx/source/api-c.rst @@ -222,7 +222,7 @@ The FoundationDB client library performs most tasks on a singleton thread (which Future ====== -Most functions in the FoundationDB API are asynchronous, meaning that they may return to the caller before actually delivering their result. These functions always return :type:`FDBFuture*`. An :type:`FDBFuture` object represents a result value or error to be delivered at some future time. You can wait for a Future to be "ready" -- to have a value or error delivered -- by setting a callback function, or by blocking a thread, or by polling. Once a Future is ready, you can extract either an error code or a value of the appropriate type (the documentation for the original function will tell you which :func:`fdb_future_get_*()` function you should call). +Most functions in the FoundationDB API are asynchronous, meaning that they may return to the caller before actually delivering their result. These functions always return ``FDBFuture*``. An :type:`FDBFuture` object represents a result value or error to be delivered at some future time. You can wait for a Future to be "ready" -- to have a value or error delivered -- by setting a callback function, or by blocking a thread, or by polling. Once a Future is ready, you can extract either an error code or a value of the appropriate type (the documentation for the original function will tell you which ``fdb_future_get_()`` function you should call). To use the API in a synchronous way, you would typically do something like this for each asynchronous call:: @@ -282,7 +282,7 @@ See :ref:`developer-guide-programming-with-futures` for further (language-indepe .. type:: FDBCallback - A pointer to a function which takes :type:`FDBFuture*` and ``void*`` and returns ``void``. + A pointer to a function which takes ``FDBFuture*`` and ``void*`` and returns ``void``. .. function:: void fdb_future_release_memory(FDBFuture* future) @@ -298,13 +298,13 @@ See :ref:`developer-guide-programming-with-futures` for further (language-indepe .. function:: fdb_error_t fdb_future_get_int64(FDBFuture* future, int64_t* out) - Extracts a 64-bit integer from an :type:`FDBFuture*` into a caller-provided variable of type ``int64_t``. |future-warning| + Extracts a 64-bit integer from a pointer to :type:`FDBFuture` into a caller-provided variable of type ``int64_t``. 
|future-warning| |future-get-return1| |future-get-return2|. .. function:: fdb_error_t fdb_future_get_key_array( FDBFuture* f, FDBKey const** out_key_array, int* out_count) - Extracts an array of :type:`FDBKey` from an :type:`FDBFuture*` into a caller-provided variable of type ``FDBKey*``. The size of the array will also be extracted and passed back by a caller-provided variable of type ``int`` |future-warning| + Extracts an array of :type:`FDBKey` from an ``FDBFuture*`` into a caller-provided variable of type ``FDBKey*``. The size of the array will also be extracted and passed back by a caller-provided variable of type ``int`` |future-warning| |future-get-return1| |future-get-return2|. @@ -547,13 +547,13 @@ Applications must provide error handling and an appropriate retry loop around th .. function:: void fdb_transaction_set_read_version(FDBTransaction* transaction, int64_t version) - Sets the snapshot read version used by a transaction. This is not needed in simple cases. If the given version is too old, subsequent reads will fail with error_code_transaction_too_old; if it is too new, subsequent reads may be delayed indefinitely and/or fail with error_code_future_version. If any of :func:`fdb_transaction_get_*()` have been called on this transaction already, the result is undefined. + Sets the snapshot read version used by a transaction. This is not needed in simple cases. If the given version is too old, subsequent reads will fail with error_code_transaction_too_old; if it is too new, subsequent reads may be delayed indefinitely and/or fail with error_code_future_version. If any of ``fdb_transaction_get_*()`` have been called on this transaction already, the result is undefined. .. function:: FDBFuture* fdb_transaction_get_read_version(FDBTransaction* transaction) |future-return0| the transaction snapshot read version. |future-return1| call :func:`fdb_future_get_int64()` to extract the version into an int64_t that you provide, |future-return2| - The transaction obtains a snapshot read version automatically at the time of the first call to :func:`fdb_transaction_get_*()` (including this one) and (unless causal consistency has been deliberately compromised by transaction options) is guaranteed to represent all transactions which were reported committed before that call. + The transaction obtains a snapshot read version automatically at the time of the first call to ``fdb_transaction_get_*()`` (including this one) and (unless causal consistency has been deliberately compromised by transaction options) is guaranteed to represent all transactions which were reported committed before that call. .. function:: FDBFuture* fdb_transaction_get(FDBTransaction* transaction, uint8_t const* key_name, int key_name_length, fdb_bool_t snapshot) @@ -829,7 +829,7 @@ Applications must provide error handling and an appropriate retry loop around th |future-returnvoid| - Callers will usually want to retry a transaction if the commit or a prior :func:`fdb_transaction_get_*()` returns a retryable error (see :func:`fdb_transaction_on_error()`). + Callers will usually want to retry a transaction if the commit or a prior ``fdb_transaction_get_*()`` returns a retryable error (see :func:`fdb_transaction_on_error()`). |commit-unknown-result-blurb| @@ -878,9 +878,9 @@ Applications must provide error handling and an appropriate retry loop around th .. function:: FDBFuture* fdb_transaction_on_error(FDBTransaction* transaction, fdb_error_t error) - Implements the recommended retry and backoff behavior for a transaction. 
This function knows which of the error codes generated by other :func:`fdb_transaction_*()` functions represent temporary error conditions and which represent application errors that should be handled by the application. It also implements an exponential backoff strategy to avoid swamping the database cluster with excessive retries when there is a high level of conflict between transactions. + Implements the recommended retry and backoff behavior for a transaction. This function knows which of the error codes generated by other ``fdb_transaction_*()`` functions represent temporary error conditions and which represent application errors that should be handled by the application. It also implements an exponential backoff strategy to avoid swamping the database cluster with excessive retries when there is a high level of conflict between transactions. - On receiving any type of error from an :func:`fdb_transaction_*()` function, the application should: + On receiving any type of error from an ``fdb_transaction_*()`` function, the application should: 1. Call :func:`fdb_transaction_on_error()` with the returned :type:`fdb_error_t` code. @@ -963,15 +963,15 @@ Key selectors In the FoundationDB C API, key selectors are not represented by a structure of any kind, but are instead expressed as sequential parameters to |get-key-func| and |get-range-func|. For convenience, the most common key selectors are available as C macros that expand to the appropriate parameters. -.. function:: FDB_KEYSEL_LAST_LESS_THAN(key_name, key_name_length) +.. type:: FDB_KEYSEL_LAST_LESS_THAN(key_name, key_name_length) -.. function:: FDB_KEYSEL_LAST_LESS_OR_EQUAL(key_name, key_name_length) +.. type:: FDB_KEYSEL_LAST_LESS_OR_EQUAL(key_name, key_name_length) -.. function:: FDB_KEYSEL_FIRST_GREATER_THAN(key_name, key_name_length) +.. type:: FDB_KEYSEL_FIRST_GREATER_THAN(key_name, key_name_length) -.. function:: FDB_KEYSEL_FIRST_GREATER_OR_EQUAL(key_name, key_name_length) +.. type:: FDB_KEYSEL_FIRST_GREATER_OR_EQUAL(key_name, key_name_length) -To use one of these macros, simply replace the four parameters in the function with one of :func:`FDB_KEYSEL_*`:: +To use one of these macros, simply replace the four parameters in the function with one of ``FDB_KEYSEL_*``:: future = fdb_transaction_get_key(transaction, "key", 3, 0, 2, 0); diff --git a/documentation/sphinx/source/api-python.rst b/documentation/sphinx/source/api-python.rst index c024c7d2d5..2577bdfc5f 100644 --- a/documentation/sphinx/source/api-python.rst +++ b/documentation/sphinx/source/api-python.rst @@ -194,10 +194,6 @@ After importing the ``fdb`` module and selecting an API version, you probably wa |option-tls-key-bytes| - .. method :: fdb.options.set_tls_verify_peers(verification_pattern) - - |option-tls-verify-peers| - .. method :: fdb.options.set_tls_ca_bytes(ca_bundle) |option-tls-ca-bytes| @@ -210,10 +206,6 @@ After importing the ``fdb`` module and selecting an API version, you probably wa |option-tls-password| - .. method :: fdb.options.set_disable_multi_version_client_api() - - |option-disable-multi-version-client-api| - .. method :: fdb.options.set_disable_local_client() |option-set-disable-local-client| @@ -761,10 +753,6 @@ In each of the methods below, ``param`` should be a string appropriately packed Committing ---------- -.. decorator:: transactional() - - The ``transactional`` decorator makes it easy to write transactional functions which accept a :class:`Database`, :class`Tenant`, or :class:`Transaction` as a parameter and automatically commit. 
See :func:`@fdb.transactional ` for explanation and examples. - .. method :: Transaction.commit() Attempt to commit the changes made in the transaction to the database. Returns a :class:`FutureVoid` representing the asynchronous result of the commit. You **must** call the :meth:`Future.wait()` method on the returned :class:`FutureVoid`, which will raise an exception if the commit failed. diff --git a/documentation/sphinx/source/architecture.rst b/documentation/sphinx/source/architecture.rst index 7c28518d74..f693865430 100644 --- a/documentation/sphinx/source/architecture.rst +++ b/documentation/sphinx/source/architecture.rst @@ -14,8 +14,12 @@ Detailed FoundationDB Architecture The FoundationDB architecture chooses a decoupled design, where processes are assigned different heterogeneous roles (e.g., -Coordinators, Storage Servers, Master). Scaling the database is achieved -by horizontally expanding the number of processes for separate roles: +Coordinators, Storage Servers, Master). Cluster attempts to recruit +different roles as separate processes, however, it is possible that +multiple Stateless roles gets colocated (recruited) on a single +process to meet the cluster recruitment goals. Scaling the database +is achieved by horizontally expanding the number of processes for +separate roles: Coordinators ~~~~~~~~~~~~ diff --git a/documentation/sphinx/source/authorization.rst b/documentation/sphinx/source/authorization.rst new file mode 100644 index 0000000000..ce25df1057 --- /dev/null +++ b/documentation/sphinx/source/authorization.rst @@ -0,0 +1,124 @@ +############# +Authorization +############# + +.. warning :: Authorization is currently experimental and is not recommended for use in production. + +Introduction +============ + +:ref:`Multi-tenant ` database implies a couple of new concepts that did not previously exist in FoundationDB. +The first is the concept of privilege levels: we have *data-plane clients* whose typical workload is limited to accessing a tenant keyspace. +On the other hand, we have *control-plane clients* or *administrators* who may read or update cluster-wide configurations through system keyspace. +These operations also include creation and deletion of tenants. +The second is access control: with multiple tenant keyspaces, it comes naturally that we would want to restrict database access of a client to a subset of them. + +Privilege Levels +---------------- + +Authorization feature extends FoundationDB's existing TLS policy to distinguish administrators from data-plane clients, +making TLS configuration a prerequisite for enabling authorization. +There are only two privilege levels: *trusted* versus *untrusted* clients. +Trusted clients are authorized to perform any operation that pre-authorization FoundationDB clients used to perform, including those accessing the system keyspace. +Untrusted clients may only request what is necessary to access tenant keyspaces for which they are authorized. +Untrusted clients are blocked from accessing anything in the system keyspace or issuing management operations that modify the cluster in any way. + +In order to be considered a trusted client, a client needs to be :ref:`configured with a valid chain of X.509 certificates and a private key `, +and its certificate chain must be trusted by the server. In other words, a client must successfully complete a mutual TLS authentication. +Additionally, if the server was configured with trusted IP subnets, i.e. 
run with one or more ``--trusted-subnet-SUBNET_NAME`` followed by a CIDR block describing the subnet, +then the client's IP as seen from the server must belong to at least one of the subnets. + +Choosing to respond with an empty certificate chain during `client authentication `_ marks the client as untrusted. +If the server specifies a list of trusted subnets and the client's server-facing IP is not part of any of the subnets, +then the client is untrusted even if it successfully completes a mutual TLS authentication. + +.. note:: Presenting a bad or untrusted certificate chain causes the server to break the client connection and eventually throttle the client. + It does not let the client connect untrusted. + +Access Control +-------------- + +To restrict untrusted client's database access to a subset of tenant keyspaces, authorization feature allows database administrators +to grant tenant-scoped access in the form of `JSON Web Tokens `_. +Token verification is performed against a set of named public keys written in `JWK Set `_ format. +A token's header part must contain the `key identifier `_ of the public key which shall be used to verify the token itself. +Below is the list of token fields recognized by FoundationDB. +Note that some of the fields are *recognized* by FoundationDB but not *actively used* in enforcing security, pending future implementation. +Those fields are marked as **NOT required**. + + +.. table:: JSON Web Token Fields supported by FoundationDB + :align: left + :widths: auto + + =============== =========== ======== ==================================================== ================================================================================ + Containing Part Field Name Required Purpose Reference + =============== =========== ======== ==================================================== ================================================================================ + Header ``typ`` Yes Type of JSON Web Signature. Must be ``JWT``. `RFC7519 Section 5.1 `_ + Header ``alg`` Yes Algorithm used to generate the signature. Only `RFC7515 Section 4.1.1 `_ + ``ES256`` and ``RS256`` are supported. + Must match the ``alg`` attribute of public key. + Header ``kid`` Yes Name of public key with which to verify the token. `RFC7515 Section 4.1.4 `_ + Must match the ``kid`` attribute of public key. + Claim ``exp`` Yes Timestamp after which token is not accepted. `RFC7519 Section 4.1.4 `_ + Claim ``nbf`` Yes Timestamp before which token is not accepted. `RFC7519 Section 4.1.5 `_ + Claim ``iat`` Yes Timestamp at which token was issued. `RFC7519 Section 4.1.6 `_ + Claim ``tenants`` Yes Tenants names for which token holder is authorized. N/A + Must be an array. + Claim ``iss`` No Issuer of the token. `RFC7519 Section 4.1.1 `_ + Claim ``sub`` No Subject of the token. `RFC7519 Section 4.1.2 `_ + Claim ``aud`` No Intended recipients of the token. Must be an array. `RFC7519 Section 4.1.3 `_ + Claim ``jti`` No String that uniquely identifies a token. `RFC7519 Section 4.1.7 `_ + =============== =========== ======== ==================================================== ================================================================================ + +Public keys with which to verify the token must be serialized in `JWK Set `_ format and stored in a file. +The location of the key set file must be passed as command line argument ``--authorization-public-key-file`` to the ``fdbserver`` executable. 
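For illustration, the sketch below uses PyJWT to mint a token whose header and claims mirror the table above. It is not an official tool: the key id and tenant name are placeholders, the matching JWK Set file is not produced here, and the exact encoding expected for tenant names should be verified against the current implementation.

.. code-block:: python

   # Hedged example: mint an ES256 token (pip install pyjwt cryptography).
   import time
   import jwt  # PyJWT
   from cryptography.hazmat.primitives import serialization
   from cryptography.hazmat.primitives.asymmetric import ec

   private_key = ec.generate_private_key(ec.SECP256R1())  # P-256 key for ES256
   pem = private_key.private_bytes(
       serialization.Encoding.PEM,
       serialization.PrivateFormat.PKCS8,
       serialization.NoEncryption(),
   )

   now = int(time.time())
   token = jwt.encode(
       {
           "tenants": ["some_tenant"],  # tenant names the holder may access
           "iat": now,
           "nbf": now,
           "exp": now + 3600,           # keep token lifetimes short
       },
       pem,
       algorithm="ES256",
       headers={"typ": "JWT", "kid": "my-key-1"},  # kid must match the JWK Set entry
   )
   print(token)
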
+Public keys in the set must be either `RSA `_ public keys +containing ``n`` and ``e`` parameters, each containing `Base64urlUInt `_-encoded modulus and exponent, +or `Elliptic Curve `_ public keys on a ``P-256`` curve, +where ``crv`` parameter is set to ``P-256`` and ``x`` and ``y`` parameters contain +`base64url `_-encoded affine coordinates. +In addition, each public key JSON object in set must contain ``kty`` (set to either ``EC`` or ``RSA``) field to indicate public key algorithm, +along with ``kid``, and ``alg`` fields to be compared against their token header counterparts. +Private keys are strongly recommended against being included in the public key set and, if found, are excluded from consideration. + +.. note:: By design, FoundationDB authorization feature does not support revocation of outstanding tokens. + Use extra caution in signing tokens with long token durations. + +Enabling Clients to use Authorization Tokens +============================================ + +In order to use an untrusted client with an authorization token, a client must be configured to trust the server's CA, +but must not be configured to use the client's own certificates and keys. +More concretely, the client's ``TLS_CA_FILE`` must include the server's root CA certificate, +but the client must not be configured with its own ``TLS_CERTIFICATE_FILE`` or ``TLS_KEY_FILE``, neither programmatically nor by environment variable. +Before performing a tenant data read or update, a client must set ``AUTHORIZATION_TOKEN`` transaction option with the token string as argument. +It is the client's responsibility to keep the token up-to-date, by timely assigning a new token to the transaction object. + +.. note:: The TLS authentication mode of an untrusted client is similar to how typical web browsers connect to TLS-enabled web services. + They authenticate the server using their bundle of trusted root CA certificates, + but they do not authenticate themselves to the server. + +Public Key Rotation +=================== + +FoundationDB's internal public key set automatically refreshes itself based on the key set file's latest content every ``PUBLIC_KEY_FILE_REFRESH_INTERVAL_SECONDS`` seconds. +The in-memory set of public keys does not update unless the key file holds a correct `JWK Set`_. + +Token Caching +============= + +In a single-threaded runtime environment such as FoundationDB, it is important not to let the main thread be overloaded with computationally expensive operations, +such as token signature verification. FoundationDB internally caches the tokens that are considered valid at the time of verification in a fixed-size cache, +whose size may be configured using ``TOKEN_CACHE_SIZE`` knob. + +.. note:: Token cache is independent of the active public key set. Once the token reaches the cache, it is valid until its expiration time, + regardless of any key rotation that takes place thereafter. + +Allowing Untrusted Clients to Access Tenant Data Without Tokens +=============================================================== + +Rolling out a public key distribution infrastructure and an authorization-enabled FoundationDB cluster in lockstep might not be feasible with large scale distributed systems. +To support incremental rollout, authorization feature introduces ``ALLOW_TOKENLESS_TENANT_ACCESS`` boolean knob, +which preserves the TLS-based privilege level policy without untrusted clients having to set authorization tokens to their transactions in order to access tenant data. 
+With this knob active, any authorization token assigned to the client transaction is simply ignored. diff --git a/documentation/sphinx/source/command-line-interface.rst b/documentation/sphinx/source/command-line-interface.rst index 09969a0640..5f413d6c98 100644 --- a/documentation/sphinx/source/command-line-interface.rst +++ b/documentation/sphinx/source/command-line-interface.rst @@ -64,7 +64,7 @@ The ``commit`` command commits the current transaction. Any sets or clears execu configure --------- -The ``configure`` command changes the database configuration. Its syntax is ``configure [new|tss] [single|double|triple|three_data_hall|three_datacenter] [ssd|memory] [grv_proxies=] [commit_proxies=] [resolvers=] [logs=] [count=] [perpetual_storage_wiggle=] [perpetual_storage_wiggle_locality=<:|0>] [storage_migration_type={disabled|aggressive|gradual}] [tenant_mode={disabled|optional_experimental|required_experimental}]``. +The ``configure`` command changes the database configuration. Its syntax is ``configure [new|tss] [single|double|triple|three_data_hall|three_datacenter] [ssd|memory] [grv_proxies=] [commit_proxies=] [resolvers=] [logs=] [count=] [perpetual_storage_wiggle=] [perpetual_storage_wiggle_locality=<:|0>] [storage_migration_type={disabled|aggressive|gradual}] [tenant_mode={disabled|optional_experimental|required_experimental}] [encryption_at_rest_mode={aes_256_ctr|disabled}]``. The ``new`` option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. When ``new`` is used, both a redundancy mode and a storage engine must be specified. @@ -153,13 +153,6 @@ If ``description=`` is specified, the description field in the cluster fil For more information on setting the cluster description, see :ref:`configuration-setting-cluster-description`. -createtenant ------------- - -The ``createtenant`` command is used to create new tenants in the cluster. Its syntax is ``createtenant ``. - -The tenant name can be any byte string that does not begin with the ``\xff`` byte. If the tenant already exists, ``fdbcli`` will report an error. - defaulttenant ------------- @@ -167,13 +160,6 @@ The ``defaulttenant`` command configures ``fdbcli`` to run its commands without The active tenant cannot be changed while a transaction (using ``begin``) is open. -deletetenant ------------- - -The ``deletetenant`` command is used to delete tenants from the cluster. Its syntax is ``deletetenant ``. - -In order to delete a tenant, it must be empty. To delete a tenant with data, first clear that data using the ``clear`` command. If the tenant does not exist, ``fdbcli`` will report an error. - exclude ------- @@ -231,33 +217,8 @@ The ``getrangekeys`` command fetches keys in a range. Its syntax is ``getrangeke Note that :ref:`characters can be escaped ` when specifying keys (or values) in ``fdbcli``. -gettenant ---------- - -The ``gettenant`` command fetches metadata for a given tenant and displays it. Its syntax is ``gettenant [JSON]``. - -Included in the output of this command are the ``id`` and ``prefix`` assigned to the tenant. If the tenant does not exist, ``fdbcli`` will report an error. 
If ``JSON`` is specified, then the output will be written as a JSON document:: - - { - "tenant": { - "id": 0, - "prefix": { - "base64": "AAAAAAAAAAU=", - "printable": "\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x05", - } - }, - "type": "success" - } - -In the event of an error, the output will include an error message:: - - { - "error": "...", - "type": "error" - } - - getversion - ---------- +getversion +---------- The ``getversion`` command fetches the current read version of the cluster or currently running transaction. @@ -346,13 +307,6 @@ Attempts to kill all specified processes. Each address should include the IP and Attempts to kill all known processes in the cluster. -listtenants ------------ - -The ``listtenants`` command prints the names of tenants in the cluster. Its syntax is ``listtenants [BEGIN] [END] [LIMIT]``. - -By default, the ``listtenants`` command will print up to 100 entries from the entire range of tenants. A narrower sub-range can be printed using the optional ``[BEGIN]`` and ``[END]`` parameters, and the limit can be changed by specifying an integer ``[LIMIT]`` parameter. - lock ---- @@ -417,13 +371,6 @@ heap Enables heap profiling for the specified process. -renametenant ------------- - -The ``renametenant`` command can rename an existing tenant to a new name. Its syntax is ``renametenant ``. - -This command requires that ``OLD_NAME`` is a tenant that already exists on the cluster, and that ``NEW_NAME`` is not already a name of a tenant in the cluster. - reset ----- @@ -484,6 +431,143 @@ status json .. _cli-throttle: +tenant +------ + +The ``tenant`` command is used to view and manage the tenants in a cluster. The ``tenant`` command has the following subcommands: + +create +^^^^^^ + +``tenant create [tenant_group=] [assigned_cluster=]`` + +Creates a new tenant in the cluster. + +``NAME`` - The desired name of the tenant. The name can be any byte string that does not begin with the ``\xff`` byte. + +``TENANT_GROUP`` - The tenant group the tenant will be placed in. + +``CLUSTER_NAME`` - The cluster the tenant will be placed in (metacluster only). If unspecified, the metacluster will choose the cluster. + +delete +^^^^^^ + +``tenant delete `` + +Deletes a tenant from the cluster. The tenant must be empty. + +``NAME`` - the name of the tenant to delete. + +list +^^^^ + +``tenant list [BEGIN] [END] [LIMIT]`` + +Lists the tenants present in the cluster. + +``BEGIN`` - the first tenant to list. Defaults to the empty tenant name ``""``. + +``END`` - the exclusive end tenant to list. Defaults to ``\xff\xff``. + +``LIMIT`` - the number of tenants to list. Defaults to 100. + +get +^^^ + +``tenant get [JSON]`` + +Prints the metadata for a tenant. + +``NAME`` - the name of the tenant to print. + +``JSON`` - if specified, the output of the command will be printed in the form of a JSON string:: + + { + "tenant": { + "id": 0, + "prefix": { + "base64": "AAAAAAAAAAU=", + "printable": "\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x05", + } + }, + "type": "success" + } + +In the event of an error, the JSON output will include an error message:: + + { + "error": "...", + "type": "error" + } + +configure +^^^^^^^^^ + +``tenant configure <[unset] tenant_group[=GROUP_NAME]>`` + +Changes the configuration of a tenant. + +``TENANT_NAME`` - the name of the tenant to reconfigure. + +The following tenant fields can be configured: + +``tenant_group`` - changes the tenant group a tenant is assigned to. If ``unset`` is specified, the tenant will be configured to not be in a group. 
Otherwise, ``GROUP_NAME`` must be specified to the new group that the tenant should be made a member of. + +rename +^^^^^^ + +``tenant rename `` + +Changes the name of an existing tenant. + +``OLD_NAME`` - the name of the tenant being renamed. + +``NEW_NAME`` - the desired name of the tenant. This name must not already be in use. + + +tenantgroup +----------- + +The ``tenantgroup`` command is used to view details about the tenant groups in a cluster. The ``tenantgroup`` command has the following subcommands: + +list +^^^^ + +``tenantgroup list [BEGIN] [END] [LIMIT]`` + +Lists the tenant groups present in the cluster. + +``BEGIN`` - the first tenant group to list. Defaults to the empty tenant group name ``""``. + +``END`` - the exclusive end tenant group to list. Defaults to ``\xff\xff``. + +``LIMIT`` - the number of tenant groups to list. Defaults to 100. + +get +^^^ + +``tenantgroup get [JSON]`` + +Prints the metadata for a tenant group. + +``NAME`` - the name of the tenant group to print. + +``JSON`` - if specified, the output of the command will be printed in the form of a JSON string:: + + { + "tenant_group": { + "assigned_cluster": "cluster1", + }, + "type": "success" + } + +In the event of an error, the JSON output will include an error message:: + + { + "error": "...", + "type": "error" + } + throttle -------- diff --git a/documentation/sphinx/source/configuration.rst b/documentation/sphinx/source/configuration.rst index 699c811139..5d52d40910 100644 --- a/documentation/sphinx/source/configuration.rst +++ b/documentation/sphinx/source/configuration.rst @@ -416,6 +416,9 @@ FoundationDB will never use processes on the same machine for the replication of ``three_data_hall`` mode FoundationDB stores data in triplicate, with one copy on a storage server in each of three data halls. The transaction logs are replicated four times, with two data halls containing two replicas apiece. Four available machines (two in each of two data halls) are therefore required to make progress. This configuration enables the cluster to remain available after losing a single data hall and one machine in another data hall. +``three_data_hall_fallback`` mode + FoundationDB stores data in duplicate, with one copy each on a storage server in two of three data halls. The transaction logs are replicated four times, with two data halls containing two replicas apiece. Four available machines (two in each of two data halls) are therefore required to make progress. This configuration is similar to ``three_data_hall``, differing only in that data is stored on two instead of three replicas. This configuration is useful to unblock data distribution when a data hall becomes temporarily unavailable. Because ``three_data_hall_fallback`` reduces the redundancy level to two, it should only be used as a temporary measure to restore cluster health during a datacenter outage. + Datacenter-aware mode --------------------- diff --git a/documentation/sphinx/source/data-modeling.rst b/documentation/sphinx/source/data-modeling.rst index 146b006809..5972c5110a 100644 --- a/documentation/sphinx/source/data-modeling.rst +++ b/documentation/sphinx/source/data-modeling.rst @@ -1,7 +1,6 @@ .. default-domain:: py .. default-domain:: py .. highlight:: python -.. module:: fdb .. 
Required substitutions for api-common.rst.inc diff --git a/documentation/sphinx/source/developer-guide.rst b/documentation/sphinx/source/developer-guide.rst index 3bf5ec30c0..c889e4909b 100644 --- a/documentation/sphinx/source/developer-guide.rst +++ b/documentation/sphinx/source/developer-guide.rst @@ -1,7 +1,6 @@ .. default-domain:: py .. default-domain:: py .. highlight:: python -.. module:: fdb .. Required substitutions for api-common.rst.inc diff --git a/documentation/sphinx/source/global-configuration.rst b/documentation/sphinx/source/global-configuration.rst index 1c5f94dc7d..663ad26eb4 100644 --- a/documentation/sphinx/source/global-configuration.rst +++ b/documentation/sphinx/source/global-configuration.rst @@ -82,7 +82,7 @@ Values must always be encoded according to the :ref:`api-python-tuple-layer`. // In GlobalConfig.actor.h extern const KeyRef myGlobalConfigKey; // In GlobalConfig.actor.cpp - const KeyRef myGlobalConfigKey = LiteralStringRef("config/key"); + const KeyRef myGlobalConfigKey = "config/key"_sr; // When you want to set the value.. Tuple value = Tuple::makeTuple((double)1.5); diff --git a/documentation/sphinx/source/index.rst b/documentation/sphinx/source/index.rst index 40c7a76279..167bebda43 100644 --- a/documentation/sphinx/source/index.rst +++ b/documentation/sphinx/source/index.rst @@ -50,6 +50,7 @@ The latest changes are detailed in :ref:`release-notes`. The documentation has t :hidden: local-dev + internal-dev-tools why-foundationdb technical-overview client-design diff --git a/documentation/sphinx/source/internal-dev-tools.rst b/documentation/sphinx/source/internal-dev-tools.rst new file mode 100644 index 0000000000..ea80947312 --- /dev/null +++ b/documentation/sphinx/source/internal-dev-tools.rst @@ -0,0 +1,58 @@ +################## +Internal Dev Tools +################## + +Code Probes +=========== + +Code probes are a mechanism in FDB to prove that certain code-paths are being tested under the right conditions. They differ from code coverage in multiple ways (explained below). + +The general format of a code probe is: + +.. code-block:: C++ + + CODE_PROBE(, "Comment", [annotations...]); + +A simple example of a code probe could look as follows: + +.. code-block:: C++ + + CODE_PROBE(self->forceRecovery, "Resolver detects forced recovery", probe::context::sim2); + +On a very high level, the above code will indicate that whenever this line is executed and ``self->forceRecovery`` is ``true``, we ran into some interesting case. In addition this probe is also annotated with ``probe::context::sim2``. This indicates that we expect this code to be eventually hit in simulation. + +By default, FDB simply will write a trace-line when this code is hit and the condition is ``true``. If the code is never hit, the simulator will, at the end of the run, print the code probe but set the ``covered`` field to ``false``. This all happens in the context of a single simulation run (``fdbserver`` doesn't have a concept of ensembles). This information is written into the log file. ``TestHarness`` (see below) will then use this information to write code probe statistics to the ensemble in the Joshua cluster (if the test is run in Joshua). + +We expect that ALL code probes will be hit in a nightly run. In the future we can potentially use this feature for other things (like instructing the simulator to do an extensive search starting when one of these probes is being hit). + +In addition to ``context`` annotations, users can also define and pass assertions. For example: + +.. 
code-block:: C++ + + CODE_PROBE(condition, "Some comment", assert::simOnly); + +These will add an assertion to the code. In addition to that, the simulator will not print missed code probes that asserted that the probe won't be hit in simulation. + +Test Harness +============ + +TestHarness is our primary testing tool. It has multiple jobs: + +* *Running*: It can run a test in Joshua. +* *Statistics*: It will choose a test to run based on previous runs (within the same ensemble) spent CPU time for each test. It does that by writing statistics about the test at the end of each run. +* *Reporting*: After an ensemble has finished (or while it is running), ``TestHarness`` can be used to generate a report in ``xml`` or ``json``. + +Test Harness can be found in the FDB source repository under ``contrib/TestHarness2``. It has a weak dependency to `joshua `_ (if Test Harness can find joshua it will report back about failed tests, otherwise it will just print out general statistics about the ensemble). Joshua will call Test Harness as follows: + +.. code-block:: shell + + python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} + +Here the seed is a random number generated by joshua and ``OLDBINDIR`` is a directory path where the old fdb binaries can be found (this is needed for restart tests). If one wants to retry a test they can pass the previous joshua seed, a directory path that has *exactly* the same content as ``OLDBINARYDIR``, plus the reported statistics to the test harness app. This should then re-run the same code as before. + +In order to figure out what command line arguments ``test_harness.app`` (and ``test_harness.results``) accepts, one can check the contents of ``contrib/TestHarness2/test_harness/config.py``. + +Reporting +--------- + +After a joshua ensemble completed, ``test_harness.results`` can be used in order to get a report on the ensemble. This will include, by default, a list of all failed tests (similar to ``joshua tail --errors``, though in a more human readable file). For completed ensemble it will also print code probes that weren't hit often enough. An ensemble is considered to be successful if no simulation runs completed with an error AND all code probes have been hit sufficiently often. diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 2cca7fb608..bd4f1388d9 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -790,6 +790,11 @@ "disabled", "optional_experimental", "required_experimental" + ]}, + "encryption_at_rest_mode": { + "$enum":[ + "disabled", + "aes_256_ctr" ]} }, "data":{ diff --git a/documentation/sphinx/source/release-notes/release-notes-710.rst b/documentation/sphinx/source/release-notes/release-notes-710.rst index b4c622f3f3..9dad6e05af 100644 --- a/documentation/sphinx/source/release-notes/release-notes-710.rst +++ b/documentation/sphinx/source/release-notes/release-notes-710.rst @@ -2,6 +2,32 @@ Release Notes ############# +7.1.23 +====== +* Same as 7.1.22 release with AVX enabled. + +7.1.22 +====== +* Released with AVX disabled. +* Added new latency samples for GetValue, GetRange, QueueWait, and VersionWait in storage servers. `(PR #8215) `_ +* Fixed a rare partial data write for TLogs. `(PR #8210) `_ +* Added HTTP proxy support for backup agents. `(PR #8193) `_ +* Fixed a memory bug of secondary queries in index prefetch. 
`(PR #8195) `_, `(PR #8190) `_ +* Introduced STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT knob to recreate SS at io_timeout errors. `(PR #8123) `_ +* Fixed two TLog stopped bugs and a CC leader replacement bug. `(PR #8081) `_ +* Added back RecoveryAvailable trace event for status's seconds_since_last_recovered field. `(PR #8068) `_ + +7.1.21 +====== +* Same as 7.1.20 release with AVX enabled. + +7.1.20 +====== +* Released with AVX disabled. +* Fixed missing localities for fdbserver that can cause cross DC calls among storage servers. `(PR #7995) `_ +* Removed extremely spammy trace event in FetchKeys and fixed transaction_profiling_analyzer.py. `(PR #7934) `_ +* Fixed bugs when GRV proxy returns an error. `(PR #7860) `_ + 7.1.19 ====== * Same as 7.1.18 release with AVX enabled. diff --git a/documentation/sphinx/source/special-keys.rst b/documentation/sphinx/source/special-keys.rst index aa5eede4af..75877d922b 100644 --- a/documentation/sphinx/source/special-keys.rst +++ b/documentation/sphinx/source/special-keys.rst @@ -27,13 +27,10 @@ Each special key that existed before api version 630 is its own module. These ar Prior to api version 630, it was also possible to read a range starting at ``\xff\xff/worker_interfaces``. This is mostly an implementation detail of fdbcli, but it's available in api version 630 as a module with prefix ``\xff\xff/worker_interfaces/``. -Api version 630 includes two new modules: +Api version 630 includes three new modules: #. ``\xff\xff/transaction/`` - information about the current transaction #. ``\xff\xff/metrics/`` - various metrics, not transactional - -Api version 720 includes one new module: - #. ``\xff\xff/clusterId`` - returns an immutable unique ID for a cluster Transaction module @@ -279,7 +276,6 @@ Deprecated Keys Listed below are the special keys that have been deprecated. Special key(s) will no longer be accessible when the client specifies an API version equal to or larger than the version where they were deprecated. Clients specifying older API versions will be able to continue using the deprecated key(s). #. ``\xff\xff/management/profiling/`` Deprecated as of API version 720. The corresponding functionalities are now covered by the global configuration module. For details, see :doc:`global-configuration`. Read/write. Changing these two keys will change the corresponding system keys ``\xff\x02/fdbClientInfo/``, respectively. The value of ``\xff\xff/management/client_txn_sample_rate`` is a literal text of ``double``, and the value of ``\xff\xff/management/client_txn_size_limit`` is a literal text of ``int64_t``. A special value ``default`` can be set to or read from these two keys, representing the client profiling is disabled. In addition, ``clear`` in this range is not allowed. For more details, see help text of ``fdbcli`` command ``profile client``. -#. ``\xff\xff/management/tenant_map/`` Removed as of API version 720 and renamed to ``\xff\xff/management/tenant/map/``. Versioning ========== diff --git a/documentation/sphinx/source/tenants.rst b/documentation/sphinx/source/tenants.rst index b631c55ba2..07bd7b2a42 100644 --- a/documentation/sphinx/source/tenants.rst +++ b/documentation/sphinx/source/tenants.rst @@ -2,6 +2,8 @@ Tenants ####### +.. _multi-tenancy: + .. warning :: Tenants are currently experimental and are not recommended for use in production. FoundationDB provides a feature called tenants that allow you to configure one or more named transaction domains in your cluster. 
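As a quick orientation, the hedged Python sketch below opens an existing tenant and runs a transaction inside it. It assumes API version 710 or later, a tenant that was already created (for example with ``tenant create`` in ``fdbcli``), and the ``open_tenant`` call described in the Python API documentation.

.. code-block:: python

   import fdb
   fdb.api_version(710)

   db = fdb.open()                        # uses the default cluster file
   tenant = db.open_tenant(b"my_tenant")  # the tenant must already exist

   @fdb.transactional
   def set_and_get(tr):
       tr[b"hello"] = b"world"            # keys are relative to the tenant's key-space
       return tr[b"hello"]

   # The transactional decorator accepts a Database or a Tenant.
   print(set_and_get(tenant))
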
A transaction domain is a key-space in which a transaction is allowed to operate, and no tenant operations are allowed to use keys outside the tenant key-space. Tenants can be useful for managing separate, unrelated use-cases and preventing them from interfering with each other. They can also be helpful for defining safe boundaries when moving a subset of data between clusters. diff --git a/documentation/sphinx/source/tls.rst b/documentation/sphinx/source/tls.rst index 3fb4a08d0c..dcb0c2c930 100644 --- a/documentation/sphinx/source/tls.rst +++ b/documentation/sphinx/source/tls.rst @@ -126,11 +126,11 @@ Default Values Certificate file default location ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The default behavior when the certificate or key file is not specified is to look for a file named ``fdb.pem`` in the current working directory. If this file is not present, an attempt is made to load a file from a system-dependent location as follows: +The default behavior when the certificate or key file is not specified is to look for files named ``cert.pem`` or ``key.pem`` respectively, in system-dependent locations as follows: -* Linux: ``/etc/foundationdb/fdb.pem`` -* macOS: ``/usr/local/etc/foundationdb/fdb.pem`` -* Windows: ``C:\ProgramData\foundationdb\fdb.pem`` +* Linux: ``/etc/foundationdb/cert.pem`` and ``/etc/foundationdb/key.pem`` +* macOS: ``/usr/local/etc/foundationdb/cert.pem`` and ``/usr/local/etc/foundationdb/key.pem`` +* Windows: ``C:\ProgramData\foundationdb\cert.pem`` and ``C:\ProgramData\foundationdb\key.pem`` Default Peer Verification ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -214,9 +214,12 @@ Certificate creation If your organization already makes use of certificates for access control and securing communications, you should ask your security expert for organizational procedure for obtaining and verifying certificates. If the goal of enabling TLS is to make sure that only known machines can join or access the FoundationDB cluster and for securing communications, then creating your own certificates can serve these purposes. -The following set of commands uses the OpenSSL command-line tools to create a self-signed certificate and private key. 
The certificate is then joined with the private key in the output ``fdb.pem`` file:: +The following set of commands uses the OpenSSL command-line tools to create a self-signed certificate and private key:: + + user@host:> openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout key.pem -out cert.pem + +Optionally, the certificate can be joined with the private key and supplied as both the certificate and key file:: - user@host:> openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout private.key -out cert.crt user@host:> cat cert.pem key.pem > fdb.pem Peer verification diff --git a/documentation/tutorial/tutorial.actor.cpp b/documentation/tutorial/tutorial.actor.cpp index 9d980ff3d6..245e6d09e3 100644 --- a/documentation/tutorial/tutorial.actor.cpp +++ b/documentation/tutorial/tutorial.actor.cpp @@ -478,7 +478,7 @@ ACTOR Future fdbClient() { state Transaction tx(db); state std::string keyPrefix = "/tut/"; state Key startKey; - state KeyRef endKey = LiteralStringRef("/tut0"); + state KeyRef endKey = "/tut0"_sr; state int beginIdx = 0; loop { try { @@ -494,7 +494,7 @@ ACTOR Future fdbClient() { RangeResult range = wait(tx.getRange(KeyRangeRef(startKey, endKey), 100)); for (int i = 0; i < 10; ++i) { Key k = Key(keyPrefix + std::to_string(beginIdx + deterministicRandom()->randomInt(0, 100))); - tx.set(k, LiteralStringRef("foo")); + tx.set(k, "foo"_sr); } wait(tx.commit()); std::cout << "Committed\n"; diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 1a0f2eba14..24601308e1 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -18,6 +18,7 @@ * limitations under the License. */ +#include "flow/ApiVersion.h" #include "fmt/format.h" #include "fdbbackup/BackupTLSConfig.h" #include "fdbclient/JsonBuilder.h" @@ -904,12 +905,12 @@ CSimpleOpt::SOption g_rgDBPauseOptions[] = { SO_END_OF_OPTIONS }; -const KeyRef exeAgent = LiteralStringRef("backup_agent"); -const KeyRef exeBackup = LiteralStringRef("fdbbackup"); -const KeyRef exeRestore = LiteralStringRef("fdbrestore"); -const KeyRef exeFastRestoreTool = LiteralStringRef("fastrestore_tool"); // must be lower case -const KeyRef exeDatabaseAgent = LiteralStringRef("dr_agent"); -const KeyRef exeDatabaseBackup = LiteralStringRef("fdbdr"); +const KeyRef exeAgent = "backup_agent"_sr; +const KeyRef exeBackup = "fdbbackup"_sr; +const KeyRef exeRestore = "fdbrestore"_sr; +const KeyRef exeFastRestoreTool = "fastrestore_tool"_sr; // must be lower case +const KeyRef exeDatabaseAgent = "dr_agent"_sr; +const KeyRef exeDatabaseBackup = "fdbdr"_sr; extern const char* getSourceVersion(); @@ -1350,7 +1351,7 @@ ProgramExe getProgramType(std::string programExe) { } #endif // For debugging convenience, remove .debug suffix if present.
- if (StringRef(programExe).endsWith(LiteralStringRef(".debug"))) + if (StringRef(programExe).endsWith(".debug"_sr)) programExe = programExe.substr(0, programExe.size() - 6); // Check if backup agent @@ -1855,11 +1856,7 @@ ACTOR Future submitDBBackup(Database src, std::string tagName) { try { state DatabaseBackupAgent backupAgent(src); - - // Backup everything, if no ranges were specified - if (backupRanges.size() == 0) { - backupRanges.push_back_deep(backupRanges.arena(), normalKeys); - } + ASSERT(!backupRanges.empty()); wait(backupAgent.submitBackup( dest, KeyRef(tagName), backupRanges, StopWhenDone::False, StringRef(), StringRef(), LockDB::True)); @@ -1905,6 +1902,7 @@ ACTOR Future submitBackup(Database db, int initialSnapshotIntervalSeconds, int snapshotIntervalSeconds, Standalone> backupRanges, + bool encryptionEnabled, std::string tagName, bool dryRun, WaitForComplete waitForCompletion, @@ -1913,11 +1911,7 @@ ACTOR Future submitBackup(Database db, IncrementalBackupOnly incrementalBackupOnly) { try { state FileBackupAgent backupAgent; - - // Backup everything, if no ranges were specified - if (backupRanges.size() == 0) { - backupRanges.push_back_deep(backupRanges.arena(), normalKeys); - } + ASSERT(!backupRanges.empty()); if (dryRun) { state KeyBackedTag tag = makeBackupTag(tagName); @@ -1964,6 +1958,7 @@ ACTOR Future submitBackup(Database db, snapshotIntervalSeconds, tagName, backupRanges, + encryptionEnabled, stopWhenDone, usePartitionedLog, incrementalBackupOnly)); @@ -2017,11 +2012,7 @@ ACTOR Future switchDBBackup(Database src, ForceAction forceAction) { try { state DatabaseBackupAgent backupAgent(src); - - // Backup everything, if no ranges were specified - if (backupRanges.size() == 0) { - backupRanges.push_back_deep(backupRanges.arena(), normalKeys); - } + ASSERT(!backupRanges.empty()); wait(backupAgent.atomicSwitchover(dest, KeyRef(tagName), backupRanges, StringRef(), StringRef(), forceAction)); printf("The DR on tag `%s' was successfully switched.\n", printable(StringRef(tagName)).c_str()); @@ -2288,9 +2279,7 @@ ACTOR Future runRestore(Database db, OnlyApplyMutationLogs onlyApplyMutationLogs, InconsistentSnapshotOnly inconsistentSnapshotOnly, Optional encryptionKeyFile) { - if (ranges.empty()) { - ranges.push_back_deep(ranges.arena(), normalKeys); - } + ASSERT(!ranges.empty()); if (targetVersion != invalidVersion && !targetTimestamp.empty()) { fprintf(stderr, "Restore target version and target timestamp cannot both be specified\n"); @@ -2314,7 +2303,7 @@ ACTOR Future runRestore(Database db, throw restore_error(); } - origDb = Database::createDatabase(originalClusterFile, Database::API_VERSION_LATEST); + origDb = Database::createDatabase(originalClusterFile, ApiVersion::LATEST_VERSION); Version v = wait(timeKeeperVersionFromDatetime(targetTimestamp, origDb.get())); fmt::print("Timestamp '{0}' resolves to version {1}\n", targetTimestamp, v); targetVersion = v; @@ -2371,7 +2360,7 @@ ACTOR Future runRestore(Database db, fmt::print("Restored to version {}\n", restoredVersion); } } else { - state Optional rset = wait(bc->getRestoreSet(targetVersion, ranges)); + state Optional rset = wait(bc->getRestoreSet(targetVersion, db, ranges)); if (!rset.present()) { fmt::print(stderr, @@ -2448,8 +2437,8 @@ ACTOR Future runFastRestoreTool(Database db, dbVersion, LockDB::True, randomUID, - LiteralStringRef(""), - LiteralStringRef(""))); + ""_sr, + ""_sr)); // TODO: Support addPrefix and removePrefix if (waitForDone) { // Wait for parallel restore to finish and unlock DB after that @@ -2481,7 
+2470,7 @@ ACTOR Future runFastRestoreTool(Database db, restoreVersion = dbVersion; } - state Optional rset = wait(bc->getRestoreSet(restoreVersion)); + state Optional rset = wait(bc->getRestoreSet(restoreVersion, db)); if (!rset.present()) { fmt::print(stderr, "Insufficient data to restore to version {}\n", restoreVersion); throw restore_invalid_version(); @@ -2686,7 +2675,8 @@ ACTOR Future queryBackup(const char* name, Version restoreVersion, std::string originalClusterFile, std::string restoreTimestamp, - Verbose verbose) { + Verbose verbose, + Optional cx) { state UID operationId = deterministicRandom()->randomUniqueID(); state JsonBuilderObject result; state std::string errorMessage; @@ -2720,7 +2710,7 @@ ACTOR Future queryBackup(const char* name, return Void(); } - Database origDb = Database::createDatabase(originalClusterFile, Database::API_VERSION_LATEST); + Database origDb = Database::createDatabase(originalClusterFile, ApiVersion::LATEST_VERSION); Version v = wait(timeKeeperVersionFromDatetime(restoreTimestamp, origDb)); result["restore_timestamp"] = restoreTimestamp; result["restore_timestamp_resolved_version"] = v; @@ -2751,7 +2741,7 @@ ACTOR Future queryBackup(const char* name, format("the specified restorable version %lld is not valid", restoreVersion)); return Void(); } - Optional fileSet = wait(bc->getRestoreSet(restoreVersion, keyRangesFilter)); + Optional fileSet = wait(bc->getRestoreSet(restoreVersion, cx, keyRangesFilter)); if (fileSet.present()) { int64_t totalRangeFilesSize = 0, totalLogFilesSize = 0; result["restore_version"] = fileSet.get().targetVersion; @@ -3088,7 +3078,7 @@ static void addKeyRange(std::string optionValue, Standalone connectToCluster(std::string const& clusterFile, } try { - db = Database::createDatabase(ccf, -1, IsInternal::True, localities); + db = Database::createDatabase(ccf, ApiVersion::LATEST_VERSION, IsInternal::True, localities); } catch (Error& e) { if (!quiet) { fprintf(stderr, "ERROR: %s\n", e.what()); @@ -3377,6 +3367,8 @@ int main(int argc, char* argv[]) { bool trace = false; bool quietDisplay = false; bool dryRun = false; + // TODO (Nim): Set this value when we add optional encrypt_files CLI argument to backup agent start + bool encryptionEnabled = true; std::string traceDir = ""; std::string traceFormat = ""; std::string traceLogGroup; @@ -3607,7 +3599,7 @@ int main(int argc, char* argv[]) { case OPT_DESTCONTAINER: destinationContainer = args->OptionArg(); // If the url starts with '/' then prepend "file://" for backwards compatibility - if (StringRef(destinationContainer).startsWith(LiteralStringRef("/"))) + if (StringRef(destinationContainer).startsWith("/"_sr)) destinationContainer = std::string("file://") + destinationContainer; modifyOptions.destURL = destinationContainer; break; @@ -3653,7 +3645,7 @@ int main(int argc, char* argv[]) { case OPT_RESTORECONTAINER: restoreContainer = args->OptionArg(); // If the url starts with '/' then prepend "file://" for backwards compatibility - if (StringRef(restoreContainer).startsWith(LiteralStringRef("/"))) + if (StringRef(restoreContainer).startsWith("/"_sr)) restoreContainer = std::string("file://") + restoreContainer; break; case OPT_DESCRIBE_DEEP: @@ -3944,6 +3936,12 @@ int main(int argc, char* argv[]) { return result.present(); }; + // The fastrestore tool does not yet support multiple ranges and is incompatible with tenants + // or other features that back up data in the system keys + if (backupKeys.empty() && programExe != ProgramExe::FASTRESTORE_TOOL) { + 
addDefaultBackupRanges(backupKeys); + } + switch (programExe) { case ProgramExe::AGENT: if (!initCluster()) @@ -3963,6 +3961,7 @@ int main(int argc, char* argv[]) { initialSnapshotIntervalSeconds, snapshotIntervalSeconds, backupKeys, + encryptionEnabled, tagName, dryRun, waitForDone, @@ -4083,7 +4082,8 @@ int main(int argc, char* argv[]) { restoreVersion, restoreClusterFileOrig, restoreTimestamp, - Verbose{ !quietDisplay })); + Verbose{ !quietDisplay }, + db)); break; case BackupType::DUMP: @@ -4123,7 +4123,7 @@ int main(int argc, char* argv[]) { } try { - db = Database::createDatabase(restoreClusterFileDest, Database::API_VERSION_LATEST); + db = Database::createDatabase(restoreClusterFileDest, ApiVersion::LATEST_VERSION); } catch (Error& e) { fprintf(stderr, "Restore destination cluster file '%s' invalid: %s\n", @@ -4202,7 +4202,7 @@ int main(int argc, char* argv[]) { } try { - db = Database::createDatabase(restoreClusterFileDest, Database::API_VERSION_LATEST); + db = Database::createDatabase(restoreClusterFileDest, ApiVersion::LATEST_VERSION); } catch (Error& e) { fprintf(stderr, "Restore destination cluster file '%s' invalid: %s\n", @@ -4322,19 +4322,19 @@ int main(int argc, char* argv[]) { char* demangled = abi::__cxa_demangle(i->first, NULL, NULL, NULL); if (demangled) { s = demangled; - if (StringRef(s).startsWith(LiteralStringRef("(anonymous namespace)::"))) - s = s.substr(LiteralStringRef("(anonymous namespace)::").size()); + if (StringRef(s).startsWith("(anonymous namespace)::"_sr)) + s = s.substr("(anonymous namespace)::"_sr.size()); free(demangled); } else s = i->first; #else s = i->first; - if (StringRef(s).startsWith(LiteralStringRef("class `anonymous namespace'::"))) - s = s.substr(LiteralStringRef("class `anonymous namespace'::").size()); - else if (StringRef(s).startsWith(LiteralStringRef("class "))) - s = s.substr(LiteralStringRef("class ").size()); - else if (StringRef(s).startsWith(LiteralStringRef("struct "))) - s = s.substr(LiteralStringRef("struct ").size()); + if (StringRef(s).startsWith("class `anonymous namespace'::"_sr)) + s = s.substr("class `anonymous namespace'::"_sr.size()); + else if (StringRef(s).startsWith("class "_sr)) + s = s.substr("class "_sr.size()); + else if (StringRef(s).startsWith("struct "_sr)) + s = s.substr("struct "_sr.size()); #endif typeNames.emplace_back(s, i->first); diff --git a/fdbcli/AdvanceVersionCommand.actor.cpp b/fdbcli/AdvanceVersionCommand.actor.cpp index 223af2d8e5..d3ba08d675 100644 --- a/fdbcli/AdvanceVersionCommand.actor.cpp +++ b/fdbcli/AdvanceVersionCommand.actor.cpp @@ -31,7 +31,7 @@ namespace fdb_cli { -const KeyRef advanceVersionSpecialKey = LiteralStringRef("\xff\xff/management/min_required_commit_version"); +const KeyRef advanceVersionSpecialKey = "\xff\xff/management/min_required_commit_version"_sr; ACTOR Future advanceVersionCommandActor(Reference db, std::vector tokens) { if (tokens.size() != 2) { diff --git a/fdbcli/BlobKeyCommand.actor.cpp b/fdbcli/BlobKeyCommand.actor.cpp new file mode 100644 index 0000000000..34d5b98720 --- /dev/null +++ b/fdbcli/BlobKeyCommand.actor.cpp @@ -0,0 +1,188 @@ +/* + * BlobKeyCommand.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbcli/fdbcli.actor.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/IClientApi.h" +#include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/NativeAPI.actor.h" + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/ThreadHelper.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. + +namespace { + +ACTOR Future printBlobHistory(Database db, Key key, Optional version) { + fmt::print("Printing blob history for {0}", key.printable()); + if (version.present()) { + fmt::print(" @ {0}", version.get()); + } + fmt::print("\n"); + + state Transaction tr(db); + state KeyRange activeGranule; + state KeyRange queryRange(KeyRangeRef(key, keyAfter(key))); + loop { + try { + Standalone> granules = wait(tr.getBlobGranuleRanges(queryRange, 2)); + if (granules.empty()) { + fmt::print("No active granule for {0}\n", key.printable()); + return false; + } + ASSERT(granules.size() == 1); + activeGranule = granules[0]; + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + fmt::print("Active granule: [{0} - {1})\n", activeGranule.begin.printable(), activeGranule.end.printable()); + + // get latest history entry for range + state GranuleHistory history; + loop { + try { + RangeResult result = + wait(tr.getRange(blobGranuleHistoryKeyRangeFor(activeGranule), 1, Snapshot::False, Reverse::True)); + ASSERT(result.size() <= 1); + + if (result.empty()) { + fmt::print("No history entry found\n"); + return true; + } + + std::pair decodedKey = decodeBlobGranuleHistoryKey(result[0].key); + ASSERT(activeGranule == decodedKey.first); + history = GranuleHistory(activeGranule, decodedKey.second, decodeBlobGranuleHistoryValue(result[0].value)); + + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + fmt::print("History:\n\n"); + loop { + // print history + std::string boundaryChangeAction; + if (history.value.parentVersions.empty()) { + boundaryChangeAction = "root"; + } else if (history.value.parentVersions.size() == 1) { + boundaryChangeAction = "split"; + } else { + boundaryChangeAction = "merge"; + } + fmt::print("{0}) {1}\n\t{2}\n\t{3}\n({4})\n\n", + history.version, + history.value.granuleID.toString(), + history.range.begin.printable(), + history.range.end.printable(), + boundaryChangeAction); + // traverse back + + if (history.value.parentVersions.empty() || (version.present() && history.version <= version.get())) { + break; + } + + int i; + for (i = 0; i < history.value.parentBoundaries.size(); i++) { + if (history.value.parentBoundaries[i] <= key) { + break; + } + } + // key should fall between boundaries + ASSERT(i < history.value.parentBoundaries.size()); + KeyRangeRef parentRange(history.value.parentBoundaries[i], history.value.parentBoundaries[i + 1]); + Version parentVersion = history.value.parentVersions[i]; + state Key parentHistoryKey = blobGranuleHistoryKeyFor(parentRange, parentVersion); + state bool foundParent; + + loop { + try { + Optional parentHistoryValue = wait(tr.get(parentHistoryKey)); + foundParent = parentHistoryValue.present(); + if (foundParent) { + std::pair decodedKey = 
decodeBlobGranuleHistoryKey(parentHistoryKey); + history = GranuleHistory( + decodedKey.first, decodedKey.second, decodeBlobGranuleHistoryValue(parentHistoryValue.get())); + } + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + if (!foundParent) { + break; + } + } + + fmt::print("Done\n"); + return true; +} + +} // namespace + +namespace fdb_cli { + +ACTOR Future blobKeyCommandActor(Database localDb, + Optional tenantEntry, + std::vector tokens) { + // enables blob writing for the given range + if (tokens.size() != 3 && tokens.size() != 4) { + printUsage(tokens[0]); + return false; + } + + ASSERT(tokens[1] == "history"_sr); + + Key key; + Optional version; + + if (tenantEntry.present()) { + key = tokens[2].withPrefix(tenantEntry.get().prefix); + } else { + key = tokens[2]; + } + + if (tokens.size() > 3) { + Version v; + int n = 0; + if (sscanf(tokens[3].toString().c_str(), "%" PRId64 "%n", &v, &n) != 1 || n != tokens[3].size()) { + printUsage(tokens[0]); + return false; + } + version = v; + } + + if (key >= "\xff"_sr) { + fmt::print("No blob history for system keyspace\n", key.printable()); + return false; + } else { + bool result = wait(printBlobHistory(localDb, key, version)); + return result; + } +} + +// can extend to other blobkey commands later +CommandFactory blobKeyFactory("blobkey", CommandHelp("blobkey history [version]", "", "")); +} // namespace fdb_cli diff --git a/fdbcli/BlobRangeCommand.actor.cpp b/fdbcli/BlobRangeCommand.actor.cpp index 4c6bdf9614..38edaa8568 100644 --- a/fdbcli/BlobRangeCommand.actor.cpp +++ b/fdbcli/BlobRangeCommand.actor.cpp @@ -112,7 +112,7 @@ ACTOR Future blobRangeCommandActor(Database localDb, end = tokens[3]; } - if (end > LiteralStringRef("\xff")) { + if (end > "\xff"_sr) { // TODO is this something we want? fmt::print("Cannot blobbify system keyspace! Problematic End Key: {0}\n", tokens[3].printable()); return false; @@ -127,19 +127,24 @@ ACTOR Future blobRangeCommandActor(Database localDb, } fmt::print("{0} blobbify range for [{1} - {2})\n", starting ? "Starting" : "Stopping", - tokens[2].printable().c_str(), - tokens[3].printable().c_str()); + tokens[2].printable(), + tokens[3].printable()); state bool success = false; if (starting) { wait(store(success, localDb->blobbifyRange(KeyRangeRef(begin, end)))); } else { wait(store(success, localDb->unblobbifyRange(KeyRangeRef(begin, end)))); } - if (!success) { + if (success) { + fmt::print("{0} updated blob range [{1} - {2}) succeeded\n", + starting ? "Starting" : "Stopping", + tokens[2].printable(), + tokens[3].printable()); + } else { fmt::print("{0} blobbify range for [{1} - {2}) failed\n", starting ? 
"Starting" : "Stopping", - tokens[2].printable().c_str(), - tokens[3].printable().c_str()); + tokens[2].printable(), + tokens[3].printable()); } return success; } else if (tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge") || tokencmp(tokens[1], "check")) { diff --git a/fdbcli/CMakeLists.txt b/fdbcli/CMakeLists.txt index aff53b1f63..c25c335a15 100644 --- a/fdbcli/CMakeLists.txt +++ b/fdbcli/CMakeLists.txt @@ -1,8 +1,16 @@ +include(AddFdbTest) fdb_find_sources(FDBCLI_SRCS) add_flow_target(EXECUTABLE NAME fdbcli SRCS ${FDBCLI_SRCS}) target_include_directories(fdbcli PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include") target_link_libraries(fdbcli PRIVATE fdbclient SimpleOpt) +if (USE_UBSAN) + # The intent is to put typeinfo symbols in the dynamic symbol table so that + # the types in fdbcli and external libfdb_c clients agree for ubsan's vptr + # check. This would not be a good idea for the normal build, or if we ever + # start testing old libfdb_c's that are ubsan-instrumented. + target_link_options(fdbcli PRIVATE "-rdynamic") +endif() if(NOT WIN32) target_link_libraries(fdbcli PRIVATE linenoise) @@ -16,3 +24,38 @@ if(NOT OPEN_FOR_IDE) fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbcli DESTINATION bin COMPONENT clients) endif() endif() + +if (NOT WIN32 AND NOT OPEN_FOR_IDE) + add_dependencies(fdbcli external_client) + + add_fdbclient_test( + NAME single_process_fdbcli_tests + COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/fdbcli_tests.py + ${CMAKE_BINARY_DIR} + @CLUSTER_FILE@ + ) + add_fdbclient_test( + NAME multi_process_fdbcli_tests + PROCESS_NUMBER 5 + COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/fdbcli_tests.py + ${CMAKE_BINARY_DIR} + @CLUSTER_FILE@ + 5 + ) + add_fdbclient_test( + NAME single_process_external_client_fdbcli_tests + COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/fdbcli_tests.py + ${CMAKE_BINARY_DIR} + @CLUSTER_FILE@ + --external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so + ) + add_fdbclient_test( + NAME multi_process_external_client_fdbcli_tests + PROCESS_NUMBER 5 + COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/fdbcli_tests.py + ${CMAKE_BINARY_DIR} + @CLUSTER_FILE@ + 5 + --external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so + ) +endif() diff --git a/fdbcli/ConfigureCommand.actor.cpp b/fdbcli/ConfigureCommand.actor.cpp index 52521ea677..26a3da9876 100644 --- a/fdbcli/ConfigureCommand.actor.cpp +++ b/fdbcli/ConfigureCommand.actor.cpp @@ -44,20 +44,20 @@ ACTOR Future configureCommandActor(Reference db, if (tokens.size() < 2) result = ConfigurationResult::NO_OPTIONS_PROVIDED; else { - if (tokens[startToken] == LiteralStringRef("FORCE")) { + if (tokens[startToken] == "FORCE"_sr) { force = true; startToken = 2; } state Optional conf; - if (tokens[startToken] == LiteralStringRef("auto")) { + if (tokens[startToken] == "auto"_sr) { // get cluster status state Reference tr = db->createTransaction(); if (!tr->isValid()) { StatusObject _s = wait(StatusClient::statusFetcher(localDb)); s = _s; } else { - state ThreadFuture> statusValueF = tr->get(LiteralStringRef("\xff\xff/status/json")); + state ThreadFuture> statusValueF = tr->get("\xff\xff/status/json"_sr); Optional statusValue = wait(safeThreadFutureToFuture(statusValueF)); if (!statusValue.present()) { fprintf(stderr, "ERROR: Failed to get status json from the cluster\n"); @@ -166,7 +166,7 @@ ACTOR Future configureCommandActor(Reference db, case ConfigurationResult::CONFLICTING_OPTIONS: case ConfigurationResult::UNKNOWN_OPTION: case 
ConfigurationResult::INCOMPLETE_CONFIGURATION: - printUsage(LiteralStringRef("configure")); + printUsage("configure"_sr); ret = false; break; case ConfigurationResult::INVALID_CONFIGURATION: @@ -259,7 +259,6 @@ ACTOR Future configureCommandActor(Reference db, fprintf(stderr, "Type `configure perpetual_storage_wiggle=1' to enable the perpetual wiggle, or `configure " "storage_migration_type=gradual' to set the gradual migration type.\n"); - ret = false; break; case ConfigurationResult::SUCCESS_WARN_ROCKSDB_EXPERIMENTAL: printf("Configuration changed\n"); @@ -276,6 +275,10 @@ ACTOR Future configureCommandActor(Reference db, fprintf(stderr, "ERROR: A cluster cannot change its tenant mode while part of a metacluster.\n"); ret = false; break; + case ConfigurationResult::ENCRYPTION_AT_REST_MODE_ALREADY_SET: + fprintf(stderr, "ERROR: A cluster cannot change its encryption_at_rest state after database creation.\n"); + ret = false; + break; default: ASSERT(false); ret = false; @@ -309,6 +312,7 @@ void configureGenerator(const char* text, "storage_migration_type=", "tenant_mode=", "blob_granules_enabled=", + "encryption_at_rest_mode=", nullptr }; arrayGenerator(text, line, opts, lc); } @@ -321,7 +325,8 @@ CommandFactory configureFactory( "commit_proxies=|grv_proxies=|logs=|resolvers=>*|" "count=|perpetual_storage_wiggle=|perpetual_storage_wiggle_locality=" "<:|0>|storage_migration_type={disabled|gradual|aggressive}" - "|tenant_mode={disabled|optional_experimental|required_experimental}|blob_granules_enabled={0|1}", + "|tenant_mode={disabled|optional_experimental|required_experimental}|blob_granules_enabled={0|1}" + "|encryption_at_rest_mode={disabled|aes_256_ctr}", "change the database configuration", "The `new' option, if present, initializes a new database with the given configuration rather than changing " "the configuration of an existing one. When used, both a redundancy mode and a storage engine must be " @@ -355,6 +360,9 @@ CommandFactory configureFactory( "tenant_mode=: Sets the tenant mode for the cluster. If " "optional, then transactions can be run with or without specifying tenants. If required, all data must be " "accessed using tenants.\n\n" + "encryption_at_rest_mode=: Sets the cluster encryption data at-rest support for the " + "database. The configuration can be updated ONLY at the time of database creation and once set can't be " + "updated for the lifetime of the database.\n\n" "See the FoundationDB Administration Guide for more information."), &configureGenerator); diff --git a/fdbcli/ConsistencyCheckCommand.actor.cpp b/fdbcli/ConsistencyCheckCommand.actor.cpp index 2e14e71fcc..1f225d1dfe 100644 --- a/fdbcli/ConsistencyCheckCommand.actor.cpp +++ b/fdbcli/ConsistencyCheckCommand.actor.cpp @@ -30,7 +30,7 @@ namespace fdb_cli { -const KeyRef consistencyCheckSpecialKey = LiteralStringRef("\xff\xff/management/consistency_check_suspended"); +const KeyRef consistencyCheckSpecialKey = "\xff\xff/management/consistency_check_suspended"_sr; ACTOR Future consistencyCheckCommandActor(Reference tr, std::vector tokens, diff --git a/fdbcli/ConsistencyScanCommand.actor.cpp b/fdbcli/ConsistencyScanCommand.actor.cpp new file mode 100644 index 0000000000..532e43119a --- /dev/null +++ b/fdbcli/ConsistencyScanCommand.actor.cpp @@ -0,0 +1,122 @@ +/* + * ConsistencyScanCommand.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbcli/fdbcli.actor.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/IClientApi.h" + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/ThreadHelper.actor.h" +#include "fdbclient/ConsistencyScanInterface.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. + +namespace fdb_cli { + +ACTOR Future consistencyScanCommandActor(Database db, std::vector tokens) { + state Reference tr = makeReference(db); + // Here we do not proceed in a try-catch loop since the transaction is always supposed to succeed. + // If not, the outer loop catch block(fdbcli.actor.cpp) will handle the error and print out the error message + state int usageError = 0; + state ConsistencyScanInfo csInfo = ConsistencyScanInfo(); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + + // Get the exisiting consistencyScanInfo object if present + state Optional consistencyScanInfo = wait(ConsistencyScanInfo::getInfo(tr)); + wait(tr->commit()); + if (consistencyScanInfo.present()) + csInfo = ObjectReader::fromStringRef(consistencyScanInfo.get(), IncludeVersion()); + tr->reset(); + + if (tokens.size() == 1) { + printf("Consistency Scan Info: %s\n", csInfo.toString().c_str()); + } else if ((tokens.size() == 2) && tokencmp(tokens[1], "off")) { + csInfo.consistency_scan_enabled = false; + wait(ConsistencyScanInfo::setInfo(tr, csInfo)); + wait(tr->commit()); + } else if ((tokencmp(tokens[1], "on") && tokens.size() > 2)) { + csInfo.consistency_scan_enabled = true; + state std::vector::iterator t; + for (t = tokens.begin() + 2; t != tokens.end(); ++t) { + if (tokencmp(t->toString(), "restart")) { + if (++t != tokens.end()) { + if (tokencmp(t->toString(), "0")) { + csInfo.restart = false; + } else if (tokencmp(t->toString(), "1")) { + csInfo.restart = true; + } else { + usageError = 1; + } + } else { + usageError = 1; + } + } else if (tokencmp(t->toString(), "maxRate")) { + if (++t != tokens.end()) { + char* end; + csInfo.max_rate = std::strtod(t->toString().data(), &end); + if (!std::isspace(*end) && (*end != '\0')) { + fprintf(stderr, "ERROR: %s failed to parse.\n", t->toString().c_str()); + return false; + } + } else { + usageError = 1; + } + } else if (tokencmp(t->toString(), "targetInterval")) { + if (++t != tokens.end()) { + char* end; + csInfo.target_interval = std::strtod(t->toString().data(), &end); + if (!std::isspace(*end) && (*end != '\0')) { + fprintf(stderr, "ERROR: %s failed to parse.\n", t->toString().c_str()); + return false; + } + } else { + usageError = 1; + } + } else { + usageError = 1; + } + } + + if (!usageError) { + wait(ConsistencyScanInfo::setInfo(tr, csInfo)); + wait(tr->commit()); + } + } else { + usageError = 1; + } + + if (usageError) { + printUsage(tokens[0]); + return false; + } + return true; +} + +CommandFactory consistencyScanFactory( + 
"consistencyscan", + CommandHelp("consistencyscan ", + "enables or disables consistency scan", + "Calling this command with `on' enables the consistency scan process to run the scan with given " + "arguments and `off' will halt the scan. " + "Calling this command with no arguments will display if consistency scan is currently enabled.\n")); + +} // namespace fdb_cli \ No newline at end of file diff --git a/fdbcli/CoordinatorsCommand.actor.cpp b/fdbcli/CoordinatorsCommand.actor.cpp index b68d5ab3d3..4680c5393a 100644 --- a/fdbcli/CoordinatorsCommand.actor.cpp +++ b/fdbcli/CoordinatorsCommand.actor.cpp @@ -64,17 +64,26 @@ ACTOR Future changeCoordinators(Reference db, std::vectorstartsWith(nameTokenBegin)) { + if (tok->startsWith(nameTokenBegin) && new_cluster_description.empty()) { new_cluster_description = tok->substr(nameTokenBegin.size()); + auto next = tok - 1; std::copy(tok + 1, tokens.end(), tok); tokens.resize(tokens.size() - 1); - break; + tok = next; + } else if (tok->startsWith(noConfigDB)) { + disableConfigDB = true; + auto next = tok - 1; + std::copy(tok + 1, tokens.end(), tok); + tokens.resize(tokens.size() - 1); + tok = next; } } - state bool automatic = tokens.size() == 2 && tokens[1] == LiteralStringRef("auto"); + state bool automatic = tokens.size() == 2 && tokens[1] == "auto"_sr; state Reference tr = db->createTransaction(); loop { tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); @@ -83,6 +92,10 @@ ACTOR Future changeCoordinators(Reference db, std::vectorset(fdb_cli::clusterDescriptionSpecialKey, new_cluster_description); } + if (disableConfigDB) { + // All that matters is the key is set. + tr->set(fdb_cli::configDBSpecialKey, ""_sr); + } // if auto change, read the special key to retrieve the recommended config if (automatic) { // if previous read failed, retry, otherwise, use the same recommened config @@ -173,9 +186,10 @@ ACTOR Future changeCoordinators(Reference db, std::vector coordinatorsCommandActor(Reference db, std::vector tokens) { if (tokens.size() < 2) { diff --git a/fdbcli/DataDistributionCommand.actor.cpp b/fdbcli/DataDistributionCommand.actor.cpp index 7000bdf5c7..8b7690b009 100644 --- a/fdbcli/DataDistributionCommand.actor.cpp +++ b/fdbcli/DataDistributionCommand.actor.cpp @@ -108,8 +108,8 @@ Future setDDIgnoreRebalanceOff(Reference db, uint8_t DDIgnoreOp namespace fdb_cli { -const KeyRef ddModeSpecialKey = LiteralStringRef("\xff\xff/management/data_distribution/mode"); -const KeyRef ddIgnoreRebalanceSpecialKey = LiteralStringRef("\xff\xff/management/data_distribution/rebalance_ignored"); +const KeyRef ddModeSpecialKey = "\xff\xff/management/data_distribution/mode"_sr; +const KeyRef ddIgnoreRebalanceSpecialKey = "\xff\xff/management/data_distribution/rebalance_ignored"_sr; constexpr auto usage = "Usage: datadistribution |enable " ">\n"; @@ -127,7 +127,7 @@ ACTOR Future dataDistributionCommandActor(Reference db, std::ve printf("Data distribution is turned off.\n"); } else if (tokencmp(tokens[1], "disable")) { if (tokencmp(tokens[2], "ssfailure")) { - wait(success((setHealthyZone(db, LiteralStringRef("IgnoreSSFailures"), 0)))); + wait(success((setHealthyZone(db, "IgnoreSSFailures"_sr, 0)))); printf("Data distribution is disabled for storage server failures.\n"); } else if (tokencmp(tokens[2], "rebalance")) { wait(setDDIgnoreRebalanceOn(db, DDIgnore::REBALANCE_DISK | DDIgnore::REBALANCE_READ)); diff --git a/fdbcli/ExcludeCommand.actor.cpp b/fdbcli/ExcludeCommand.actor.cpp index db67bd8a6e..7c8b7217e0 100644 --- 
a/fdbcli/ExcludeCommand.actor.cpp +++ b/fdbcli/ExcludeCommand.actor.cpp @@ -227,22 +227,19 @@ ACTOR Future checkForCoordinators(Reference db, std::vector excludeCommandActor(Reference db, std::vector tokens, Future warn) { if (tokens.size() <= 1) { @@ -281,11 +278,11 @@ ACTOR Future excludeCommandActor(Reference db, std::vectorstartsWith(LocalityData::ExcludeLocalityPrefix) && t->toString().find(':') != std::string::npos) { diff --git a/fdbcli/FileConfigureCommand.actor.cpp b/fdbcli/FileConfigureCommand.actor.cpp index e35114c429..8cce2ec543 100644 --- a/fdbcli/FileConfigureCommand.actor.cpp +++ b/fdbcli/FileConfigureCommand.actor.cpp @@ -78,7 +78,7 @@ ACTOR Future fileConfigureCommandActor(Reference db, name + "=" + json_spirit::write_string(json_spirit::mValue(value.get_array()), json_spirit::Output_options::none); } else { - printUsage(LiteralStringRef("fileconfigure")); + printUsage("fileconfigure"_sr); return false; } } diff --git a/fdbcli/IncludeCommand.actor.cpp b/fdbcli/IncludeCommand.actor.cpp index a463772960..be55ac8476 100644 --- a/fdbcli/IncludeCommand.actor.cpp +++ b/fdbcli/IncludeCommand.actor.cpp @@ -92,8 +92,7 @@ ACTOR Future includeServers(Reference db, std::vectorclear(KeyRangeRef(addr.withSuffix(LiteralStringRef(":")), - addr.withSuffix(LiteralStringRef(";")))); + tr->clear(KeyRangeRef(addr.withSuffix(":"_sr), addr.withSuffix(";"_sr))); } } wait(safeThreadFutureToFuture(tr->commit())); @@ -112,9 +111,9 @@ ACTOR Future include(Reference db, std::vector token state bool failed = false; state bool all = false; for (auto t = tokens.begin() + 1; t != tokens.end(); ++t) { - if (*t == LiteralStringRef("all")) { + if (*t == "all"_sr) { all = true; - } else if (*t == LiteralStringRef("failed")) { + } else if (*t == "failed"_sr) { failed = true; } else if (t->startsWith(LocalityData::ExcludeLocalityPrefix) && t->toString().find(':') != std::string::npos) { // if the token starts with 'locality_' prefix. 
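A pattern repeated throughout these fdbcli hunks (and in backup.actor.cpp above) is replacing ``LiteralStringRef("...")`` with the ``"..."_sr`` user-defined literal. Both capture the literal's length at compile time, so keys with embedded NUL bytes are not truncated by a ``strlen``-style conversion. The snippet below is a simplified stand-in to illustrate the mechanism, not FoundationDB's actual definition, which lives in flow and returns a real ``StringRef``.

// Illustrative stand-in only; the real operator is defined in flow and returns StringRef.
#include <cstddef>

struct StringRefSketch { // minimal stand-in for flow's StringRef
	const char* data;
	std::size_t size;
};

constexpr StringRefSketch operator""_sr(const char* str, std::size_t len) {
	// 'len' comes from the literal itself, so a literal with embedded '\0' bytes
	// keeps its full length (a strlen-based conversion would truncate it).
	return StringRefSketch{ str, len };
}

// Usage example, matching the lock key that appears in the next hunk:
static constexpr auto lockKeySketch = "\xff\xff/management/db_locked"_sr;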
diff --git a/fdbcli/LockCommand.actor.cpp b/fdbcli/LockCommand.actor.cpp index 1ed988ee34..a2ac2c05cd 100644 --- a/fdbcli/LockCommand.actor.cpp +++ b/fdbcli/LockCommand.actor.cpp @@ -59,7 +59,7 @@ ACTOR Future lockDatabase(Reference db, UID id) { namespace fdb_cli { -const KeyRef lockSpecialKey = LiteralStringRef("\xff\xff/management/db_locked"); +const KeyRef lockSpecialKey = "\xff\xff/management/db_locked"_sr; ACTOR Future lockCommandActor(Reference db, std::vector tokens) { if (tokens.size() != 1) { diff --git a/fdbcli/MaintenanceCommand.actor.cpp b/fdbcli/MaintenanceCommand.actor.cpp index 487490e09f..b6dd8cc139 100644 --- a/fdbcli/MaintenanceCommand.actor.cpp +++ b/fdbcli/MaintenanceCommand.actor.cpp @@ -69,10 +69,10 @@ ACTOR Future printHealthyZone(Reference db) { namespace fdb_cli { -const KeyRangeRef maintenanceSpecialKeyRange = KeyRangeRef(LiteralStringRef("\xff\xff/management/maintenance/"), - LiteralStringRef("\xff\xff/management/maintenance0")); +const KeyRangeRef maintenanceSpecialKeyRange = + KeyRangeRef("\xff\xff/management/maintenance/"_sr, "\xff\xff/management/maintenance0"_sr); // The special key, if present, means data distribution is disabled for storage failures; -const KeyRef ignoreSSFailureSpecialKey = LiteralStringRef("\xff\xff/management/maintenance/IgnoreSSFailures"); +const KeyRef ignoreSSFailureSpecialKey = "\xff\xff/management/maintenance/IgnoreSSFailures"_sr; // add a zone to maintenance and specify the maintenance duration ACTOR Future setHealthyZone(Reference db, StringRef zoneId, double seconds, bool printWarning) { diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index da7c0f79fd..edb25ace2c 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -291,13 +291,7 @@ ACTOR Future metaclusterStatusCommand(Reference db, std::vector std::map clusters = wait(MetaclusterAPI::listClusters(db, ""_sr, "\xff"_sr, CLIENT_KNOBS->MAX_DATA_CLUSTERS)); - ClusterUsage totalCapacity; - ClusterUsage totalAllocated; - for (auto cluster : clusters) { - totalCapacity.numTenantGroups += - std::max(cluster.second.entry.capacity.numTenantGroups, cluster.second.entry.allocated.numTenantGroups); - totalAllocated.numTenantGroups += cluster.second.entry.allocated.numTenantGroups; - } + auto capacityNumbers = MetaclusterAPI::metaclusterCapacity(clusters); if (useJson) { json_spirit::mObject obj; @@ -305,15 +299,15 @@ ACTOR Future metaclusterStatusCommand(Reference db, std::vector json_spirit::mObject metaclusterObj; metaclusterObj["data_clusters"] = (int)clusters.size(); - metaclusterObj["capacity"] = totalCapacity.toJson(); - metaclusterObj["allocated"] = totalAllocated.toJson(); + metaclusterObj["capacity"] = capacityNumbers.first.toJson(); + metaclusterObj["allocated"] = capacityNumbers.second.toJson(); obj["metacluster"] = metaclusterObj; fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str()); } else { fmt::print(" number of data clusters: {}\n", clusters.size()); - fmt::print(" tenant group capacity: {}\n", totalCapacity.numTenantGroups); - fmt::print(" allocated tenant groups: {}\n", totalAllocated.numTenantGroups); + fmt::print(" tenant group capacity: {}\n", capacityNumbers.first.numTenantGroups); + fmt::print(" allocated tenant groups: {}\n", capacityNumbers.second.numTenantGroups); } return true; diff --git a/fdbcli/ProfileCommand.actor.cpp b/fdbcli/ProfileCommand.actor.cpp index c47a558258..52325d3de8 100644 --- a/fdbcli/ProfileCommand.actor.cpp 
+++ b/fdbcli/ProfileCommand.actor.cpp @@ -115,17 +115,13 @@ ACTOR Future profileCommandActor(Database db, return false; } // Hold the reference to the standalone's memory - state ThreadFuture kvsFuture = - tr->getRange(KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), - LiteralStringRef("\xff\xff/worker_interfaces0")), - CLIENT_KNOBS->TOO_MANY); + state ThreadFuture kvsFuture = tr->getRange( + KeyRangeRef("\xff\xff/worker_interfaces/"_sr, "\xff\xff/worker_interfaces0"_sr), CLIENT_KNOBS->TOO_MANY); RangeResult kvs = wait(safeThreadFutureToFuture(kvsFuture)); ASSERT(!kvs.more); for (const auto& pair : kvs) { - auto ip_port = - (pair.key.endsWith(LiteralStringRef(":tls")) ? pair.key.removeSuffix(LiteralStringRef(":tls")) - : pair.key) - .removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/")); + auto ip_port = (pair.key.endsWith(":tls"_sr) ? pair.key.removeSuffix(":tls"_sr) : pair.key) + .removePrefix("\xff\xff/worker_interfaces/"_sr); printf("%s\n", printable(ip_port).c_str()); } } else { diff --git a/fdbcli/QuotaCommand.actor.cpp b/fdbcli/QuotaCommand.actor.cpp index ba8546fa15..e6a86e9b51 100644 --- a/fdbcli/QuotaCommand.actor.cpp +++ b/fdbcli/QuotaCommand.actor.cpp @@ -25,8 +25,6 @@ namespace { enum class LimitType { RESERVED, TOTAL }; -enum class OpType { READ, WRITE }; - Optional parseTag(StringRef token) { if (token.size() > CLIENT_KNOBS->MAX_TRANSACTION_TAG_LENGTH) { return {}; @@ -36,25 +34,15 @@ Optional parseTag(StringRef token) { } Optional parseLimitType(StringRef token) { - if (token == "reserved"_sr) { + if (token == "reserved_throughput"_sr) { return LimitType::RESERVED; - } else if (token == "total"_sr) { + } else if (token == "total_throughput"_sr) { return LimitType::TOTAL; } else { return {}; } } -Optional parseOpType(StringRef token) { - if (token == "read"_sr) { - return OpType::READ; - } else if (token == "write"_sr) { - return OpType::WRITE; - } else { - return {}; - } -} - Optional parseLimitValue(StringRef token) { try { return std::stod(token.toString()); @@ -63,7 +51,7 @@ Optional parseLimitValue(StringRef token) { } } -ACTOR Future getQuota(Reference db, TransactionTag tag, LimitType limitType, OpType opType) { +ACTOR Future getQuota(Reference db, TransactionTag tag, LimitType limitType) { state Reference tr = db->createTransaction(); loop { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); @@ -74,14 +62,10 @@ ACTOR Future getQuota(Reference db, TransactionTag tag, LimitTy fmt::print("\n"); } else { auto const quota = ThrottleApi::TagQuotaValue::fromValue(v.get()); - if (limitType == LimitType::TOTAL && opType == OpType::READ) { - fmt::print("{}\n", quota.totalReadQuota); - } else if (limitType == LimitType::TOTAL && opType == OpType::WRITE) { - fmt::print("{}\n", quota.totalWriteQuota); - } else if (limitType == LimitType::RESERVED && opType == OpType::READ) { - fmt::print("{}\n", quota.reservedReadQuota); - } else if (limitType == LimitType::RESERVED && opType == OpType::WRITE) { - fmt::print("{}\n", quota.reservedWriteQuota); + if (limitType == LimitType::TOTAL) { + fmt::print("{}\n", quota.totalQuota * CLIENT_KNOBS->READ_COST_BYTE_FACTOR); + } else if (limitType == LimitType::RESERVED) { + fmt::print("{}\n", quota.reservedQuota * CLIENT_KNOBS->READ_COST_BYTE_FACTOR); } } return Void(); @@ -91,11 +75,7 @@ ACTOR Future getQuota(Reference db, TransactionTag tag, LimitTy } } -ACTOR Future setQuota(Reference db, - TransactionTag tag, - LimitType limitType, - OpType opType, - double value) { +ACTOR Future setQuota(Reference db, TransactionTag 
tag, LimitType limitType, double value) { state Reference tr = db->createTransaction(); state Key key = tag.withPrefix(tagQuotaPrefix); loop { @@ -107,21 +87,14 @@ ACTOR Future setQuota(Reference db, if (v.present()) { quota = ThrottleApi::TagQuotaValue::fromValue(v.get()); } - if (limitType == LimitType::TOTAL && opType == OpType::READ) { - quota.totalReadQuota = value; - } else if (limitType == LimitType::TOTAL && opType == OpType::WRITE) { - quota.totalWriteQuota = value; - } else if (limitType == LimitType::RESERVED && opType == OpType::READ) { - quota.reservedReadQuota = value; - } else if (limitType == LimitType::RESERVED && opType == OpType::WRITE) { - quota.reservedWriteQuota = value; + // Internally, costs are stored in terms of pages, but in the API, + // costs are specified in terms of bytes + if (limitType == LimitType::TOTAL) { + quota.totalQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1; + } else if (limitType == LimitType::RESERVED) { + quota.reservedQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1; } - ThrottleApi::setTagQuota(tr, - tag, - quota.reservedReadQuota, - quota.totalReadQuota, - quota.reservedWriteQuota, - quota.totalWriteQuota); + ThrottleApi::setTagQuota(tr, tag, quota.reservedQuota, quota.totalQuota); wait(safeThreadFutureToFuture(tr->commit())); return Void(); } catch (Error& e) { @@ -130,8 +103,8 @@ ACTOR Future setQuota(Reference db, } } -constexpr auto usage = - "quota [get [reserved|total] [read|write]|set [reserved|total] [read|write] ]"; +constexpr auto usage = "quota [get [reserved_throughput|total_throughput] | set " + "[reserved_throughput|total_throughput] ]"; bool exitFailure() { fmt::print(usage); @@ -149,25 +122,24 @@ ACTOR Future quotaCommandActor(Reference db, std::vector setProcessClass(Reference db, KeyRef network_addre namespace fdb_cli { const KeyRangeRef processClassSourceSpecialKeyRange = - KeyRangeRef(LiteralStringRef("\xff\xff/configuration/process/class_source/"), - LiteralStringRef("\xff\xff/configuration/process/class_source0")); + KeyRangeRef("\xff\xff/configuration/process/class_source/"_sr, "\xff\xff/configuration/process/class_source0"_sr); const KeyRangeRef processClassTypeSpecialKeyRange = - KeyRangeRef(LiteralStringRef("\xff\xff/configuration/process/class_type/"), - LiteralStringRef("\xff\xff/configuration/process/class_type0")); + KeyRangeRef("\xff\xff/configuration/process/class_type/"_sr, "\xff\xff/configuration/process/class_type0"_sr); ACTOR Future setClassCommandActor(Reference db, std::vector tokens) { if (tokens.size() != 3 && tokens.size() != 1) { diff --git a/fdbcli/SnapshotCommand.actor.cpp b/fdbcli/SnapshotCommand.actor.cpp index 5bc7302f0c..7606101bba 100644 --- a/fdbcli/SnapshotCommand.actor.cpp +++ b/fdbcli/SnapshotCommand.actor.cpp @@ -40,7 +40,7 @@ ACTOR Future snapshotCommandActor(Reference db, std::vector statusCommandActor(Reference db, StatusObject _s = wait(StatusClient::statusFetcher(localDb)); s = _s; } else { - state ThreadFuture> statusValueF = tr->get(LiteralStringRef("\xff\xff/status/json")); + state ThreadFuture> statusValueF = tr->get("\xff\xff/status/json"_sr); Optional statusValue = wait(safeThreadFutureToFuture(statusValueF)); if (!statusValue.present()) { fprintf(stderr, "ERROR: Failed to get status json from the cluster\n"); diff --git a/fdbcli/TenantCommands.actor.cpp b/fdbcli/TenantCommands.actor.cpp index 05b12187f3..e2be6fac56 100644 --- a/fdbcli/TenantCommands.actor.cpp +++ b/fdbcli/TenantCommands.actor.cpp @@ -36,24 +36,12 @@ namespace fdb_cli { -const 
KeyRangeRef tenantMapSpecialKeyRange720("\xff\xff/management/tenant/map/"_sr, - "\xff\xff/management/tenant/map0"_sr); +const KeyRangeRef tenantMapSpecialKeyRange("\xff\xff/management/tenant/map/"_sr, "\xff\xff/management/tenant/map0"_sr); const KeyRangeRef tenantConfigSpecialKeyRange("\xff\xff/management/tenant/configure/"_sr, "\xff\xff/management/tenant/configure0"_sr); const KeyRangeRef tenantRenameSpecialKeyRange("\xff\xff/management/tenant/rename/"_sr, "\xff\xff/management/tenant/rename0"_sr); -const KeyRangeRef tenantMapSpecialKeyRange710("\xff\xff/management/tenant_map/"_sr, - "\xff\xff/management/tenant_map0"_sr); - -KeyRangeRef const& tenantMapSpecialKeyRange(int apiVersion) { - if (apiVersion >= 720) { - return tenantMapSpecialKeyRange720; - } else { - return tenantMapSpecialKeyRange710; - } -} - Optional, Optional>> parseTenantConfiguration(std::vector const& tokens, int startIndex, bool allowUnset) { std::map, Optional> configParams; @@ -88,6 +76,8 @@ parseTenantConfiguration(std::vector const& tokens, int startIndex, b if (tokencmp(param, "tenant_group")) { configParams[param] = value; + } else if (tokencmp(param, "assigned_cluster")) { + configParams[param] = value; } else { fmt::print(stderr, "ERROR: unrecognized configuration parameter `{}'.\n", param.toString().c_str()); return {}; @@ -105,6 +95,10 @@ void applyConfigurationToSpecialKeys(Reference tr, TenantNameRef tenantName, std::map, Optional> configuration) { for (auto [configName, value] : configuration) { + if (configName == "assigned_cluster"_sr) { + fmt::print(stderr, "ERROR: assigned_cluster is only valid in metacluster configuration.\n"); + throw invalid_tenant_configuration(); + } if (value.present()) { tr->set(makeConfigKey(tenantName, configName), value.get()); } else { @@ -113,29 +107,28 @@ void applyConfigurationToSpecialKeys(Reference tr, } } -// createtenant command -ACTOR Future createTenantCommandActor(Reference db, std::vector tokens, int apiVersion) { - if (tokens.size() < 2 || tokens.size() > 3) { - printUsage(tokens[0]); +// tenant create command +ACTOR Future tenantCreateCommand(Reference db, std::vector tokens) { + if (tokens.size() < 3 || tokens.size() > 5) { + fmt::print("Usage: tenant create [tenant_group=] [assigned_cluster=]\n\n"); + fmt::print("Creates a new tenant in the cluster with the specified name.\n"); + fmt::print("An optional group can be specified that will require this tenant\n"); + fmt::print("to be placed on the same cluster as other tenants in the same group.\n"); + fmt::print("An optional cluster name can be specified that this tenant will be placed in.\n"); return false; } - state Key tenantNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[1]); + state Key tenantNameKey = tenantMapSpecialKeyRange.begin.withSuffix(tokens[2]); state Reference tr = db->createTransaction(); state bool doneExistenceCheck = false; state Optional, Optional>> configuration = - parseTenantConfiguration(tokens, 2, false); + parseTenantConfiguration(tokens, 3, false); if (!configuration.present()) { return false; } - if (apiVersion < 720 && !configuration.get().empty()) { - fmt::print(stderr, "ERROR: tenants do not accept configuration options before API version 720.\n"); - return false; - } - loop { try { tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); @@ -146,7 +139,7 @@ ACTOR Future createTenantCommandActor(Reference db, std::vector for (auto const& [name, value] : configuration.get()) { tenantEntry.configure(name, value); } - 
wait(MetaclusterAPI::createTenant(db, tokens[1], tenantEntry)); + wait(MetaclusterAPI::createTenant(db, tokens[2], tenantEntry)); } else { if (!doneExistenceCheck) { // Hold the reference to the standalone's memory @@ -159,7 +152,7 @@ ACTOR Future createTenantCommandActor(Reference db, std::vector } tr->set(tenantNameKey, ValueRef()); - applyConfigurationToSpecialKeys(tr, tokens[1], configuration.get()); + applyConfigurationToSpecialKeys(tr, tokens[2], configuration.get()); wait(safeThreadFutureToFuture(tr->commit())); } @@ -175,25 +168,20 @@ ACTOR Future createTenantCommandActor(Reference db, std::vector } } - fmt::print("The tenant `{}' has been created\n", printable(tokens[1]).c_str()); + fmt::print("The tenant `{}' has been created\n", printable(tokens[2]).c_str()); return true; } -CommandFactory createTenantFactory( - "createtenant", - CommandHelp("createtenant [tenant_group=]", - "creates a new tenant in the cluster", - "Creates a new tenant in the cluster with the specified name. An optional group can be specified" - "that will require this tenant to be placed on the same cluster as other tenants in the same group.")); - -// deletetenant command -ACTOR Future deleteTenantCommandActor(Reference db, std::vector tokens, int apiVersion) { - if (tokens.size() != 2) { - printUsage(tokens[0]); +// tenant delete command +ACTOR Future tenantDeleteCommand(Reference db, std::vector tokens) { + if (tokens.size() != 3) { + fmt::print("Usage: tenant delete \n\n"); + fmt::print("Deletes a tenant from the cluster.\n"); + fmt::print("Deletion will be allowed only if the specified tenant contains no data.\n"); return false; } - state Key tenantNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[1]); + state Key tenantNameKey = tenantMapSpecialKeyRange.begin.withSuffix(tokens[2]); state Reference tr = db->createTransaction(); state bool doneExistenceCheck = false; @@ -203,7 +191,7 @@ ACTOR Future deleteTenantCommandActor(Reference db, std::vector tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { - wait(MetaclusterAPI::deleteTenant(db, tokens[1])); + wait(MetaclusterAPI::deleteTenant(db, tokens[2])); } else { if (!doneExistenceCheck) { // Hold the reference to the standalone's memory @@ -231,21 +219,17 @@ ACTOR Future deleteTenantCommandActor(Reference db, std::vector } } - fmt::print("The tenant `{}' has been deleted\n", printable(tokens[1]).c_str()); + fmt::print("The tenant `{}' has been deleted\n", printable(tokens[2]).c_str()); return true; } -CommandFactory deleteTenantFactory( - "deletetenant", - CommandHelp( - "deletetenant ", - "deletes a tenant from the cluster", - "Deletes a tenant from the cluster. 
Deletion will be allowed only if the specified tenant contains no data.")); - -// listtenants command -ACTOR Future listTenantsCommandActor(Reference db, std::vector tokens, int apiVersion) { - if (tokens.size() > 4) { - printUsage(tokens[0]); +// tenant list command +ACTOR Future tenantListCommand(Reference db, std::vector tokens) { + if (tokens.size() > 5) { + fmt::print("Usage: tenant list [BEGIN] [END] [LIMIT]\n\n"); + fmt::print("Lists the tenants in a cluster.\n"); + fmt::print("Only tenants in the range BEGIN - END will be printed.\n"); + fmt::print("An optional LIMIT can be specified to limit the number of results (default 100).\n"); return false; } @@ -253,33 +237,33 @@ ACTOR Future listTenantsCommandActor(Reference db, std::vector< state StringRef endTenant = "\xff\xff"_sr; state int limit = 100; - if (tokens.size() >= 2) { - beginTenant = tokens[1]; - } if (tokens.size() >= 3) { - endTenant = tokens[2]; + beginTenant = tokens[2]; + } + if (tokens.size() >= 4) { + endTenant = tokens[3]; if (endTenant <= beginTenant) { fmt::print(stderr, "ERROR: end must be larger than begin"); return false; } } - if (tokens.size() == 4) { + if (tokens.size() == 5) { int n = 0; - if (sscanf(tokens[3].toString().c_str(), "%d%n", &limit, &n) != 1 || n != tokens[3].size() || limit <= 0) { - fmt::print(stderr, "ERROR: invalid limit `{}'\n", tokens[3].toString().c_str()); + if (sscanf(tokens[4].toString().c_str(), "%d%n", &limit, &n) != 1 || n != tokens[4].size() || limit <= 0) { + fmt::print(stderr, "ERROR: invalid limit `{}'\n", tokens[4].toString().c_str()); return false; } } - state Key beginTenantKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(beginTenant); - state Key endTenantKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(endTenant); + state Key beginTenantKey = tenantMapSpecialKeyRange.begin.withSuffix(beginTenant); + state Key endTenantKey = tenantMapSpecialKeyRange.begin.withSuffix(endTenant); state Reference tr = db->createTransaction(); loop { try { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); - state std::vector tenantNames; + state std::vector tenantNames; if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { std::vector> tenants = wait(MetaclusterAPI::listTenantsTransaction(tr, beginTenant, endTenant, limit)); @@ -292,12 +276,12 @@ ACTOR Future listTenantsCommandActor(Reference db, std::vector< tr->getRange(firstGreaterOrEqual(beginTenantKey), firstGreaterOrEqual(endTenantKey), limit); RangeResult tenants = wait(safeThreadFutureToFuture(kvsFuture)); for (auto tenant : tenants) { - tenantNames.push_back(tenant.key.removePrefix(tenantMapSpecialKeyRange(apiVersion).begin)); + tenantNames.push_back(tenant.key.removePrefix(tenantMapSpecialKeyRange.begin)); } } if (tenantNames.empty()) { - if (tokens.size() == 1) { + if (tokens.size() == 2) { fmt::print("The cluster has no tenants\n"); } else { fmt::print("The cluster has no tenants in the specified range\n"); @@ -322,22 +306,17 @@ ACTOR Future listTenantsCommandActor(Reference db, std::vector< } } -CommandFactory listTenantsFactory( - "listtenants", - CommandHelp("listtenants [BEGIN] [END] [LIMIT]", - "print a list of tenants in the cluster", - "Print a list of tenants in the cluster. Only tenants in the range [BEGIN] - [END] will be printed. 
" - "The number of tenants to print can be specified using the [LIMIT] parameter, which defaults to 100.")); - -// gettenant command -ACTOR Future getTenantCommandActor(Reference db, std::vector tokens, int apiVersion) { - if (tokens.size() < 2 || tokens.size() > 3 || (tokens.size() == 3 && tokens[2] != "JSON"_sr)) { - printUsage(tokens[0]); +// tenant get command +ACTOR Future tenantGetCommand(Reference db, std::vector tokens) { + if (tokens.size() < 3 || tokens.size() > 4 || (tokens.size() == 4 && tokens[3] != "JSON"_sr)) { + fmt::print("Usage: tenant get [JSON]\n\n"); + fmt::print("Prints metadata associated with the given tenant.\n"); + fmt::print("If JSON is specified, then the output will be in JSON format.\n"); return false; } - state bool useJson = tokens.size() == 3; - state Key tenantNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[1]); + state bool useJson = tokens.size() == 4; + state Key tenantNameKey = tenantMapSpecialKeyRange.begin.withSuffix(tokens[2]); state Reference tr = db->createTransaction(); loop { @@ -346,8 +325,8 @@ ACTOR Future getTenantCommandActor(Reference db, std::vector> tenantFuture = tr->get(tenantNameKey); @@ -378,12 +357,7 @@ ACTOR Future getTenantCommandActor(Reference db, std::vector= 720) { - doc.get("prefix.printable", prefix); - } else { - doc.get("prefix", prefix); - } + doc.get("prefix.printable", prefix); doc.get("tenant_state", tenantState); bool hasTenantGroup = doc.tryGet("tenant_group.printable", tenantGroup); @@ -431,21 +405,19 @@ ACTOR Future getTenantCommandActor(Reference db, std::vector [JSON]", - "prints the metadata for a tenant", - "Prints the metadata for a tenant. If JSON is specified, then the output will be in JSON format.")); - -// configuretenant command -ACTOR Future configureTenantCommandActor(Reference db, std::vector tokens) { - if (tokens.size() < 3) { - printUsage(tokens[0]); +// tenant configure command +ACTOR Future tenantConfigureCommand(Reference db, std::vector tokens) { + if (tokens.size() < 4) { + fmt::print("Usage: tenant configure <[unset] tenant_group[=]> ...\n\n"); + fmt::print("Updates the configuration for a tenant.\n"); + fmt::print("Use `tenant_group=' to change the tenant group that a\n"); + fmt::print("tenant is assigned to or `unset tenant_group' to remove a tenant from\n"); + fmt::print("its tenant group."); return false; } state Optional, Optional>> configuration = - parseTenantConfiguration(tokens, 2, true); + parseTenantConfiguration(tokens, 3, true); if (!configuration.present()) { return false; @@ -460,9 +432,9 @@ ACTOR Future configureTenantCommandActor(Reference db, std::vec ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { TenantMapEntry tenantEntry; - wait(MetaclusterAPI::configureTenant(db, tokens[1], configuration.get())); + wait(MetaclusterAPI::configureTenant(db, tokens[2], configuration.get())); } else { - applyConfigurationToSpecialKeys(tr, tokens[1], configuration.get()); + applyConfigurationToSpecialKeys(tr, tokens[2], configuration.get()); wait(safeThreadFutureToFuture(tr->commit())); } break; @@ -477,17 +449,10 @@ ACTOR Future configureTenantCommandActor(Reference db, std::vec } } - fmt::print("The configuration for tenant `{}' has been updated\n", printable(tokens[1]).c_str()); + fmt::print("The configuration for tenant `{}' has been updated\n", printable(tokens[2]).c_str()); return true; } -CommandFactory configureTenantFactory( - "configuretenant", - CommandHelp("configuretenant <[unset] 
tenant_group[=]> ...", - "updates the configuration for a tenant", - "Updates the configuration for a tenant. Use `tenant_group=' to change the tenant group " - "that a tenant is assigned to or `unset tenant_group' to remove a tenant from its tenant group.")); - // Helper function to extract tenant ID from json metadata string int64_t getTenantId(Value metadata) { json_spirit::mValue jsonObject; @@ -498,61 +463,69 @@ int64_t getTenantId(Value metadata) { return id; } -// renametenant command -ACTOR Future renameTenantCommandActor(Reference db, std::vector tokens, int apiVersion) { - if (tokens.size() != 3) { - printUsage(tokens[0]); +// tenant rename command +ACTOR Future tenantRenameCommand(Reference db, std::vector tokens) { + if (tokens.size() != 4) { + fmt::print("Usage: tenant rename \n\n"); + fmt::print("Renames a tenant in the cluster. The old name must exist and the new\n"); + fmt::print("name must not exist in the cluster.\n"); return false; } state Reference tr = db->createTransaction(); - state Key tenantRenameKey = tenantRenameSpecialKeyRange.begin.withSuffix(tokens[1]); - state Key tenantOldNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[1]); - state Key tenantNewNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[2]); + state Key tenantRenameKey = tenantRenameSpecialKeyRange.begin.withSuffix(tokens[2]); + state Key tenantOldNameKey = tenantMapSpecialKeyRange.begin.withSuffix(tokens[2]); + state Key tenantNewNameKey = tenantMapSpecialKeyRange.begin.withSuffix(tokens[3]); state bool firstTry = true; - state int64_t id; + state int64_t id = -1; loop { - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { - // Hold the reference to the standalone's memory - state ThreadFuture> oldEntryFuture = tr->get(tenantOldNameKey); - state ThreadFuture> newEntryFuture = tr->get(tenantNewNameKey); - state Optional oldEntry = wait(safeThreadFutureToFuture(oldEntryFuture)); - state Optional newEntry = wait(safeThreadFutureToFuture(newEntryFuture)); - if (firstTry) { - if (!oldEntry.present()) { - throw tenant_not_found(); - } - if (newEntry.present()) { - throw tenant_already_exists(); - } - // Store the id we see when first reading this key - id = getTenantId(oldEntry.get()); - - firstTry = false; + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + wait(MetaclusterAPI::renameTenant(db, tokens[2], tokens[3])); } else { - // If we got commit_unknown_result, the rename may have already occurred. 
- if (newEntry.present()) { - int64_t checkId = getTenantId(newEntry.get()); - if (id == checkId) { - ASSERT(!oldEntry.present() || getTenantId(oldEntry.get()) != id); - return true; + // Hold the reference to the standalone's memory + state ThreadFuture> oldEntryFuture = tr->get(tenantOldNameKey); + state ThreadFuture> newEntryFuture = tr->get(tenantNewNameKey); + state Optional oldEntry = wait(safeThreadFutureToFuture(oldEntryFuture)); + state Optional newEntry = wait(safeThreadFutureToFuture(newEntryFuture)); + if (firstTry) { + if (!oldEntry.present()) { + throw tenant_not_found(); + } + if (newEntry.present()) { + throw tenant_already_exists(); + } + // Store the id we see when first reading this key + id = getTenantId(oldEntry.get()); + + firstTry = false; + } else { + // If we got commit_unknown_result, the rename may have already occurred. + if (newEntry.present()) { + int64_t checkId = getTenantId(newEntry.get()); + if (id == checkId) { + ASSERT(!oldEntry.present() || getTenantId(oldEntry.get()) != id); + return true; + } + // If the new entry is present but does not match, then + // the rename should fail, so we throw an error. + throw tenant_already_exists(); + } + if (!oldEntry.present()) { + throw tenant_not_found(); + } + int64_t checkId = getTenantId(oldEntry.get()); + // If the id has changed since we made our first attempt, + // then it's possible we've already moved the tenant. Don't move it again. + if (id != checkId) { + throw tenant_not_found(); } - // If the new entry is present but does not match, then - // the rename should fail, so we throw an error. - throw tenant_already_exists(); - } - if (!oldEntry.present()) { - throw tenant_not_found(); - } - int64_t checkId = getTenantId(oldEntry.get()); - // If the id has changed since we made our first attempt, - // then it's possible we've already moved the tenant. Don't move it again. - if (id != checkId) { - throw tenant_not_found(); } + tr->set(tenantRenameKey, tokens[3]); + wait(safeThreadFutureToFuture(tr->commit())); } - tr->set(tenantRenameKey, tokens[2]); - wait(safeThreadFutureToFuture(tr->commit())); break; } catch (Error& e) { state Error err(e); @@ -566,14 +539,120 @@ ACTOR Future renameTenantCommandActor(Reference db, std::vector } fmt::print( - "The tenant `{}' has been renamed to `{}'\n", printable(tokens[1]).c_str(), printable(tokens[2]).c_str()); + "The tenant `{}' has been renamed to `{}'\n", printable(tokens[2]).c_str(), printable(tokens[3]).c_str()); return true; } -CommandFactory renameTenantFactory( - "renametenant", - CommandHelp( - "renametenant ", - "renames a tenant in the cluster", - "Renames a tenant in the cluster. 
The old name must exist and the new name must not exist in the cluster.")); +// tenant command +Future tenantCommand(Reference db, std::vector tokens) { + if (tokens.size() == 1) { + printUsage(tokens[0]); + return true; + } else if (tokencmp(tokens[1], "create")) { + return tenantCreateCommand(db, tokens); + } else if (tokencmp(tokens[1], "delete")) { + return tenantDeleteCommand(db, tokens); + } else if (tokencmp(tokens[1], "list")) { + return tenantListCommand(db, tokens); + } else if (tokencmp(tokens[1], "get")) { + return tenantGetCommand(db, tokens); + } else if (tokencmp(tokens[1], "configure")) { + return tenantConfigureCommand(db, tokens); + } else if (tokencmp(tokens[1], "rename")) { + return tenantRenameCommand(db, tokens); + } else { + printUsage(tokens[0]); + return true; + } +} + +Future tenantCommandForwarder(Reference db, std::vector tokens) { + ASSERT(!tokens.empty() && (tokens[0].endsWith("tenant"_sr) || tokens[0].endsWith("tenants"_sr))); + std::vector forwardedTokens = { "tenant"_sr, + tokens[0].endsWith("tenant"_sr) ? tokens[0].removeSuffix("tenant"_sr) + : tokens[0].removeSuffix("tenants"_sr) }; + for (int i = 1; i < tokens.size(); ++i) { + forwardedTokens.push_back(tokens[i]); + } + + return tenantCommand(db, forwardedTokens); +} // namespace fdb_cli + +void tenantGenerator(const char* text, + const char* line, + std::vector& lc, + std::vector const& tokens) { + if (tokens.size() == 1) { + const char* opts[] = { "create", "delete", "list", "get", "configure", "rename", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() == 3 && tokencmp(tokens[1], "create")) { + const char* opts[] = { "tenant_group=", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() == 3 && tokencmp(tokens[1], "get")) { + const char* opts[] = { "JSON", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokencmp(tokens[1], "configure")) { + if (tokens.size() == 3) { + const char* opts[] = { "tenant_group=", "unset", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() == 4 && tokencmp(tokens[3], "unset")) { + const char* opts[] = { "tenant_group", nullptr }; + arrayGenerator(text, line, opts, lc); + } + } +} + +std::vector tenantHintGenerator(std::vector const& tokens, bool inArgument) { + if (tokens.size() == 1) { + return { "", "[ARGS]" }; + } else if (tokencmp(tokens[1], "create") && tokens.size() < 5) { + static std::vector opts = { "", + "[tenant_group=]", + "[assigned_cluster=]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "delete") && tokens.size() < 3) { + static std::vector opts = { "" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "list") && tokens.size() < 5) { + static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "get") && tokens.size() < 4) { + static std::vector opts = { "", "[JSON]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "configure")) { + if (tokens.size() < 4) { + static std::vector opts = { "", "<[unset] tenant_group[=]>" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokens.size() == 4 && tokencmp(tokens[3], "unset")) { + static std::vector opts = { "]>" }; + return std::vector(opts.begin() + tokens.size() - 4, opts.end()); + } + return {}; + } else if 
(tokencmp(tokens[1], "rename") && tokens.size() < 4) { + static std::vector opts = { "", "" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else { + return {}; + } +} + +CommandFactory tenantRegisterFactory("tenant", + CommandHelp("tenant [ARGS]", + "view and manage tenants in a cluster or metacluster", + "`create' and `delete' add and remove tenants from the cluster.\n" + "`list' prints a list of tenants in the cluster.\n" + "`get' prints the metadata for a particular tenant.\n" + "`configure' modifies the configuration for a tenant.\n" + "`rename' changes the name of a tenant.\n"), + &tenantGenerator, + &tenantHintGenerator); + +// Generate hidden commands for the old versions of the tenant commands +CommandFactory createTenantFactory("createtenant"); +CommandFactory deleteTenantFactory("deletetenant"); +CommandFactory listTenantsFactory("listtenants"); +CommandFactory getTenantFactory("gettenant"); +CommandFactory configureTenantFactory("configuretenant"); +CommandFactory renameTenantFactory("renametenant"); + } // namespace fdb_cli diff --git a/fdbcli/TenantGroupCommands.actor.cpp b/fdbcli/TenantGroupCommands.actor.cpp new file mode 100644 index 0000000000..6a89360aeb --- /dev/null +++ b/fdbcli/TenantGroupCommands.actor.cpp @@ -0,0 +1,240 @@ +/* + * TenantGroupCommands.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbcli/fdbcli.actor.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/GenericManagementAPI.actor.h" +#include "fdbclient/IClientApi.h" +#include "fdbclient/Knobs.h" +#include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/MetaclusterManagement.actor.h" +#include "fdbclient/TenantManagement.actor.h" +#include "fdbclient/Schemas.h" + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/ThreadHelper.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +namespace fdb_cli { + +// tenantgroup list command +ACTOR Future tenantGroupListCommand(Reference db, std::vector tokens) { + if (tokens.size() > 5) { + fmt::print("Usage: tenantgroup list [BEGIN] [END] [LIMIT]\n\n"); + fmt::print("Lists the tenant groups in a cluster.\n"); + fmt::print("Only tenant groups in the range BEGIN - END will be printed.\n"); + fmt::print("An optional LIMIT can be specified to limit the number of results (default 100).\n"); + return false; + } + + state StringRef beginTenantGroup = ""_sr; + state StringRef endTenantGroup = "\xff\xff"_sr; + state int limit = 100; + + if (tokens.size() >= 3) { + beginTenantGroup = tokens[2]; + } + if (tokens.size() >= 4) { + endTenantGroup = tokens[3]; + if (endTenantGroup <= beginTenantGroup) { + fmt::print(stderr, "ERROR: end must be larger than begin"); + return false; + } + } + if (tokens.size() == 5) { + int n = 0; + if (sscanf(tokens[4].toString().c_str(), "%d%n", &limit, &n) != 1 || n != tokens[4].size() || limit <= 0) { + fmt::print(stderr, "ERROR: invalid limit `{}'\n", tokens[4].toString()); + return false; + } + } + + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + state std::vector tenantGroupNames; + state std::vector> tenantGroups; + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + wait(store(tenantGroups, + MetaclusterAPI::listTenantGroupsTransaction(tr, beginTenantGroup, endTenantGroup, limit))); + } else { + wait(store(tenantGroups, + TenantAPI::listTenantGroupsTransaction(tr, beginTenantGroup, endTenantGroup, limit))); + } + + if (tenantGroups.empty()) { + if (tokens.size() == 2) { + fmt::print("The cluster has no tenant groups\n"); + } else { + fmt::print("The cluster has no tenant groups in the specified range\n"); + } + } + + int index = 0; + for (auto tenantGroup : tenantGroups) { + fmt::print(" {}. 
{}\n", ++index, printable(tenantGroup.first)); + } + + return true; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +// tenantgroup get command +ACTOR Future tenantGroupGetCommand(Reference db, std::vector tokens) { + if (tokens.size() > 4 || (tokens.size() == 4 && tokens[3] != "JSON"_sr)) { + fmt::print("Usage: tenantgroup get [JSON]\n\n"); + fmt::print("Prints metadata associated with the given tenant group.\n"); + fmt::print("If JSON is specified, then the output will be in JSON format.\n"); + return false; + } + + state bool useJson = tokens.size() == 4; + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + state std::string tenantJson; + state Optional entry; + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + wait(store(entry, MetaclusterAPI::tryGetTenantGroupTransaction(tr, tokens[2]))); + } else { + wait(store(entry, TenantAPI::tryGetTenantGroupTransaction(tr, tokens[2]))); + Optional metaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + + // We don't store assigned clusters in the tenant group entry on data clusters, so we can instead + // populate it from the metacluster registration + if (entry.present() && metaclusterRegistration.present() && + metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA && + !entry.get().assignedCluster.present()) { + entry.get().assignedCluster = metaclusterRegistration.get().name; + } + } + + if (!entry.present()) { + throw tenant_not_found(); + } + + if (useJson) { + json_spirit::mObject resultObj; + resultObj["tenant_group"] = entry.get().toJson(); + resultObj["type"] = "success"; + fmt::print("{}\n", + json_spirit::write_string(json_spirit::mValue(resultObj), json_spirit::pretty_print)); + } else { + if (entry.get().assignedCluster.present()) { + fmt::print(" assigned cluster: {}\n", printable(entry.get().assignedCluster)); + } else { + // This is a placeholder output for when a tenant group is read in a non-metacluster, where + // it currently has no metadata. When metadata is eventually added, we can print that instead. 
+ fmt::print("The tenant group is present in the cluster\n"); + } + } + return true; + } catch (Error& e) { + try { + wait(safeThreadFutureToFuture(tr->onError(e))); + } catch (Error& finalErr) { + state std::string errorStr; + if (finalErr.code() == error_code_tenant_not_found) { + errorStr = "tenant group not found"; + } else if (useJson) { + errorStr = finalErr.what(); + } else { + throw finalErr; + } + + if (useJson) { + json_spirit::mObject resultObj; + resultObj["type"] = "error"; + resultObj["error"] = errorStr; + fmt::print("{}\n", + json_spirit::write_string(json_spirit::mValue(resultObj), json_spirit::pretty_print)); + } else { + fmt::print(stderr, "ERROR: {}\n", errorStr); + } + + return false; + } + } + } +} + +// tenantgroup command +Future tenantGroupCommand(Reference db, std::vector tokens) { + if (tokens.size() == 1) { + printUsage(tokens[0]); + return true; + } else if (tokencmp(tokens[1], "list")) { + return tenantGroupListCommand(db, tokens); + } else if (tokencmp(tokens[1], "get")) { + return tenantGroupGetCommand(db, tokens); + } else { + printUsage(tokens[0]); + return true; + } +} + +void tenantGroupGenerator(const char* text, + const char* line, + std::vector& lc, + std::vector const& tokens) { + if (tokens.size() == 1) { + const char* opts[] = { "list", "get", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() == 3 && tokencmp(tokens[1], "get")) { + const char* opts[] = { "JSON", nullptr }; + arrayGenerator(text, line, opts, lc); + } +} + +std::vector tenantGroupHintGenerator(std::vector const& tokens, bool inArgument) { + if (tokens.size() == 1) { + return { "", "[ARGS]" }; + } else if (tokencmp(tokens[1], "list") && tokens.size() < 5) { + static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "get") && tokens.size() < 4) { + static std::vector opts = { "", "[JSON]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else { + return {}; + } +} + +CommandFactory tenantGroupRegisterFactory("tenantgroup", + CommandHelp("tenantgroup [ARGS]", + "view tenant group information", + "`list' prints a list of tenant groups in the cluster.\n" + "`get' prints the metadata for a particular tenant group.\n"), + &tenantGroupGenerator, + &tenantGroupHintGenerator); + +} // namespace fdb_cli diff --git a/fdbcli/ThrottleCommand.actor.cpp b/fdbcli/ThrottleCommand.actor.cpp index abff0e0475..057a83c78f 100644 --- a/fdbcli/ThrottleCommand.actor.cpp +++ b/fdbcli/ThrottleCommand.actor.cpp @@ -163,11 +163,11 @@ ACTOR Future throttleCommandActor(Reference db, std::vector tssQuarantine(Reference db, bool enable, UID tssId } if (enable) { - tr->set(tssQuarantineKeyFor(tssId), LiteralStringRef("")); + tr->set(tssQuarantineKeyFor(tssId), ""_sr); // remove server from TSS mapping when quarantine is enabled tssMapDB.erase(tr, ssi.tssPairID.get()); } else { @@ -112,19 +112,19 @@ namespace fdb_cli { ACTOR Future tssqCommandActor(Reference db, std::vector tokens) { if (tokens.size() == 2) { - if (tokens[1] != LiteralStringRef("list")) { + if (tokens[1] != "list"_sr) { printUsage(tokens[0]); return false; } else { wait(tssQuarantineList(db)); } } else if (tokens.size() == 3) { - if ((tokens[1] != LiteralStringRef("start") && tokens[1] != LiteralStringRef("stop")) || - (tokens[2].size() != 32) || !std::all_of(tokens[2].begin(), tokens[2].end(), &isxdigit)) { + if ((tokens[1] != "start"_sr && tokens[1] != "stop"_sr) || (tokens[2].size() != 32) || 
+ !std::all_of(tokens[2].begin(), tokens[2].end(), &isxdigit)) { printUsage(tokens[0]); return false; } else { - bool enable = tokens[1] == LiteralStringRef("start"); + bool enable = tokens[1] == "start"_sr; UID tssId = UID::fromString(tokens[2].toString()); bool success = wait(tssQuarantine(db, enable, tssId)); return success; diff --git a/fdbcli/Util.actor.cpp b/fdbcli/Util.actor.cpp index 2d0e77d9fe..aed1133047 100644 --- a/fdbcli/Util.actor.cpp +++ b/fdbcli/Util.actor.cpp @@ -74,17 +74,15 @@ void addInterfacesFromKVs(RangeResult& kvs, return; } ClientLeaderRegInterface leaderInterf(workerInterf.address()); - StringRef ip_port = - (kv.key.endsWith(LiteralStringRef(":tls")) ? kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key) - .removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/")); + StringRef ip_port = (kv.key.endsWith(":tls"_sr) ? kv.key.removeSuffix(":tls"_sr) : kv.key) + .removePrefix("\xff\xff/worker_interfaces/"_sr); (*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf); if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { Key full_ip_port2 = StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); - StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls")) - ? full_ip_port2.removeSuffix(LiteralStringRef(":tls")) - : full_ip_port2; + StringRef ip_port2 = + full_ip_port2.endsWith(":tls"_sr) ? full_ip_port2.removeSuffix(":tls"_sr) : full_ip_port2; (*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf); } } @@ -99,8 +97,7 @@ ACTOR Future getWorkerInterfaces(Reference tr, } // Hold the reference to the standalone's memory state ThreadFuture kvsFuture = tr->getRange( - KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0")), - CLIENT_KNOBS->TOO_MANY); + KeyRangeRef("\xff\xff/worker_interfaces/"_sr, "\xff\xff/worker_interfaces0"_sr), CLIENT_KNOBS->TOO_MANY); state RangeResult kvs = wait(safeThreadFutureToFuture(kvsFuture)); ASSERT(!kvs.more); if (verify) { diff --git a/fdbcli/VersionEpochCommand.actor.cpp b/fdbcli/VersionEpochCommand.actor.cpp index 7d073e590d..a9dcd7e198 100644 --- a/fdbcli/VersionEpochCommand.actor.cpp +++ b/fdbcli/VersionEpochCommand.actor.cpp @@ -32,7 +32,7 @@ namespace fdb_cli { -const KeyRef versionEpochSpecialKey = LiteralStringRef("\xff\xff/management/version_epoch"); +const KeyRef versionEpochSpecialKey = "\xff\xff/management/version_epoch"_sr; struct VersionInfo { int64_t version; diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index b10ed32a20..a5c2e2e75a 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -44,6 +44,7 @@ #include "fdbclient/ThreadSafeTransaction.h" #include "flow/flow.h" +#include "flow/ApiVersion.h" #include "flow/ArgParseUtil.h" #include "flow/DeterministicRandom.h" #include "flow/FastRef.h" @@ -73,7 +74,6 @@ #include "flow/actorcompiler.h" // This must be the last #include. -#define FDB_API_VERSION 720 /* * While we could just use the MultiVersionApi instance directly, this #define allows us to swap in any other IClientApi * instance (e.g. 
from ThreadSafeApi) @@ -537,10 +537,10 @@ void initHelp() { CommandHelp("getversion", "Fetch the current read version", "Displays the current read version of the database or currently running transaction."); - helpMap["quota"] = - CommandHelp("quota", - "quota [get [reserved|total] [read|write]|set [reserved|total] [read|write] ]", - "Get or modify the throughput quota for the specified tag."); + helpMap["quota"] = CommandHelp("quota", + "quota [get [reserved_throughput|total_throughput] | set " + "[reserved_throughput|total_throughput] ]", + "Get or modify the throughput quota for the specified tag."); helpMap["reset"] = CommandHelp("reset", "reset the current transaction", @@ -654,7 +654,7 @@ ACTOR Future checkStatus(Future f, StatusObject _s = wait(StatusClient::statusFetcher(localDb)); s = _s; } else { - state ThreadFuture> statusValueF = tr->get(LiteralStringRef("\xff\xff/status/json")); + state ThreadFuture> statusValueF = tr->get("\xff\xff/status/json"_sr); Optional statusValue = wait(safeThreadFutureToFuture(statusValueF)); if (!statusValue.present()) { fprintf(stderr, "ERROR: Failed to get status json from the cluster\n"); @@ -698,7 +698,7 @@ ACTOR Future createSnapshot(Database db, std::vector tokens) { for (int i = 1; i < tokens.size(); i++) { snapCmd = snapCmd.withSuffix(tokens[i]); if (i != tokens.size() - 1) { - snapCmd = snapCmd.withSuffix(LiteralStringRef(" ")); + snapCmd = snapCmd.withSuffix(" "_sr); } } try { @@ -889,7 +889,7 @@ struct CLIOptions { std::vector> knobs; // api version, using the latest version by default - int apiVersion = FDB_API_VERSION; + int apiVersion = ApiVersion::LATEST_VERSION; CLIOptions(int argc, char* argv[]) { program_name = argv[0]; @@ -938,12 +938,12 @@ struct CLIOptions { if (*endptr != '\0') { fprintf(stderr, "ERROR: invalid client version %s\n", args.OptionArg()); return 1; - } else if (apiVersion < 700 || apiVersion > FDB_API_VERSION) { + } else if (apiVersion < 700 || apiVersion > ApiVersion::LATEST_VERSION) { // multi-version fdbcli only available after 7.0 fprintf(stderr, "ERROR: api version %s is not supported. (Min: 700, Max: %d)\n", args.OptionArg(), - FDB_API_VERSION); + ApiVersion::LATEST_VERSION); return 1; } break; @@ -1050,7 +1050,7 @@ Future stopNetworkAfter(Future what) { } } -ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { +ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise, Reference ccf) { state LineNoise& linenoise = *plinenoise; state bool intrans = false; @@ -1075,20 +1075,6 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { state FdbOptions* options = &globalOptions; - state Reference ccf; - - state std::pair resolvedClusterFile = - ClusterConnectionFile::lookupClusterFileName(opt.clusterFile); - try { - ccf = makeReference(resolvedClusterFile.first); - } catch (Error& e) { - if (e.code() == error_code_operation_cancelled) { - throw; - } - fprintf(stderr, "%s\n", ClusterConnectionFile::getErrorString(resolvedClusterFile, e).c_str()); - return 1; - } - // Ordinarily, this is done when the network is run. However, network thread should be set before TraceEvents are // logged. This thread will eventually run the network, so call it now. 
TraceEvent::setNetworkThread(); @@ -1342,13 +1328,10 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } if (tokencmp(tokens[0], "fileconfigure")) { - if (tokens.size() == 2 || (tokens.size() == 3 && (tokens[1] == LiteralStringRef("new") || - tokens[1] == LiteralStringRef("FORCE")))) { - bool _result = - wait(makeInterruptable(fileConfigureCommandActor(db, - tokens.back().toString(), - tokens[1] == LiteralStringRef("new"), - tokens[1] == LiteralStringRef("FORCE")))); + if (tokens.size() == 2 || + (tokens.size() == 3 && (tokens[1] == "new"_sr || tokens[1] == "FORCE"_sr))) { + bool _result = wait(makeInterruptable(fileConfigureCommandActor( + db, tokens.back().toString(), tokens[1] == "new"_sr, tokens[1] == "FORCE"_sr))); if (!_result) is_error = true; } else { @@ -1407,6 +1390,13 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { continue; } + if (tokencmp(tokens[0], "blobkey")) { + bool _result = wait(makeInterruptable(blobKeyCommandActor(localDb, tenantEntry, tokens))); + if (!_result) + is_error = true; + continue; + } + if (tokencmp(tokens[0], "unlock")) { if ((tokens.size() != 2) || (tokens[1].size() != 32) || !std::all_of(tokens[1].begin(), tokens[1].end(), &isxdigit)) { @@ -1596,6 +1586,13 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { continue; } + if (tokencmp(tokens[0], "consistencyscan")) { + bool _result = wait(makeInterruptable(consistencyScanCommandActor(localDb, tokens))); + if (!_result) + is_error = true; + continue; + } + if (tokencmp(tokens[0], "profile")) { getTransaction(db, managementTenant, tr, options, intrans); bool _result = wait(makeInterruptable(profileCommandActor(localDb, tr, tokens, intrans))); @@ -1888,60 +1885,32 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { continue; } - if (tokencmp(tokens[0], "createtenant")) { - bool _result = wait(makeInterruptable(createTenantCommandActor(db, tokens, opt.apiVersion))); - if (!_result) + if (tokencmp(tokens[0], "tenant")) { + bool _result = wait(makeInterruptable(tenantCommand(db, tokens))); + if (!_result) { is_error = true; - continue; - } - - if (tokencmp(tokens[0], "deletetenant")) { - bool _result = wait(makeInterruptable(deleteTenantCommandActor(db, tokens, opt.apiVersion))); - if (!_result) - is_error = true; - else if (tenantName.present() && tokens[1] == tenantName.get()) { + } else if (tokens.size() >= 3 && tenantName.present() && tokencmp(tokens[1], "delete") && + tokens[2] == tenantName.get()) { printAtCol("WARNING: the active tenant was deleted. 
Use the `usetenant' or `defaulttenant' " "command to choose a new tenant.\n", 80); } + continue; } - if (tokencmp(tokens[0], "listtenants")) { - bool _result = wait(makeInterruptable(listTenantsCommandActor(db, tokens, opt.apiVersion))); - if (!_result) + if (tokencmp(tokens[0], "createtenant") || tokencmp(tokens[0], "deletetenant") || + tokencmp(tokens[0], "listtenants") || tokencmp(tokens[0], "gettenant") || + tokencmp(tokens[0], "configuretenant") || tokencmp(tokens[0], "renametenant")) { + bool _result = wait(makeInterruptable(tenantCommandForwarder(db, tokens))); + if (!_result) { is_error = true; - continue; - } - - if (tokencmp(tokens[0], "gettenant")) { - bool _result = wait(makeInterruptable(getTenantCommandActor(db, tokens, opt.apiVersion))); - if (!_result) - is_error = true; - continue; - } - - if (tokencmp(tokens[0], "configuretenant")) { - if (opt.apiVersion < 720) { - fmt::print(stderr, "ERROR: tenants cannot be configured before API version 720.\n"); - is_error = true; - continue; } - - bool _result = wait(makeInterruptable(configureTenantCommandActor(db, tokens))); - if (!_result) - is_error = true; continue; } - if (tokencmp(tokens[0], "renametenant")) { - if (opt.apiVersion < 720) { - fmt::print(stderr, "ERROR: tenants cannot be renamed before API version 720.\n"); - is_error = true; - continue; - } - - bool _result = wait(makeInterruptable(renameTenantCommandActor(db, tokens, opt.apiVersion))); + if (tokencmp(tokens[0], "tenantgroup")) { + bool _result = wait(makeInterruptable(tenantGroupCommand(db, tokens))); if (!_result) is_error = true; continue; @@ -1987,7 +1956,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } } -ACTOR Future runCli(CLIOptions opt) { +ACTOR Future runCli(CLIOptions opt, Reference ccf) { state LineNoise linenoise( [](std::string const& line, std::vector& completions) { fdbcliCompCmd(line, completions); }, [enabled = opt.cliHints](std::string const& line) -> LineNoise::Hint { @@ -2051,7 +2020,7 @@ ACTOR Future runCli(CLIOptions opt) { .GetLastError(); } - state int result = wait(cli(opt, &linenoise)); + state int result = wait(cli(opt, &linenoise, ccf)); if (!historyFilename.empty()) { try { @@ -2073,6 +2042,31 @@ ACTOR Future timeExit(double duration) { return Void(); } +const char* checkTlsConfigAgainstCoordAddrs(const ClusterConnectionString& ccs) { + // Resolve TLS config and inspect whether any of the certificate, key, ca bytes has been set + extern TLSConfig tlsConfig; + auto const loaded = tlsConfig.loadSync(); + const bool tlsConfigured = + !loaded.getCertificateBytes().empty() || !loaded.getKeyBytes().empty() || !loaded.getCABytes().empty(); + int tlsAddrs = 0; + int totalAddrs = 0; + for (const auto& addr : ccs.coords) { + if (addr.isTLS()) + tlsAddrs++; + totalAddrs++; + } + for (const auto& host : ccs.hostnames) { + if (host.isTLS) + tlsAddrs++; + totalAddrs++; + } + if (!tlsConfigured && tlsAddrs == totalAddrs) { + return "fdbcli is not configured with TLS, but all of the coordinators have TLS addresses."; + } else { + return nullptr; + } +} + int main(int argc, char** argv) { platformInit(); Error::init(); @@ -2177,6 +2171,25 @@ int main(int argc, char** argv) { return 0; } + Reference ccf; + std::pair resolvedClusterFile = ClusterConnectionFile::lookupClusterFileName(opt.clusterFile); + + try { + ccf = makeReference(resolvedClusterFile.first); + } catch (Error& e) { + if (e.code() == error_code_operation_cancelled) { + throw; + } + fprintf(stderr, "%s\n", ClusterConnectionFile::getErrorString(resolvedClusterFile, 
e).c_str()); + return 1; + } + + // Make sure that TLS configuration lines up with ":tls" prefix on coordinator addresses + if (auto errorMsg = checkTlsConfigAgainstCoordAddrs(ccf->getConnectionString())) { + fprintf(stderr, "ERROR: %s\n", errorMsg); + return 1; + } + try { API->selectApiVersion(opt.apiVersion); if (opt.useFutureProtocolVersion) { @@ -2188,7 +2201,7 @@ int main(int argc, char** argv) { return opt.exit_code; } Future memoryUsageMonitor = startMemoryUsageMonitor(opt.memLimit); - Future cliFuture = runCli(opt); + Future cliFuture = runCli(opt, ccf); Future timeoutFuture = opt.exit_timeout ? timeExit(opt.exit_timeout) : Never(); auto f = stopNetworkAfter(success(cliFuture) || timeoutFuture); API->runNetwork(); diff --git a/fdbcli/include/fdbcli/fdbcli.actor.h b/fdbcli/include/fdbcli/fdbcli.actor.h index 3df51b4677..dce68eb10b 100644 --- a/fdbcli/include/fdbcli/fdbcli.actor.h +++ b/fdbcli/include/fdbcli/fdbcli.actor.h @@ -95,6 +95,7 @@ extern const KeyRef advanceVersionSpecialKey; extern const KeyRef consistencyCheckSpecialKey; // coordinators extern const KeyRef clusterDescriptionSpecialKey; +extern const KeyRef configDBSpecialKey; extern const KeyRef coordinatorsAutoSpecialKey; extern const KeyRef coordinatorsProcessSpecialKey; // datadistribution @@ -119,7 +120,7 @@ extern const KeyRef ignoreSSFailureSpecialKey; extern const KeyRangeRef processClassSourceSpecialKeyRange; extern const KeyRangeRef processClassTypeSpecialKeyRange; // Other special keys -inline const KeyRef errorMsgSpecialKey = LiteralStringRef("\xff\xff/error_message"); +inline const KeyRef errorMsgSpecialKey = "\xff\xff/error_message"_sr; inline const KeyRef workerInterfacesVerifyOptionSpecialKey = "\xff\xff/management/options/worker_interfaces/verify"_sr; // help functions (Copied from fdbcli.actor.cpp) @@ -159,20 +160,16 @@ ACTOR Future configureCommandActor(Reference db, std::vector tokens, LineNoise* linenoise, Future warn); -// configuretenant command -ACTOR Future configureTenantCommandActor(Reference db, std::vector tokens); // consistency command ACTOR Future consistencyCheckCommandActor(Reference tr, std::vector tokens, bool intrans); +// consistency scan command +ACTOR Future consistencyScanCommandActor(Database localDb, std::vector tokens); // coordinators command ACTOR Future coordinatorsCommandActor(Reference db, std::vector tokens); -// createtenant command -ACTOR Future createTenantCommandActor(Reference db, std::vector tokens, int apiVersion); // datadistribution command ACTOR Future dataDistributionCommandActor(Reference db, std::vector tokens); -// deletetenant command -ACTOR Future deleteTenantCommandActor(Reference db, std::vector tokens, int apiVersion); // exclude command ACTOR Future excludeCommandActor(Reference db, std::vector tokens, Future warn); // expensive_data_check command @@ -188,8 +185,6 @@ ACTOR Future fileConfigureCommandActor(Reference db, bool force); // force_recovery_with_data_loss command ACTOR Future forceRecoveryWithDataLossCommandActor(Reference db, std::vector tokens); -// gettenant command -ACTOR Future getTenantCommandActor(Reference db, std::vector tokens, int apiVersion); // include command ACTOR Future includeCommandActor(Reference db, std::vector tokens); // kill command @@ -197,8 +192,6 @@ ACTOR Future killCommandActor(Reference db, Reference tr, std::vector tokens, std::map>* address_interface); -// listtenants command -ACTOR Future listTenantsCommandActor(Reference db, std::vector tokens, int apiVersion); // lock/unlock command ACTOR Future 
lockCommandActor(Reference db, std::vector tokens); ACTOR Future unlockDatabaseActor(Reference db, UID uid); @@ -215,6 +208,11 @@ ACTOR Future changeFeedCommandActor(Database localDb, ACTOR Future blobRangeCommandActor(Database localDb, Optional tenantEntry, std::vector tokens); + +// blobkey command +ACTOR Future blobKeyCommandActor(Database localDb, + Optional tenantEntry, + std::vector tokens); // maintenance command ACTOR Future setHealthyZone(Reference db, StringRef zoneId, double seconds, bool printWarning = false); ACTOR Future clearHealthyZone(Reference db, @@ -226,8 +224,6 @@ ACTOR Future profileCommandActor(Database db, Reference tr, std::vector tokens, bool intrans); -// renametenant command -ACTOR Future renameTenantCommandActor(Reference db, std::vector tokens, int apiVersion); // quota command ACTOR Future quotaCommandActor(Reference db, std::vector tokens); // setclass command @@ -244,6 +240,12 @@ ACTOR Future suspendCommandActor(Reference db, Reference tr, std::vector tokens, std::map>* address_interface); +// tenant command +Future tenantCommand(Reference db, std::vector tokens); +// tenant command compatibility layer +Future tenantCommandForwarder(Reference db, std::vector tokens); +// tenantgroup command +Future tenantGroupCommand(Reference db, std::vector tokens); // throttle command ACTOR Future throttleCommandActor(Reference db, std::vector tokens); // triggerteaminfolog command diff --git a/bindings/python/tests/fdbcli_tests.py b/fdbcli/tests/fdbcli_tests.py similarity index 79% rename from bindings/python/tests/fdbcli_tests.py rename to fdbcli/tests/fdbcli_tests.py index 552bba8f49..530c80f865 100755 --- a/bindings/python/tests/fdbcli_tests.py +++ b/fdbcli/tests/fdbcli_tests.py @@ -7,6 +7,7 @@ import subprocess import logging import functools import json +import tempfile import time import random from argparse import ArgumentParser, RawDescriptionHelpFormatter @@ -592,38 +593,105 @@ def triggerddteaminfolog(logger): output = run_fdbcli_command('triggerddteaminfolog') assert output == 'Triggered team info logging in data distribution.' 
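# Sketch of the fdbcli helpers that the tenant tests below rely on. They are
# assumed to be defined earlier in this file (they are already called above,
# e.g. by triggerddteaminfolog); the bodies shown here are illustrative only,
# not the verbatim implementations, and reuse the command_template and
# fdbcli_env names that appear elsewhere in this test script.
#
# def run_fdbcli_command(*args):
#     # Run `fdbcli --exec '<args>'` and return the trimmed standard output.
#     commands = command_template + ["{}".format(' '.join(args))]
#     return subprocess.run(commands, stdout=subprocess.PIPE, env=fdbcli_env).stdout.decode('utf-8').strip()
#
# def run_fdbcli_command_and_get_error(*args):
#     # Same as above, but capture and return the trimmed standard error instead.
#     commands = command_template + ["{}".format(' '.join(args))]
#     return subprocess.run(commands, stderr=subprocess.PIPE, env=fdbcli_env).stderr.decode('utf-8').strip()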
+def setup_tenants(tenants): + command = '; '.join(['tenant create %s' % t for t in tenants]) + run_fdbcli_command(command) + +def clear_database_and_tenants(): + run_fdbcli_command('writemode on; option on SPECIAL_KEY_SPACE_ENABLE_WRITES; clearrange "" \\xff; clearrange \\xff\\xff/management/tenant/map/ \\xff\\xff/management/tenant/map0') + +def run_tenant_test(test_func): + test_func() + clear_database_and_tenants() @enable_logging() -def tenants(logger): - output = run_fdbcli_command('listtenants') - assert output == 'The cluster has no tenants' +def tenant_create(logger): + output1 = run_fdbcli_command('tenant create tenant') + assert output1 == 'The tenant `tenant\' has been created' - output = run_fdbcli_command('createtenant tenant') - assert output == 'The tenant `tenant\' has been created' - - output = run_fdbcli_command('createtenant tenant2 tenant_group=tenant_group2') + output = run_fdbcli_command('tenant create tenant2 tenant_group=tenant_group2') assert output == 'The tenant `tenant2\' has been created' - output = run_fdbcli_command('listtenants') + output = run_fdbcli_command_and_get_error('tenant create tenant') + assert output == 'ERROR: A tenant with the given name already exists (2132)' + +@enable_logging() +def tenant_delete(logger): + setup_tenants(['tenant', 'tenant2']) + run_fdbcli_command('writemode on; usetenant tenant2; set tenant_test value') + + # delete a tenant while the fdbcli is using that tenant + process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=fdbcli_env) + cmd_sequence = ['writemode on', 'usetenant tenant', 'tenant delete tenant', 'get tenant_test', 'defaulttenant', 'usetenant tenant'] + output, error_output = process.communicate(input='\n'.join(cmd_sequence).encode()) + + lines = output.decode().strip().split('\n')[-6:] + error_lines = error_output.decode().strip().split('\n')[-2:] + assert lines[0] == 'Using tenant `tenant\'' + assert lines[1] == 'The tenant `tenant\' has been deleted' + assert lines[2] == 'WARNING: the active tenant was deleted. Use the `usetenant\' or `defaulttenant\'' + assert lines[3] == 'command to choose a new tenant.' 
+ assert error_lines[0] == 'ERROR: Tenant does not exist (2131)' + assert lines[5] == 'Using the default tenant' + assert error_lines[1] == 'ERROR: Tenant `tenant\' does not exist' + + # delete a non-empty tenant + process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=fdbcli_env) + cmd_sequence = ['writemode on', 'tenant delete tenant2', 'usetenant tenant2', 'clear tenant_test', 'defaulttenant', 'tenant delete tenant2'] + output, error_output = process.communicate(input='\n'.join(cmd_sequence).encode()) + + lines = output.decode().strip().split('\n')[-4:] + error_lines = error_output.decode().strip().split('\n')[-1:] + assert error_lines[0] == 'ERROR: Cannot delete a non-empty tenant (2133)' + assert lines[0] == 'Using tenant `tenant2\'' + assert lines[1].startswith('Committed') + assert lines[2] == 'Using the default tenant' + assert lines[3] == 'The tenant `tenant2\' has been deleted' + + # delete a non-existing tenant + output = run_fdbcli_command_and_get_error('tenant delete tenant') + assert output == 'ERROR: Tenant does not exist (2131)' + +@enable_logging() +def tenant_list(logger): + output = run_fdbcli_command('tenant list') + assert output == 'The cluster has no tenants' + + setup_tenants(['tenant', 'tenant2']) + + output = run_fdbcli_command('tenant list') assert output == '1. tenant\n 2. tenant2' - output = run_fdbcli_command('listtenants a z 1') + output = run_fdbcli_command('tenant list a z 1') assert output == '1. tenant' - output = run_fdbcli_command('listtenants a tenant2') + output = run_fdbcli_command('tenant list a tenant2') assert output == '1. tenant' - output = run_fdbcli_command('listtenants tenant2 z') + output = run_fdbcli_command('tenant list tenant2 z') assert output == '1. 
tenant2' - output = run_fdbcli_command('gettenant tenant') + output = run_fdbcli_command('tenant list a b') + assert output == 'The cluster has no tenants in the specified range' + + output = run_fdbcli_command_and_get_error('tenant list b a') + assert output == 'ERROR: end must be larger than begin' + + output = run_fdbcli_command_and_get_error('tenant list a b 12x') + assert output == 'ERROR: invalid limit `12x\'' + +@enable_logging() +def tenant_get(logger): + setup_tenants(['tenant', 'tenant2 tenant_group=tenant_group2']) + + output = run_fdbcli_command('tenant get tenant') lines = output.split('\n') assert len(lines) == 3 assert lines[0].strip().startswith('id: ') assert lines[1].strip().startswith('prefix: ') assert lines[2].strip() == 'tenant state: ready' - output = run_fdbcli_command('gettenant tenant JSON') + output = run_fdbcli_command('tenant get tenant JSON') json_output = json.loads(output, strict=False) assert(len(json_output) == 2) assert('tenant' in json_output) @@ -637,7 +705,7 @@ def tenants(logger): assert('printable' in json_output['tenant']['prefix']) assert(json_output['tenant']['tenant_state'] == 'ready') - output = run_fdbcli_command('gettenant tenant2') + output = run_fdbcli_command('tenant get tenant2') lines = output.split('\n') assert len(lines) == 4 assert lines[0].strip().startswith('id: ') @@ -645,7 +713,7 @@ def tenants(logger): assert lines[2].strip() == 'tenant state: ready' assert lines[3].strip() == 'tenant group: tenant_group2' - output = run_fdbcli_command('gettenant tenant2 JSON') + output = run_fdbcli_command('tenant get tenant2 JSON') json_output = json.loads(output, strict=False) assert(len(json_output) == 2) assert('tenant' in json_output) @@ -660,36 +728,57 @@ def tenants(logger): assert('base64' in json_output['tenant']['tenant_group']) assert(json_output['tenant']['tenant_group']['printable'] == 'tenant_group2') - output = run_fdbcli_command('configuretenant tenant tenant_group=tenant_group1') +@enable_logging() +def tenant_configure(logger): + setup_tenants(['tenant']) + + output = run_fdbcli_command('tenant configure tenant tenant_group=tenant_group1') assert output == 'The configuration for tenant `tenant\' has been updated' - output = run_fdbcli_command('gettenant tenant') + output = run_fdbcli_command('tenant get tenant') lines = output.split('\n') assert len(lines) == 4 assert lines[3].strip() == 'tenant group: tenant_group1' - output = run_fdbcli_command('configuretenant tenant unset tenant_group') + output = run_fdbcli_command('tenant configure tenant unset tenant_group') assert output == 'The configuration for tenant `tenant\' has been updated' - output = run_fdbcli_command('gettenant tenant') + output = run_fdbcli_command('tenant get tenant') lines = output.split('\n') assert len(lines) == 3 - output = run_fdbcli_command_and_get_error('configuretenant tenant tenant_group=tenant_group1 tenant_group=tenant_group2') + output = run_fdbcli_command_and_get_error('tenant configure tenant tenant_group=tenant_group1 tenant_group=tenant_group2') assert output == 'ERROR: configuration parameter `tenant_group\' specified more than once.' - output = run_fdbcli_command_and_get_error('configuretenant tenant unset') + output = run_fdbcli_command_and_get_error('tenant configure tenant unset') assert output == 'ERROR: `unset\' specified without a configuration parameter.' 
- output = run_fdbcli_command_and_get_error('configuretenant tenant unset tenant_group=tenant_group1') + output = run_fdbcli_command_and_get_error('tenant configure tenant unset tenant_group=tenant_group1') assert output == 'ERROR: unrecognized configuration parameter `tenant_group=tenant_group1\'.' - output = run_fdbcli_command_and_get_error('configuretenant tenant tenant_group') + output = run_fdbcli_command_and_get_error('tenant configure tenant tenant_group') assert output == 'ERROR: invalid configuration string `tenant_group\'. String must specify a value using `=\'.' - output = run_fdbcli_command_and_get_error('configuretenant tenant3 tenant_group=tenant_group1') + output = run_fdbcli_command_and_get_error('tenant configure tenant3 tenant_group=tenant_group1') assert output == 'ERROR: Tenant does not exist (2131)' +@enable_logging() +def tenant_rename(logger): + setup_tenants(['tenant', 'tenant2']) + + output = run_fdbcli_command('tenant rename tenant tenant3') + assert output == 'The tenant `tenant\' has been renamed to `tenant3\'' + + output = run_fdbcli_command_and_get_error('tenant rename tenant tenant4') + assert output == 'ERROR: Tenant does not exist (2131)' + + output = run_fdbcli_command_and_get_error('tenant rename tenant2 tenant3') + assert output == 'ERROR: A tenant with the given name already exists (2132)' + +@enable_logging() +def tenant_usetenant(logger): + setup_tenants(['tenant', 'tenant2']) + output = run_fdbcli_command('usetenant') assert output == 'Using the default tenant' @@ -721,44 +810,103 @@ def tenants(logger): assert lines[3] == '`tenant_test\' is `tenant2\'' process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, env=fdbcli_env) - cmd_sequence = ['usetenant tenant', 'get tenant_test', 'defaulttenant', 'get tenant_test'] + cmd_sequence = ['usetenant tenant', 'get tenant_test', 'usetenant tenant2', 'get tenant_test', 'defaulttenant', 'get tenant_test'] output, _ = process.communicate(input='\n'.join(cmd_sequence).encode()) - lines = output.decode().strip().split('\n')[-4:] + lines = output.decode().strip().split('\n')[-6:] assert lines[0] == 'Using tenant `tenant\'' assert lines[1] == '`tenant_test\' is `tenant\'' - assert lines[2] == 'Using the default tenant' - assert lines[3] == '`tenant_test\' is `default_tenant\'' + assert lines[2] == 'Using tenant `tenant2\'' + assert lines[3] == '`tenant_test\' is `tenant2\'' + assert lines[4] == 'Using the default tenant' + assert lines[5] == '`tenant_test\' is `default_tenant\'' - process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=fdbcli_env) - cmd_sequence = ['writemode on', 'usetenant tenant', 'clear tenant_test', - 'deletetenant tenant', 'get tenant_test', 'defaulttenant', 'usetenant tenant'] - output, error_output = process.communicate(input='\n'.join(cmd_sequence).encode()) +@enable_logging() +def tenant_old_commands(logger): + create_output = run_fdbcli_command('tenant create tenant') + list_output = run_fdbcli_command('tenant list') + get_output = run_fdbcli_command('tenant get tenant') + # Run the gettenant command here because the ID will be different in the second block + get_output_old = run_fdbcli_command('gettenant tenant') + configure_output = run_fdbcli_command('tenant configure tenant tenant_group=tenant_group1') + rename_output = run_fdbcli_command('tenant rename tenant tenant2') + delete_output = run_fdbcli_command('tenant delete tenant2') - lines = 
output.decode().strip().split('\n')[-7:] - error_lines = error_output.decode().strip().split('\n')[-2:] - assert lines[0] == 'Using tenant `tenant\'' - assert lines[1].startswith('Committed') - assert lines[2] == 'The tenant `tenant\' has been deleted' - assert lines[3] == 'WARNING: the active tenant was deleted. Use the `usetenant\' or `defaulttenant\'' - assert lines[4] == 'command to choose a new tenant.' - assert error_lines[0] == 'ERROR: Tenant does not exist (2131)' - assert lines[6] == 'Using the default tenant' - assert error_lines[1] == 'ERROR: Tenant `tenant\' does not exist' + create_output_old = run_fdbcli_command('createtenant tenant') + list_output_old = run_fdbcli_command('listtenants') + configure_output_old = run_fdbcli_command('configuretenant tenant tenant_group=tenant_group1') + rename_output_old = run_fdbcli_command('renametenant tenant tenant2') + delete_output_old = run_fdbcli_command('deletetenant tenant2') - process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=fdbcli_env) - cmd_sequence = ['writemode on', 'deletetenant tenant2', 'usetenant tenant2', 'clear tenant_test', 'defaulttenant', 'deletetenant tenant2'] - output, error_output = process.communicate(input='\n'.join(cmd_sequence).encode()) + assert create_output == create_output_old + assert list_output == list_output_old + assert get_output == get_output_old + assert configure_output == configure_output_old + assert rename_output == rename_output_old + assert delete_output == delete_output_old - lines = output.decode().strip().split('\n')[-4:] - error_lines = error_output.decode().strip().split('\n')[-1:] - assert error_lines[0] == 'ERROR: Cannot delete a non-empty tenant (2133)' - assert lines[0] == 'Using tenant `tenant2\'' - assert lines[1].startswith('Committed') - assert lines[2] == 'Using the default tenant' - assert lines[3] == 'The tenant `tenant2\' has been deleted' +@enable_logging() +def tenant_group_list(logger): + output = run_fdbcli_command('tenantgroup list') + assert output == 'The cluster has no tenant groups' - run_fdbcli_command('writemode on; clear tenant_test') + setup_tenants(['tenant', 'tenant2 tenant_group=tenant_group2', 'tenant3 tenant_group=tenant_group3']) + + output = run_fdbcli_command('tenantgroup list') + assert output == '1. tenant_group2\n 2. tenant_group3' + + output = run_fdbcli_command('tenantgroup list a z 1') + assert output == '1. tenant_group2' + + output = run_fdbcli_command('tenantgroup list a tenant_group3') + assert output == '1. tenant_group2' + + output = run_fdbcli_command('tenantgroup list tenant_group3 z') + assert output == '1. 
tenant_group3' + + output = run_fdbcli_command('tenantgroup list a b') + assert output == 'The cluster has no tenant groups in the specified range' + + output = run_fdbcli_command_and_get_error('tenantgroup list b a') + assert output == 'ERROR: end must be larger than begin' + + output = run_fdbcli_command_and_get_error('tenantgroup list a b 12x') + assert output == 'ERROR: invalid limit `12x\'' + +@enable_logging() +def tenant_group_get(logger): + setup_tenants(['tenant tenant_group=tenant_group']) + + output = run_fdbcli_command('tenantgroup get tenant_group') + assert output == 'The tenant group is present in the cluster' + + output = run_fdbcli_command('tenantgroup get tenant_group JSON') + json_output = json.loads(output, strict=False) + assert(len(json_output) == 2) + assert('tenant_group' in json_output) + assert(json_output['type'] == 'success') + assert(len(json_output['tenant_group']) == 0) + + output = run_fdbcli_command_and_get_error('tenantgroup get tenant_group2') + assert output == 'ERROR: tenant group not found' + + output = run_fdbcli_command('tenantgroup get tenant_group2 JSON') + json_output = json.loads(output, strict=False) + assert(len(json_output) == 2) + assert(json_output['type'] == 'error') + assert(json_output['error'] == 'tenant group not found') + +def tenants(): + run_tenant_test(tenant_create) + run_tenant_test(tenant_delete) + run_tenant_test(tenant_list) + run_tenant_test(tenant_get) + run_tenant_test(tenant_configure) + run_tenant_test(tenant_rename) + run_tenant_test(tenant_usetenant) + run_tenant_test(tenant_old_commands) + run_tenant_test(tenant_group_list) + run_tenant_test(tenant_group_get) def integer_options(): process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=fdbcli_env) @@ -770,6 +918,24 @@ def integer_options(): assert lines[1].startswith('Committed') assert error_output == b'' +def tls_address_suffix(): + # fdbcli shall prevent a non-TLS fdbcli run from connecting to an all-TLS cluster + preamble = 'eNW1yf1M:eNW1yf1M@' + num_server_addrs = [1, 2, 5] + err_output_server_tls = "ERROR: fdbcli is not configured with TLS, but all of the coordinators have TLS addresses." + + with tempfile.TemporaryDirectory() as tmpdir: + cluster_fn = tmpdir + "/fdb.cluster" + for num_server_addr in num_server_addrs: + with open(cluster_fn, "w") as fp: + fp.write(preamble + ",".join( + ["127.0.0.1:{}:tls".format(4000 + addr_idx) for addr_idx in range(num_server_addr)])) + fp.close() + fdbcli_process = subprocess.run(command_template[:2] + [cluster_fn], capture_output=True) + assert fdbcli_process.returncode != 0 + err_out = fdbcli_process.stderr.decode("utf8").strip() + assert err_out == err_output_server_tls, f"unexpected output: {err_out}" + if __name__ == '__main__': parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter, description=""" @@ -816,6 +982,7 @@ if __name__ == '__main__': tenants() versionepoch() integer_options() + tls_address_suffix() else: assert args.process_number > 1, "Process number should be positive" coordinators() diff --git a/fdbclient/Atomic.cpp b/fdbclient/Atomic.cpp new file mode 100644 index 0000000000..f2614e3881 --- /dev/null +++ b/fdbclient/Atomic.cpp @@ -0,0 +1,47 @@ +/* + * Atomic.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/Atomic.h" +#include "flow/Arena.h" +#include "flow/UnitTest.h" + +void forceLinkAtomicTests() {} + +TEST_CASE("/Atomic/DoAppendIfFits") { + Arena arena; + { + Value existingValue = ValueRef(arena, "existing"_sr); + Value otherOperand = ValueRef(arena, "other"_sr); + auto result = doAppendIfFits(existingValue, otherOperand, arena); + ASSERT(compare("existingother"_sr, result) == 0); + } + { + Value existingValue = makeString(CLIENT_KNOBS->VALUE_SIZE_LIMIT - 1, arena); + Value otherOperand = makeString(2, arena); + deterministicRandom()->randomBytes(mutateString(existingValue), existingValue.size()); + deterministicRandom()->randomBytes(mutateString(otherOperand), otherOperand.size()); + // Appended values cannot fit in result, should return existingValue + auto result = doAppendIfFits(existingValue, otherOperand, arena); + ASSERT(compare(existingValue, result) == 0); + } + return Void(); +} + +// TODO: Add more unit tests for atomic operations defined in Atomic.h diff --git a/fdbclient/BackupAgentBase.actor.cpp b/fdbclient/BackupAgentBase.actor.cpp index cb999a6e12..ab136878c4 100644 --- a/fdbclient/BackupAgentBase.actor.cpp +++ b/fdbclient/BackupAgentBase.actor.cpp @@ -22,6 +22,10 @@ #include #include "fdbclient/BackupAgent.actor.h" +#include "fdbclient/BlobCipher.h" +#include "fdbclient/GetEncryptCipherKeys.actor.h" +#include "fdbclient/DatabaseContext.h" +#include "fdbclient/Metacluster.h" #include "fdbrpc/simulator.h" #include "flow/ActorCollection.h" #include "flow/actorcompiler.h" // has to be last include @@ -253,16 +257,18 @@ std::pair decodeBKMutationLogKey(Key key) { bigEndian32(*(int32_t*)(key.begin() + backupLogPrefixBytes + sizeof(UID) + sizeof(uint8_t) + sizeof(int64_t)))); } -void decodeBackupLogValue(Arena& arena, - VectorRef& result, - int& mutationSize, - StringRef value, - StringRef addPrefix, - StringRef removePrefix, - Version version, - Reference> key_version) { +ACTOR static Future decodeBackupLogValue(Arena* arena, + VectorRef* result, + VectorRef>* encryptedResult, + int* mutationSize, + Standalone value, + Key addPrefix, + Key removePrefix, + Version version, + Reference> key_version, + Database cx) { try { - uint64_t offset(0); + state uint64_t offset(0); uint64_t protocolVersion = 0; memcpy(&protocolVersion, value.begin(), sizeof(uint64_t)); offset += sizeof(uint64_t); @@ -274,36 +280,48 @@ void decodeBackupLogValue(Arena& arena, throw incompatible_protocol_version(); } - uint32_t totalBytes = 0; + state uint32_t totalBytes = 0; memcpy(&totalBytes, value.begin() + offset, sizeof(uint32_t)); offset += sizeof(uint32_t); - uint32_t consumed = 0; + state uint32_t consumed = 0; if (totalBytes + offset > value.size()) throw restore_missing_data(); - int originalOffset = offset; + state int originalOffset = offset; while (consumed < totalBytes) { uint32_t type = 0; memcpy(&type, value.begin() + offset, sizeof(uint32_t)); offset += sizeof(uint32_t); - 
uint32_t len1 = 0; + state uint32_t len1 = 0; memcpy(&len1, value.begin() + offset, sizeof(uint32_t)); offset += sizeof(uint32_t); - uint32_t len2 = 0; + state uint32_t len2 = 0; memcpy(&len2, value.begin() + offset, sizeof(uint32_t)); offset += sizeof(uint32_t); ASSERT(offset + len1 + len2 <= value.size() && isValidMutationType(type)); - MutationRef logValue; - Arena tempArena; + state MutationRef logValue; + state Arena tempArena; logValue.type = type; logValue.param1 = value.substr(offset, len1); offset += len1; logValue.param2 = value.substr(offset, len2); offset += len2; + state Optional encryptedLogValue = Optional(); + + // Decrypt mutation ref if encrypted + if (logValue.isEncrypted()) { + encryptedLogValue = logValue; + Reference const> dbInfo = cx->clientInfo; + TextAndHeaderCipherKeys cipherKeys = + wait(getEncryptCipherKeys(dbInfo, *logValue.encryptionHeader(), BlobCipherMetrics::BACKUP)); + logValue = logValue.decrypt(cipherKeys, tempArena, BlobCipherMetrics::BACKUP); + } + ASSERT(!logValue.isEncrypted()); + MutationRef originalLogValue = logValue; if (logValue.type == MutationRef::ClearRange) { KeyRangeRef range(logValue.param1, logValue.param2); @@ -311,7 +329,7 @@ void decodeBackupLogValue(Arena& arena, for (auto r : ranges) { if (version > r.value() && r.value() != invalidVersion) { KeyRef minKey = std::min(r.range().end, range.end); - if (minKey == (removePrefix == StringRef() ? normalKeys.end : strinc(removePrefix))) { + if (minKey == (removePrefix == StringRef() ? allKeys.end : strinc(removePrefix))) { logValue.param1 = std::max(r.range().begin, range.begin); if (removePrefix.size()) { logValue.param1 = logValue.param1.removePrefix(removePrefix); @@ -319,9 +337,9 @@ void decodeBackupLogValue(Arena& arena, if (addPrefix.size()) { logValue.param1 = logValue.param1.withPrefix(addPrefix, tempArena); } - logValue.param2 = addPrefix == StringRef() ? normalKeys.end : strinc(addPrefix, tempArena); - result.push_back_deep(arena, logValue); - mutationSize += logValue.expectedSize(); + logValue.param2 = addPrefix == StringRef() ? 
allKeys.end : strinc(addPrefix, tempArena); + result->push_back_deep(*arena, logValue); + *mutationSize += logValue.expectedSize(); } else { logValue.param1 = std::max(r.range().begin, range.begin); logValue.param2 = minKey; @@ -333,8 +351,13 @@ void decodeBackupLogValue(Arena& arena, logValue.param1 = logValue.param1.withPrefix(addPrefix, tempArena); logValue.param2 = logValue.param2.withPrefix(addPrefix, tempArena); } - result.push_back_deep(arena, logValue); - mutationSize += logValue.expectedSize(); + result->push_back_deep(*arena, logValue); + *mutationSize += logValue.expectedSize(); + } + if (originalLogValue.param1 == logValue.param1 && originalLogValue.param2 == logValue.param2) { + encryptedResult->push_back_deep(*arena, encryptedLogValue); + } else { + encryptedResult->push_back_deep(*arena, Optional()); } } } @@ -348,8 +371,15 @@ void decodeBackupLogValue(Arena& arena, if (addPrefix.size()) { logValue.param1 = logValue.param1.withPrefix(addPrefix, tempArena); } - result.push_back_deep(arena, logValue); - mutationSize += logValue.expectedSize(); + result->push_back_deep(*arena, logValue); + *mutationSize += logValue.expectedSize(); + // If we did not remove/add prefixes to the mutation then keep the original encrypted mutation so we + // do not have to re-encrypt unnecessarily + if (originalLogValue.param1 == logValue.param1 && originalLogValue.param2 == logValue.param2) { + encryptedResult->push_back_deep(*arena, encryptedLogValue); + } else { + encryptedResult->push_back_deep(*arena, Optional()); + } } } @@ -374,6 +404,7 @@ void decodeBackupLogValue(Arena& arena, .detail("Value", value); throw; } + return Void(); } static double lastErrorTime = 0; @@ -414,7 +445,7 @@ ACTOR Future readCommitted(Database cx, loop { try { state GetRangeLimits limits(GetRangeLimits::ROW_LIMIT_UNLIMITED, - (g_network->isSimulated() && !g_simulator.speedUpSimulation) + (g_network->isSimulated() && !g_simulator->speedUpSimulation) ? CLIENT_KNOBS->BACKUP_SIMULATED_LIMIT_BYTES : CLIENT_KNOBS->BACKUP_GET_RANGE_LIMIT_BYTES); @@ -493,7 +524,7 @@ ACTOR Future readCommitted(Database cx, loop { try { state GetRangeLimits limits(GetRangeLimits::ROW_LIMIT_UNLIMITED, - (g_network->isSimulated() && !g_simulator.speedUpSimulation) + (g_network->isSimulated() && !g_simulator->speedUpSimulation) ? 
CLIENT_KNOBS->BACKUP_SIMULATED_LIMIT_BYTES : CLIENT_KNOBS->BACKUP_GET_RANGE_LIMIT_BYTES); @@ -614,21 +645,24 @@ ACTOR Future dumpData(Database cx, state int mutationSize = 0; loop { try { - RCGroup group = waitNext(results.getFuture()); + state RCGroup group = waitNext(results.getFuture()); lock->release(group.items.expectedSize()); BinaryWriter bw(Unversioned()); for (int i = 0; i < group.items.size(); ++i) { bw.serializeBytes(group.items[i].value); } - decodeBackupLogValue(req.arena, - req.transaction.mutations, - mutationSize, - bw.toValue(), - addPrefix, - removePrefix, - group.groupKey, - keyVersion); + Standalone value = bw.toValue(); + wait(decodeBackupLogValue(&req.arena, + &req.transaction.mutations, + &req.transaction.encryptedMutations, + &mutationSize, + value, + addPrefix, + removePrefix, + group.groupKey, + keyVersion, + cx)); newBeginVersion = group.groupKey + 1; if (mutationSize >= CLIENT_KNOBS->BACKUP_LOG_WRITE_BATCH_MAX_SIZE) { break; @@ -652,8 +686,10 @@ ACTOR Future dumpData(Database cx, Key rangeEnd = getApplyKey(newBeginVersion, uid); req.transaction.mutations.push_back_deep(req.arena, MutationRef(MutationRef::SetValue, applyBegin, versionKey)); + req.transaction.encryptedMutations.push_back_deep(req.arena, Optional()); req.transaction.write_conflict_ranges.push_back_deep(req.arena, singleKeyRange(applyBegin)); req.transaction.mutations.push_back_deep(req.arena, MutationRef(MutationRef::ClearRange, rangeBegin, rangeEnd)); + req.transaction.encryptedMutations.push_back_deep(req.arena, Optional()); req.transaction.write_conflict_ranges.push_back_deep(req.arena, singleKeyRange(rangeBegin)); // The commit request contains no read conflict ranges, so regardless of what read version we @@ -968,10 +1004,9 @@ ACTOR Future cleanupLogMutations(Database cx, Value destUidValue, bool del .get(BackupAgentBase::keySourceStates) .get(currLogUid) .pack(DatabaseBackupAgent::keyStateStatus)); - state Future> foundBackupKey = - tr->get(Subspace(currLogUid.withPrefix(LiteralStringRef("uid->config/")) - .withPrefix(fileBackupPrefixRange.begin)) - .pack(LiteralStringRef("stateEnum"))); + state Future> foundBackupKey = tr->get( + Subspace(currLogUid.withPrefix("uid->config/"_sr).withPrefix(fileBackupPrefixRange.begin)) + .pack("stateEnum"_sr)); wait(success(foundDRKey) && success(foundBackupKey)); if (foundDRKey.get().present() && foundBackupKey.get().present()) { @@ -1165,3 +1200,38 @@ Standalone BackupAgentBase::getCurrentTime() { } std::string const BackupAgentBase::defaultTagName = "default"; + +void addDefaultBackupRanges(Standalone>& backupKeys) { + backupKeys.push_back_deep(backupKeys.arena(), normalKeys); + + for (auto& r : getSystemBackupRanges()) { + backupKeys.push_back_deep(backupKeys.arena(), r); + } +} + +VectorRef const& getSystemBackupRanges() { + static Standalone> systemBackupRanges; + if (systemBackupRanges.empty()) { + systemBackupRanges.push_back_deep(systemBackupRanges.arena(), prefixRange(TenantMetadata::subspace())); + systemBackupRanges.push_back_deep(systemBackupRanges.arena(), + singleKeyRange(MetaclusterMetadata::metaclusterRegistration().key)); + } + + return systemBackupRanges; +} + +KeyRangeMap const& systemBackupMutationMask() { + static KeyRangeMap mask; + if (mask.size() == 1) { + for (auto r : getSystemBackupRanges()) { + mask.insert(r, true); + } + } + + return mask; +} + +KeyRangeRef const& getDefaultBackupSharedRange() { + static KeyRangeRef defaultSharedRange(""_sr, ""_sr); + return defaultSharedRange; +} diff --git 
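getSystemBackupRanges() and systemBackupMutationMask(), added at the end of the BackupAgentBase hunk, both populate a function-local static on first use and then hand out the same shared instance to every caller. A minimal sketch of that lazy fill-on-first-call pattern, using a placeholder range type and purely illustrative keys rather than the real tenant/metacluster system keys:

```cpp
#include <string>
#include <utility>
#include <vector>

using Range = std::pair<std::string, std::string>; // placeholder for KeyRangeRef

// Lazy fill-on-first-use, shared-by-const-reference pattern used by getSystemBackupRanges().
const std::vector<Range>& systemBackupRangesSketch() {
    static std::vector<Range> ranges;
    if (ranges.empty()) {
        // Purely illustrative placeholder keys, not the real system key ranges.
        ranges.push_back({ "tenantMetadataSubspaceBegin", "tenantMetadataSubspaceEnd" });
        ranges.push_back({ "metaclusterRegistrationKey", "metaclusterRegistrationKeyEnd" });
    }
    return ranges;
}
```

As in the diff, this sketch assumes the list is built and then only read, which matches FDB's effectively single-threaded network loop; a genuinely multi-threaded caller would need synchronization around the first fill.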
a/fdbclient/BackupContainerFileSystem.actor.cpp b/fdbclient/BackupContainerFileSystem.actor.cpp index b222153517..5ad037d993 100644 --- a/fdbclient/BackupContainerFileSystem.actor.cpp +++ b/fdbclient/BackupContainerFileSystem.actor.cpp @@ -906,6 +906,7 @@ public: ACTOR static Future> getRestoreSet(Reference bc, Version targetVersion, VectorRef keyRangesFilter, + Optional cx, bool logsOnly = false, Version beginVersion = invalidVersion) { for (const auto& range : keyRangesFilter) { @@ -982,7 +983,7 @@ public: restorable.ranges.end(), [file = rit->first](const RangeFile f) { return f.fileName == file; }); ASSERT(it != restorable.ranges.end()); - KeyRange result = wait(bc->getSnapshotFileKeyRange(*it)); + KeyRange result = wait(bc->getSnapshotFileKeyRange(*it, cx)); ASSERT(rit->second.begin <= result.begin && rit->second.end >= result.end); } } @@ -1349,7 +1350,9 @@ Future BackupContainerFileSystem::expireData(Version expireEndVersion, Reference::addRef(this), expireEndVersion, force, progress, restorableBeginVersion); } -ACTOR static Future getSnapshotFileKeyRange_impl(Reference bc, RangeFile file) { +ACTOR static Future getSnapshotFileKeyRange_impl(Reference bc, + RangeFile file, + Optional cx) { state int readFileRetries = 0; state bool beginKeySet = false; state Key beginKey; @@ -1361,7 +1364,8 @@ ACTOR static Future getSnapshotFileKeyRange_impl(Reference(file.blockSize, file.fileSize - j); - Standalone> blockData = wait(fileBackup::decodeRangeFileBlock(inFile, j, len)); + Standalone> blockData = + wait(fileBackup::decodeRangeFileBlock(inFile, j, len, cx)); if (!beginKeySet) { beginKey = blockData.front().key; beginKeySet = true; @@ -1434,17 +1438,18 @@ ACTOR static Future> readVersionProperty(Reference BackupContainerFileSystem::getSnapshotFileKeyRange(const RangeFile& file) { +Future BackupContainerFileSystem::getSnapshotFileKeyRange(const RangeFile& file, Optional cx) { ASSERT(g_network->isSimulated()); - return getSnapshotFileKeyRange_impl(Reference::addRef(this), file); + return getSnapshotFileKeyRange_impl(Reference::addRef(this), file, cx); } Future> BackupContainerFileSystem::getRestoreSet(Version targetVersion, + Optional cx, VectorRef keyRangesFilter, bool logsOnly, Version beginVersion) { return BackupContainerFileSystemImpl::getRestoreSet( - Reference::addRef(this), targetVersion, keyRangesFilter, logsOnly, beginVersion); + Reference::addRef(this), targetVersion, keyRangesFilter, cx, logsOnly, beginVersion); } Future> BackupContainerFileSystem::VersionProperty::get() { @@ -1666,7 +1671,8 @@ ACTOR static Future testWriteSnapshotFile(Reference file, Key ACTOR Future testBackupContainer(std::string url, Optional proxy, - Optional encryptionKeyFileName) { + Optional encryptionKeyFileName, + Optional cx) { state FlowLock lock(100e6); if (encryptionKeyFileName.present()) { @@ -1697,7 +1703,7 @@ ACTOR Future testBackupContainer(std::string url, // List of sizes to use to test edge cases on underlying file implementations state std::vector fileSizes = { 0 }; - if (StringRef(url).startsWith(LiteralStringRef("blob"))) { + if (StringRef(url).startsWith("blob"_sr)) { fileSizes.push_back(CLIENT_KNOBS->BLOBSTORE_MULTIPART_MIN_PART_SIZE); fileSizes.push_back(CLIENT_KNOBS->BLOBSTORE_MULTIPART_MIN_PART_SIZE + 10); } @@ -1705,8 +1711,8 @@ ACTOR Future testBackupContainer(std::string url, loop { state Version logStart = v; state int kvfiles = deterministicRandom()->randomInt(0, 3); - state Key begin = LiteralStringRef(""); - state Key end = LiteralStringRef(""); + state Key begin = ""_sr; + 
state Key end = ""_sr; state int blockSize = 3 * sizeof(uint32_t) + begin.size() + end.size() + 8; while (kvfiles > 0) { @@ -1773,13 +1779,13 @@ ACTOR Future testBackupContainer(std::string url, for (; i < listing.snapshots.size(); ++i) { { // Ensure we can still restore to the latest version - Optional rest = wait(c->getRestoreSet(desc.maxRestorableVersion.get())); + Optional rest = wait(c->getRestoreSet(desc.maxRestorableVersion.get(), cx)); ASSERT(rest.present()); } { // Ensure we can restore to the end version of snapshot i - Optional rest = wait(c->getRestoreSet(listing.snapshots[i].endVersion)); + Optional rest = wait(c->getRestoreSet(listing.snapshots[i].endVersion, cx)); ASSERT(rest.present()); } @@ -1820,14 +1826,16 @@ ACTOR Future testBackupContainer(std::string url, } TEST_CASE("/backup/containers/localdir/unencrypted") { - wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {}, {})); + wait(testBackupContainer( + format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {}, {}, {})); return Void(); } TEST_CASE("/backup/containers/localdir/encrypted") { wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {}, - format("%s/test_encryption_key", params.getDataDir().c_str()))); + format("%s/test_encryption_key", params.getDataDir().c_str()), + {})); return Void(); } @@ -1835,7 +1843,7 @@ TEST_CASE("/backup/containers/url") { if (!g_network->isSimulated()) { const char* url = getenv("FDB_TEST_BACKUP_URL"); ASSERT(url != nullptr); - wait(testBackupContainer(url, {}, {})); + wait(testBackupContainer(url, {}, {}, {})); } return Void(); } diff --git a/fdbclient/BackupContainerLocalDirectory.actor.cpp b/fdbclient/BackupContainerLocalDirectory.actor.cpp index 528910dabc..51abc24678 100644 --- a/fdbclient/BackupContainerLocalDirectory.actor.cpp +++ b/fdbclient/BackupContainerLocalDirectory.actor.cpp @@ -103,16 +103,15 @@ ACTOR static Future listFiles_impl(st // Remove .lnk files from results, they are a side effect of a backup that was *read* during simulation. See // openFile() above for more info on why they are created. if (g_network->isSimulated()) - files.erase( - std::remove_if(files.begin(), - files.end(), - [](std::string const& f) { return StringRef(f).endsWith(LiteralStringRef(".lnk")); }), - files.end()); + files.erase(std::remove_if(files.begin(), + files.end(), + [](std::string const& f) { return StringRef(f).endsWith(".lnk"_sr); }), + files.end()); for (const auto& f : files) { // Hide .part or .temp files. StringRef s(f); - if (!s.endsWith(LiteralStringRef(".part")) && !s.endsWith(LiteralStringRef(".temp"))) + if (!s.endsWith(".part"_sr) && !s.endsWith(".temp"_sr)) results.push_back({ f.substr(m_path.size() + 1), ::fileSize(f) }); } @@ -227,10 +226,10 @@ Future> BackupContainerLocalDirectory::readFile(const std: throw file_not_found(); } - if (g_simulator.getCurrentProcess()->uid == UID()) { + if (g_simulator->getCurrentProcess()->uid == UID()) { TraceEvent(SevError, "BackupContainerReadFileOnUnsetProcessID").log(); } - std::string uniquePath = fullPath + "." + g_simulator.getCurrentProcess()->uid.toString() + ".lnk"; + std::string uniquePath = fullPath + "." 
+ g_simulator->getCurrentProcess()->uid.toString() + ".lnk"; unlink(uniquePath.c_str()); ASSERT(symlink(basename(path).c_str(), uniquePath.c_str()) == 0); fullPath = uniquePath; diff --git a/flow/BlobCipher.cpp b/fdbclient/BlobCipher.cpp similarity index 55% rename from flow/BlobCipher.cpp rename to fdbclient/BlobCipher.cpp index 123f63fef0..09a8bbd5ee 100644 --- a/flow/BlobCipher.cpp +++ b/fdbclient/BlobCipher.cpp @@ -18,8 +18,9 @@ * limitations under the License. */ -#include "flow/BlobCipher.h" +#include "fdbclient/BlobCipher.h" +#include "fdbclient/Knobs.h" #include "flow/Arena.h" #include "flow/EncryptUtils.h" #include "flow/Knobs.h" @@ -27,6 +28,7 @@ #include "flow/FastRef.h" #include "flow/IRandom.h" #include "flow/ITrace.h" +#include "flow/Platform.h" #include "flow/flow.h" #include "flow/network.h" #include "flow/Trace.h" @@ -48,11 +50,41 @@ #define BLOB_CIPHER_DEBUG false -namespace { -bool isEncryptHeaderAuthTokenModeValid(const EncryptAuthTokenMode mode) { - return mode >= ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && mode < ENCRYPT_HEADER_AUTH_TOKEN_LAST; +// BlobCipherMetrics methods + +BlobCipherMetrics::CounterSet::CounterSet(CounterCollection& cc, std::string name) + : encryptCPUTimeNS(name + "EncryptCPUTimeNS", cc), decryptCPUTimeNS(name + "DecryptCPUTimeNS", cc), + getCipherKeysLatency(name + "GetCipherKeysLatency", + UID(), + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE), + getLatestCipherKeysLatency(name + "GetLatestCipherKeysLatency", + UID(), + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE) {} + +BlobCipherMetrics::BlobCipherMetrics() + : cc("BlobCipher"), cipherKeyCacheHit("CipherKeyCacheHit", cc), cipherKeyCacheMiss("CipherKeyCacheMiss", cc), + cipherKeyCacheExpired("CipherKeyCacheExpired", cc), latestCipherKeyCacheHit("LatestCipherKeyCacheHit", cc), + latestCipherKeyCacheMiss("LatestCipherKeyCacheMiss", cc), + latestCipherKeyCacheNeedsRefresh("LatestCipherKeyCacheNeedsRefresh", cc), + getCipherKeysLatency("GetCipherKeysLatency", + UID(), + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE), + getLatestCipherKeysLatency("GetLatestCipherKeysLatency", + UID(), + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE), + counterSets({ CounterSet(cc, "TLog"), + CounterSet(cc, "KVMemory"), + CounterSet(cc, "KVRedwood"), + CounterSet(cc, "BlobGranule"), + CounterSet(cc, "Backup"), + CounterSet(cc, "Test") }) { + specialCounter(cc, "CacheSize", []() { return BlobCipherKeyCache::getInstance()->getSize(); }); + traceFuture = traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, &cc); } -} // namespace // BlobCipherKey class methods @@ -114,7 +146,7 @@ void BlobCipherKey::initKey(const EncryptCipherDomainId& domainId, expireAtTS = expireAt; #if BLOB_CIPHER_DEBUG - TraceEvent(SevDebug, "BlobCipher.KeyInit") + TraceEvent(SevDebug, "BlobCipherKeyInit") .detail("DomainId", domainId) .detail("BaseCipherId", baseCipherId) .detail("BaseCipherLen", baseCipherLen) @@ -130,10 +162,10 @@ void BlobCipherKey::applyHmacSha256Derivation() { memcpy(&buf[0], baseCipher.get(), baseCipherLen); memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(EncryptCipherRandomSalt)); HmacSha256DigestGen hmacGen(baseCipher.get(), baseCipherLen); - StringRef digest = hmacGen.digest(&buf[0], baseCipherLen + sizeof(EncryptCipherRandomSalt), arena); - std::copy(digest.begin(), 
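BlobCipherMetrics in this hunk keeps one CounterSet per usage type (TLog, KVMemory, KVRedwood, BlobGranule, Backup, Test) so encrypt/decrypt CPU time can be attributed to the subsystem performing the operation. A rough standalone sketch of that per-usage-type layout; the Counter/LatencySample machinery and trace publishing are replaced here with plain atomics:

```cpp
#include <array>
#include <atomic>
#include <cstdint>

// Mirrors the usage types listed in the BlobCipherMetrics constructor above.
enum class UsageType { TLog, KVMemory, KVRedwood, BlobGranule, Backup, Test, Max };

struct CounterSet {
    std::atomic<int64_t> encryptCPUTimeNS{ 0 };
    std::atomic<int64_t> decryptCPUTimeNS{ 0 };
};

// One CounterSet per usage type, owned by a process-wide singleton, so a call site
// can do counters(usageType).encryptCPUTimeNS += elapsedNs and attribute cost per subsystem.
class MetricsSketch {
public:
    static MetricsSketch& instance() {
        static MetricsSketch m;
        return m;
    }
    CounterSet& counters(UsageType t) { return sets[static_cast<size_t>(t)]; }

private:
    std::array<CounterSet, static_cast<size_t>(UsageType::Max)> sets;
};
```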
digest.end(), cipher.get()); - if (digest.size() < AES_256_KEY_LENGTH) { - memcpy(cipher.get() + digest.size(), buf, AES_256_KEY_LENGTH - digest.size()); + unsigned int digestLen = hmacGen.digest( + { { &buf[0], baseCipherLen + sizeof(EncryptCipherRandomSalt) } }, cipher.get(), AUTH_TOKEN_HMAC_SHA_SIZE); + if (digestLen < AES_256_KEY_LENGTH) { + memcpy(cipher.get() + digestLen, buf, AES_256_KEY_LENGTH - digestLen); } } @@ -144,17 +176,15 @@ void BlobCipherKey::reset() { // BlobKeyIdCache class methods -BlobCipherKeyIdCache::BlobCipherKeyIdCache() - : domainId(ENCRYPT_INVALID_DOMAIN_ID), latestBaseCipherKeyId(), latestRandomSalt() {} - -BlobCipherKeyIdCache::BlobCipherKeyIdCache(EncryptCipherDomainId dId) - : domainId(dId), latestBaseCipherKeyId(), latestRandomSalt() { - TraceEvent(SevInfo, "BlobCipher.KeyIdCacheInit").detail("DomainId", domainId); +BlobCipherKeyIdCache::BlobCipherKeyIdCache(EncryptCipherDomainId dId, size_t* sizeStat) + : domainId(dId), latestBaseCipherKeyId(), latestRandomSalt(), sizeStat(sizeStat) { + ASSERT(sizeStat != nullptr); + TraceEvent(SevInfo, "BlobCipherKeyIdCacheInit").detail("DomainId", domainId); } BlobCipherKeyIdCacheKey BlobCipherKeyIdCache::getCacheKey(const EncryptCipherBaseKeyId& baseCipherKeyId, const EncryptCipherRandomSalt& salt) { - if (baseCipherKeyId == ENCRYPT_INVALID_CIPHER_KEY_ID || salt == ENCRYPT_INVALID_RANDOM_SALT) { + if (baseCipherKeyId == INVALID_ENCRYPT_CIPHER_KEY_ID || salt == INVALID_ENCRYPT_RANDOM_SALT) { throw encrypt_invalid_id(); } return std::make_pair(baseCipherKeyId, salt); @@ -164,9 +194,9 @@ Reference BlobCipherKeyIdCache::getLatestCipherKey() { if (!latestBaseCipherKeyId.present()) { return Reference(); } - ASSERT_NE(latestBaseCipherKeyId.get(), ENCRYPT_INVALID_CIPHER_KEY_ID); + ASSERT_NE(latestBaseCipherKeyId.get(), INVALID_ENCRYPT_CIPHER_KEY_ID); ASSERT(latestRandomSalt.present()); - ASSERT_NE(latestRandomSalt.get(), ENCRYPT_INVALID_RANDOM_SALT); + ASSERT_NE(latestRandomSalt.get(), INVALID_ENCRYPT_RANDOM_SALT); return getCipherByBaseCipherId(latestBaseCipherKeyId.get(), latestRandomSalt.get()); } @@ -185,7 +215,7 @@ Reference BlobCipherKeyIdCache::insertBaseCipherKey(const Encrypt int baseCipherLen, const int64_t refreshAt, const int64_t expireAt) { - ASSERT_GT(baseCipherId, ENCRYPT_INVALID_CIPHER_KEY_ID); + ASSERT_GT(baseCipherId, INVALID_ENCRYPT_CIPHER_KEY_ID); // BaseCipherKeys are immutable, given the routine invocation updates 'latestCipher', // ensure no key-tampering is done @@ -193,7 +223,7 @@ Reference BlobCipherKeyIdCache::insertBaseCipherKey(const Encrypt if (latestCipherKey.isValid() && latestCipherKey->getBaseCipherId() == baseCipherId) { if (memcmp(latestCipherKey->rawBaseCipher(), baseCipher, baseCipherLen) == 0) { #if BLOB_CIPHER_DEBUG - TraceEvent(SevDebug, "InsertBaseCipherKey_AlreadyPresent") + TraceEvent(SevDebug, "InsertBaseCipherKeyAlreadyPresent") .detail("BaseCipherKeyId", baseCipherId) .detail("DomainId", domainId); #endif @@ -201,14 +231,14 @@ Reference BlobCipherKeyIdCache::insertBaseCipherKey(const Encrypt // Key is already present; nothing more to do. 
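The reworked applyHmacSha256Derivation at the start of this hunk derives the cached AES key by HMAC-ing {baseCipher || randomSalt}, keyed with the base cipher itself, writing the digest directly into the key buffer and padding from the input only if the digest were shorter than AES_256_KEY_LENGTH. A minimal OpenSSL sketch of that derivation step (HMAC-SHA256 already yields the full 32 bytes, so the padding branch is omitted; error handling trimmed):

```cpp
#include <openssl/evp.h>
#include <openssl/hmac.h>

#include <cstdint>
#include <cstring>
#include <vector>

// Derive a 32-byte AES-256 key from base key material and a random salt, roughly
// the scheme used by BlobCipherKey::applyHmacSha256Derivation().
void deriveAes256Key(const uint8_t* baseKey, size_t baseKeyLen, uint64_t salt, uint8_t out[32]) {
    std::vector<uint8_t> buf(baseKeyLen + sizeof(salt));
    std::memcpy(buf.data(), baseKey, baseKeyLen);
    std::memcpy(buf.data() + baseKeyLen, &salt, sizeof(salt));

    unsigned int digestLen = 0;
    // HMAC keyed with the base cipher, computed over {baseKey || salt}.
    HMAC(EVP_sha256(), baseKey, static_cast<int>(baseKeyLen), buf.data(), buf.size(), out, &digestLen);
    // SHA-256 digest length (32) equals the AES-256 key length, so `out` is fully populated.
}
```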
return latestCipherKey; } else { - TraceEvent(SevInfo, "BlobCipher.UpdatetBaseCipherKey") + TraceEvent(SevInfo, "BlobCipherUpdatetBaseCipherKey") .detail("BaseCipherKeyId", baseCipherId) .detail("DomainId", domainId); throw encrypt_update_cipher(); } } - TraceEvent(SevInfo, "BlobCipherKey.InsertBaseCipherKeyLatest") + TraceEvent(SevInfo, "BlobCipherKeyInsertBaseCipherKeyLatest") .detail("DomainId", domainId) .detail("BaseCipherId", baseCipherId) .detail("RefreshAt", refreshAt) @@ -223,6 +253,7 @@ Reference BlobCipherKeyIdCache::insertBaseCipherKey(const Encrypt latestBaseCipherKeyId = baseCipherId; latestRandomSalt = cipherKey->getSalt(); + (*sizeStat)++; return cipherKey; } @@ -232,8 +263,8 @@ Reference BlobCipherKeyIdCache::insertBaseCipherKey(const Encrypt const EncryptCipherRandomSalt& salt, const int64_t refreshAt, const int64_t expireAt) { - ASSERT_NE(baseCipherId, ENCRYPT_INVALID_CIPHER_KEY_ID); - ASSERT_NE(salt, ENCRYPT_INVALID_RANDOM_SALT); + ASSERT_NE(baseCipherId, INVALID_ENCRYPT_CIPHER_KEY_ID); + ASSERT_NE(salt, INVALID_ENCRYPT_RANDOM_SALT); BlobCipherKeyIdCacheKey cacheKey = getCacheKey(baseCipherId, salt); @@ -242,7 +273,7 @@ Reference BlobCipherKeyIdCache::insertBaseCipherKey(const Encrypt if (itr != keyIdCache.end()) { if (memcmp(itr->second->rawBaseCipher(), baseCipher, baseCipherLen) == 0) { #if BLOB_CIPHER_DEBUG - TraceEvent(SevDebug, "InsertBaseCipherKey_AlreadyPresent") + TraceEvent(SevDebug, "InsertBaseCipherKeyAlreadyPresent") .detail("BaseCipherKeyId", baseCipherId) .detail("DomainId", domainId); #endif @@ -250,14 +281,14 @@ Reference BlobCipherKeyIdCache::insertBaseCipherKey(const Encrypt // Key is already present; nothing more to do. return itr->second; } else { - TraceEvent(SevInfo, "BlobCipher.UpdateBaseCipherKey") + TraceEvent(SevInfo, "BlobCipherUpdateBaseCipherKey") .detail("BaseCipherKeyId", baseCipherId) .detail("DomainId", domainId); throw encrypt_update_cipher(); } } - TraceEvent(SevInfo, "BlobCipherKey.InsertBaseCipherKey") + TraceEvent(SevInfo, "BlobCipherKeyInsertBaseCipherKey") .detail("DomainId", domainId) .detail("BaseCipherId", baseCipherId) .detail("Salt", salt) @@ -267,6 +298,7 @@ Reference BlobCipherKeyIdCache::insertBaseCipherKey(const Encrypt Reference cipherKey = makeReference(domainId, baseCipherId, baseCipher, baseCipherLen, salt, refreshAt, expireAt); keyIdCache.emplace(cacheKey, cipherKey); + (*sizeStat)++; return cipherKey; } @@ -294,30 +326,31 @@ Reference BlobCipherKeyCache::insertCipherKey(const EncryptCipher int baseCipherLen, const int64_t refreshAt, const int64_t expireAt) { - if (domainId == ENCRYPT_INVALID_DOMAIN_ID || baseCipherId == ENCRYPT_INVALID_CIPHER_KEY_ID) { + if (domainId == INVALID_ENCRYPT_DOMAIN_ID || baseCipherId == INVALID_ENCRYPT_CIPHER_KEY_ID) { throw encrypt_invalid_id(); } + Reference cipherKey; + try { auto domainItr = domainCacheMap.find(domainId); if (domainItr == domainCacheMap.end()) { // Add mapping to track new encryption domain - Reference keyIdCache = makeReference(domainId); - Reference cipherKey = - keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen, refreshAt, expireAt); + Reference keyIdCache = makeReference(domainId, &size); + cipherKey = keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen, refreshAt, expireAt); domainCacheMap.emplace(domainId, keyIdCache); - return cipherKey; } else { // Track new baseCipher keys Reference keyIdCache = domainItr->second; - return keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen, refreshAt, expireAt); 
+ cipherKey = keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen, refreshAt, expireAt); } } catch (Error& e) { - TraceEvent(SevWarn, "BlobCipher.InsertCipherKeyFailed") + TraceEvent(SevWarn, "BlobCipherInsertCipherKeyFailed") .detail("BaseCipherKeyId", baseCipherId) .detail("DomainId", domainId); throw; } + return cipherKey; } Reference BlobCipherKeyCache::insertCipherKey(const EncryptCipherDomainId& domainId, @@ -327,8 +360,8 @@ Reference BlobCipherKeyCache::insertCipherKey(const EncryptCipher const EncryptCipherRandomSalt& salt, const int64_t refreshAt, const int64_t expireAt) { - if (domainId == ENCRYPT_INVALID_DOMAIN_ID || baseCipherId == ENCRYPT_INVALID_CIPHER_KEY_ID || - salt == ENCRYPT_INVALID_RANDOM_SALT) { + if (domainId == INVALID_ENCRYPT_DOMAIN_ID || baseCipherId == INVALID_ENCRYPT_CIPHER_KEY_ID || + salt == INVALID_ENCRYPT_RANDOM_SALT) { throw encrypt_invalid_id(); } @@ -337,7 +370,7 @@ Reference BlobCipherKeyCache::insertCipherKey(const EncryptCipher auto domainItr = domainCacheMap.find(domainId); if (domainItr == domainCacheMap.end()) { // Add mapping to track new encryption domain - Reference keyIdCache = makeReference(domainId); + Reference keyIdCache = makeReference(domainId, &size); cipherKey = keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen, salt, refreshAt, expireAt); domainCacheMap.emplace(domainId, keyIdCache); @@ -348,24 +381,23 @@ Reference BlobCipherKeyCache::insertCipherKey(const EncryptCipher keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen, salt, refreshAt, expireAt); } } catch (Error& e) { - TraceEvent(SevWarn, "BlobCipher.InsertCipherKey_Failed") + TraceEvent(SevWarn, "BlobCipherInsertCipherKey_Failed") .detail("BaseCipherKeyId", baseCipherId) .detail("DomainId", domainId) .detail("Salt", salt); throw; } - return cipherKey; } Reference BlobCipherKeyCache::getLatestCipherKey(const EncryptCipherDomainId& domainId) { - if (domainId == ENCRYPT_INVALID_DOMAIN_ID) { - TraceEvent(SevWarn, "BlobCipher.GetLatestCipherKeyInvalidID").detail("DomainId", domainId); + if (domainId == INVALID_ENCRYPT_DOMAIN_ID) { + TraceEvent(SevWarn, "BlobCipherGetLatestCipherKeyInvalidID").detail("DomainId", domainId); throw encrypt_invalid_id(); } auto domainItr = domainCacheMap.find(domainId); if (domainItr == domainCacheMap.end()) { - TraceEvent(SevInfo, "BlobCipher.GetLatestCipherKeyDomainNotFound").detail("DomainId", domainId); + TraceEvent(SevInfo, "BlobCipherGetLatestCipherKeyDomainNotFound").detail("DomainId", domainId); return Reference(); } @@ -373,15 +405,20 @@ Reference BlobCipherKeyCache::getLatestCipherKey(const EncryptCip Reference cipherKey = keyIdCache->getLatestCipherKey(); // Ensure 'freshness' guarantees for the latestCipher - if (cipherKey.isValid() && cipherKey->needsRefresh()) { + if (cipherKey.isValid()) { + if (cipherKey->needsRefresh()) { #if BLOB_CIPHER_DEBUG - TraceEvent("SevDebug, BlobCipher.GetLatestNeedsRefresh") - .detail("DomainId", domainId) - .detail("Now", now()) - .detail("RefreshAt", cipherKey->getRefreshAtTS()); + TraceEvent("SevDebug, BlobCipherGetLatestNeedsRefresh") + .detail("DomainId", domainId) + .detail("Now", now()) + .detail("RefreshAt", cipherKey->getRefreshAtTS()); #endif - - return Reference(); + ++BlobCipherMetrics::getInstance()->latestCipherKeyCacheNeedsRefresh; + return Reference(); + } + ++BlobCipherMetrics::getInstance()->latestCipherKeyCacheHit; + } else { + ++BlobCipherMetrics::getInstance()->latestCipherKeyCacheMiss; } return cipherKey; @@ -399,16 +436,21 @@ 
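getLatestCipherKey above treats a cached-but-stale key as a miss: when needsRefresh() fires it bumps latestCipherKeyCacheNeedsRefresh and returns an empty reference so the caller fetches a fresh key, and otherwise records a cache hit or miss. A toy sketch of that refresh-aware lookup, with stand-in types for the key cache and counters:

```cpp
#include <cstdint>
#include <memory>
#include <unordered_map>

// Toy stand-ins for the cached key and the cache-health counters.
struct CachedKey {
    int64_t refreshAt; // time after which the key should be re-fetched
};
struct Counters {
    int64_t hit = 0, miss = 0, needsRefresh = 0;
};

// Sketch of the getLatestCipherKey() freshness policy: a stale entry is reported as
// empty (forcing a re-fetch) and a dedicated counter records that the miss was due
// to refresh rather than absence.
std::shared_ptr<CachedKey> getLatest(const std::unordered_map<int64_t, std::shared_ptr<CachedKey>>& cache,
                                     int64_t domainId,
                                     int64_t now,
                                     Counters& counters) {
    auto it = cache.find(domainId);
    if (it == cache.end()) {
        ++counters.miss;
        return nullptr;
    }
    if (now >= it->second->refreshAt) {
        ++counters.needsRefresh;
        return nullptr; // caller must fetch a fresh key
    }
    ++counters.hit;
    return it->second;
}
```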
Reference BlobCipherKeyCache::getCipherKey(const EncryptCipherDom Reference cipherKey = keyIdCache->getCipherByBaseCipherId(baseCipherId, salt); // Ensure 'liveness' guarantees for the cipher - if (cipherKey.isValid() && cipherKey->isExpired()) { + if (cipherKey.isValid()) { + if (cipherKey->isExpired()) { #if BLOB_CIPHER_DEBUG - TraceEvent(SevDebug, "BlobCipher.GetCipherExpired") - .detail("DomainId", domainId) - .detail("BaseCipherId", baseCipherId) - .detail("Now", now()) - .detail("ExpireAt", cipherKey->getExpireAtTS()); + TraceEvent(SevDebug, "BlobCipherGetCipherExpired") + .detail("DomainId", domainId) + .detail("BaseCipherId", baseCipherId) + .detail("Now", now()) + .detail("ExpireAt", cipherKey->getExpireAtTS()); #endif - - return Reference(); + ++BlobCipherMetrics::getInstance()->cipherKeyCacheExpired; + return Reference(); + } + ++BlobCipherMetrics::getInstance()->cipherKeyCacheHit; + } else { + ++BlobCipherMetrics::getInstance()->cipherKeyCacheMiss; } return cipherKey; @@ -421,22 +463,25 @@ void BlobCipherKeyCache::resetEncryptDomainId(const EncryptCipherDomainId domain } Reference keyIdCache = domainItr->second; + ASSERT(keyIdCache->getSize() <= size); + size -= keyIdCache->getSize(); keyIdCache->cleanup(); - TraceEvent(SevInfo, "BlobCipher.ResetEncryptDomainId").detail("DomainId", domainId); + TraceEvent(SevInfo, "BlobCipherResetEncryptDomainId").detail("DomainId", domainId); } void BlobCipherKeyCache::cleanup() noexcept { Reference instance = BlobCipherKeyCache::getInstance(); - TraceEvent(SevInfo, "BlobCipherKeyCache.Cleanup").log(); + TraceEvent(SevInfo, "BlobCipherKeyCacheCleanup").log(); for (auto& domainItr : instance->domainCacheMap) { Reference keyIdCache = domainItr.second; keyIdCache->cleanup(); - TraceEvent(SevInfo, "BlobCipher.KeyCacheCleanup").detail("DomainId", domainItr.first); + TraceEvent(SevInfo, "BlobCipherKeyCacheCleanup").detail("DomainId", domainItr.first); } instance->domainCacheMap.clear(); + instance->size = 0; } std::vector> BlobCipherKeyCache::getAllCiphers(const EncryptCipherDomainId& domainId) { @@ -455,9 +500,25 @@ EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference Reference hCipherKey, const uint8_t* cipherIV, const int ivLen, - const EncryptAuthTokenMode mode) - : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode) { - ASSERT(isEncryptHeaderAuthTokenModeValid(mode)); + const EncryptAuthTokenMode mode, + BlobCipherMetrics::UsageType usageType) + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode), + usageType(usageType) { + ASSERT_EQ(ivLen, AES_256_IV_LENGTH); + authTokenAlgo = getAuthTokenAlgoFromMode(authTokenMode); + memcpy(&iv[0], cipherIV, ivLen); + init(); +} + +EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference tCipherKey, + Reference hCipherKey, + const uint8_t* cipherIV, + const int ivLen, + const EncryptAuthTokenMode mode, + const EncryptAuthTokenAlgo algo, + BlobCipherMetrics::UsageType usageType) + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode), + authTokenAlgo(algo), usageType(usageType) { ASSERT_EQ(ivLen, AES_256_IV_LENGTH); memcpy(&iv[0], cipherIV, ivLen); init(); @@ -465,14 +526,37 @@ EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference tCipherKey, Reference hCipherKey, - const EncryptAuthTokenMode mode) - : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), 
headerCipherKey(hCipherKey), authTokenMode(mode) { - ASSERT(isEncryptHeaderAuthTokenModeValid(mode)); + const EncryptAuthTokenMode mode, + BlobCipherMetrics::UsageType usageType) + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode), + usageType(usageType) { + authTokenAlgo = getAuthTokenAlgoFromMode(authTokenMode); + deterministicRandom()->randomBytes(iv, AES_256_IV_LENGTH); + init(); +} + +EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference tCipherKey, + Reference hCipherKey, + const EncryptAuthTokenMode mode, + const EncryptAuthTokenAlgo algo, + BlobCipherMetrics::UsageType usageType) + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode), + usageType(usageType) { deterministicRandom()->randomBytes(iv, AES_256_IV_LENGTH); init(); } void EncryptBlobCipherAes265Ctr::init() { + ASSERT(textCipherKey.isValid()); + ASSERT(headerCipherKey.isValid()); + + if (!isEncryptHeaderAuthTokenDetailsValid(authTokenMode, authTokenAlgo)) { + TraceEvent(SevWarn, "InvalidAuthTokenDetails") + .detail("TokenMode", authTokenMode) + .detail("TokenAlgo", authTokenAlgo); + throw internal_error(); + } + if (ctx == nullptr) { throw encrypt_ops_error(); } @@ -488,21 +572,23 @@ Reference EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte const int plaintextLen, BlobCipherEncryptHeader* header, Arena& arena) { - CODE_PROBE(true, "Encrypting data with BlobCipher"); + double startTime = 0.0; + if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) { + startTime = timer_monotonic(); + } memset(reinterpret_cast(header), 0, sizeof(BlobCipherEncryptHeader)); // Alloc buffer computation accounts for 'header authentication' generation scheme. If single-auth-token needs // to be generated, allocate buffer sufficient to append header to the cipherText optimizing memcpy cost. - const int allocSize = authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE - ? 
plaintextLen + AES_BLOCK_SIZE + sizeof(BlobCipherEncryptHeader) - : plaintextLen + AES_BLOCK_SIZE; + const int allocSize = plaintextLen + AES_BLOCK_SIZE; Reference encryptBuf = makeReference(allocSize, arena); uint8_t* ciphertext = encryptBuf->begin(); + int bytes{ 0 }; if (EVP_EncryptUpdate(ctx, ciphertext, &bytes, plaintext, plaintextLen) != 1) { - TraceEvent(SevWarn, "BlobCipher.EncryptUpdateFailed") + TraceEvent(SevWarn, "BlobCipherEncryptUpdateFailed") .detail("BaseCipherId", textCipherKey->getBaseCipherId()) .detail("EncryptDomainId", textCipherKey->getDomainId()); throw encrypt_ops_error(); @@ -510,14 +596,14 @@ Reference EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte int finalBytes{ 0 }; if (EVP_EncryptFinal_ex(ctx, ciphertext + bytes, &finalBytes) != 1) { - TraceEvent(SevWarn, "BlobCipher.EncryptFinalFailed") + TraceEvent(SevWarn, "BlobCipherEncryptFinalFailed") .detail("BaseCipherId", textCipherKey->getBaseCipherId()) .detail("EncryptDomainId", textCipherKey->getDomainId()); throw encrypt_ops_error(); } if ((bytes + finalBytes) != plaintextLen) { - TraceEvent(SevWarn, "BlobCipher.EncryptUnexpectedCipherLen") + TraceEvent(SevWarn, "BlobCipherEncryptUnexpectedCipherLen") .detail("PlaintextLen", plaintextLen) .detail("EncryptedBufLen", bytes + finalBytes); throw encrypt_ops_error(); @@ -528,86 +614,75 @@ Reference EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte header->flags.headerVersion = EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION; header->flags.encryptMode = ENCRYPT_CIPHER_MODE_AES_256_CTR; header->flags.authTokenMode = authTokenMode; + header->flags.authTokenAlgo = authTokenAlgo; + + // Ensure encryption header authToken details sanity + ASSERT(isEncryptHeaderAuthTokenDetailsValid(authTokenMode, authTokenAlgo)); // Populate cipherText encryption-key details header->cipherTextDetails.baseCipherId = textCipherKey->getBaseCipherId(); header->cipherTextDetails.encryptDomainId = textCipherKey->getDomainId(); header->cipherTextDetails.salt = textCipherKey->getSalt(); + // Populate header encryption-key details + // TODO: HeaderCipherKey is not necessary if AuthTokenMode == NONE + header->cipherHeaderDetails.encryptDomainId = headerCipherKey->getDomainId(); + header->cipherHeaderDetails.baseCipherId = headerCipherKey->getBaseCipherId(); + header->cipherHeaderDetails.salt = headerCipherKey->getSalt(); + memcpy(&header->iv[0], &iv[0], AES_256_IV_LENGTH); - if (authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { + if (authTokenMode == EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { // No header 'authToken' generation needed. 
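The encrypt() body above follows the usual OpenSSL EVP sequence: EVP_EncryptUpdate over the plaintext, EVP_EncryptFinal_ex, then a check that CTR mode produced exactly plaintextLen ciphertext bytes before the header is populated. A self-contained sketch of that sequence, independent of the FDB wrapper, with key/IV handling reduced to raw buffers and error paths reduced to exceptions:

```cpp
#include <openssl/evp.h>

#include <cstdint>
#include <stdexcept>
#include <vector>

// Minimal AES-256-CTR encryption mirroring the Update/Final/length-check flow above.
std::vector<uint8_t> aes256CtrEncrypt(const uint8_t key[32], const uint8_t iv[16],
                                      const uint8_t* plaintext, int plaintextLen) {
    EVP_CIPHER_CTX* ctx = EVP_CIPHER_CTX_new();
    if (ctx == nullptr || EVP_EncryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, key, iv) != 1)
        throw std::runtime_error("EncryptInit failed");

    std::vector<uint8_t> ciphertext(plaintextLen + EVP_MAX_BLOCK_LENGTH);
    int bytes = 0, finalBytes = 0;
    if (EVP_EncryptUpdate(ctx, ciphertext.data(), &bytes, plaintext, plaintextLen) != 1 ||
        EVP_EncryptFinal_ex(ctx, ciphertext.data() + bytes, &finalBytes) != 1)
        throw std::runtime_error("Encrypt failed");
    EVP_CIPHER_CTX_free(ctx);

    // CTR is a stream mode, so the ciphertext must be exactly as long as the plaintext.
    if (bytes + finalBytes != plaintextLen)
        throw std::runtime_error("unexpected ciphertext length");
    ciphertext.resize(plaintextLen);
    return ciphertext;
}
```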
} else { - // Populate header encryption-key details - header->cipherHeaderDetails.encryptDomainId = headerCipherKey->getDomainId(); - header->cipherHeaderDetails.baseCipherId = headerCipherKey->getBaseCipherId(); - header->cipherHeaderDetails.salt = headerCipherKey->getSalt(); // Populate header authToken details - if (header->flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) { - ASSERT_GE(allocSize, (bytes + finalBytes + sizeof(BlobCipherEncryptHeader))); - ASSERT_GE(encryptBuf->getLogicalSize(), (bytes + finalBytes + sizeof(BlobCipherEncryptHeader))); + if (header->flags.authTokenMode == EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) { + ASSERT_GE(allocSize, (bytes + finalBytes)); + ASSERT_GE(encryptBuf->getLogicalSize(), (bytes + finalBytes)); - memcpy(&ciphertext[bytes + finalBytes], - reinterpret_cast(header), - sizeof(BlobCipherEncryptHeader)); - StringRef authToken = computeAuthToken(ciphertext, - bytes + finalBytes + sizeof(BlobCipherEncryptHeader), - headerCipherKey->rawCipher(), - AES_256_KEY_LENGTH, - arena); - memcpy(&header->singleAuthToken.authToken[0], authToken.begin(), AUTH_TOKEN_SIZE); + computeAuthToken({ { ciphertext, bytes + finalBytes }, + { reinterpret_cast(header), sizeof(BlobCipherEncryptHeader) } }, + headerCipherKey->rawCipher(), + AES_256_KEY_LENGTH, + &header->singleAuthToken.authToken[0], + (EncryptAuthTokenAlgo)header->flags.authTokenAlgo, + AUTH_TOKEN_MAX_SIZE); } else { - ASSERT_EQ(header->flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + ASSERT_EQ(header->flags.authTokenMode, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); - StringRef cipherTextAuthToken = - computeAuthToken(ciphertext, - bytes + finalBytes, - reinterpret_cast(&header->cipherTextDetails.salt), - sizeof(EncryptCipherRandomSalt), - arena); - memcpy(&header->multiAuthTokens.cipherTextAuthToken[0], cipherTextAuthToken.begin(), AUTH_TOKEN_SIZE); - StringRef headerAuthToken = computeAuthToken(reinterpret_cast(header), - sizeof(BlobCipherEncryptHeader), - headerCipherKey->rawCipher(), - AES_256_KEY_LENGTH, - arena); - memcpy(&header->multiAuthTokens.headerAuthToken[0], headerAuthToken.begin(), AUTH_TOKEN_SIZE); + // TOOD: Use HMAC_SHA encyrption authentication scheme as AES_CMAC needs minimum 16 bytes cipher key + computeAuthToken({ { ciphertext, bytes + finalBytes } }, + reinterpret_cast(&header->cipherTextDetails.salt), + sizeof(EncryptCipherRandomSalt), + &header->multiAuthTokens.cipherTextAuthToken[0], + EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, + AUTH_TOKEN_MAX_SIZE); + computeAuthToken({ { reinterpret_cast(header), sizeof(BlobCipherEncryptHeader) } }, + headerCipherKey->rawCipher(), + AES_256_KEY_LENGTH, + &header->multiAuthTokens.headerAuthToken[0], + (EncryptAuthTokenAlgo)header->flags.authTokenAlgo, + AUTH_TOKEN_MAX_SIZE); } } encryptBuf->setLogicalSize(plaintextLen); + + if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) { + BlobCipherMetrics::counters(usageType).encryptCPUTimeNS += int64_t((timer_monotonic() - startTime) * 1e9); + } + + CODE_PROBE(true, "BlobCipher data encryption"); + CODE_PROBE(header->flags.authTokenAlgo == EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE, + "Encryption authentication disabled"); + CODE_PROBE(header->flags.authTokenAlgo == EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, + "HMAC_SHA Auth token generation"); + CODE_PROBE(header->flags.authTokenAlgo == EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC, + "AES_CMAC Auth token generation"); + 
return encryptBuf; } -Standalone EncryptBlobCipherAes265Ctr::encryptBlobGranuleChunk(const uint8_t* plaintext, - const int plaintextLen) { - Standalone encrypted = makeString(plaintextLen); - uint8_t* ciphertext = mutateString(encrypted); - int bytes{ 0 }; - - if (EVP_EncryptUpdate(ctx, ciphertext, &bytes, plaintext, plaintextLen) != 1) { - TraceEvent(SevWarn, "BlobCipher.EncryptUpdateFailed") - .detail("BaseCipherId", textCipherKey->getBaseCipherId()) - .detail("EncryptDomainId", textCipherKey->getDomainId()); - throw encrypt_ops_error(); - } - int finalBytes{ 0 }; - if (EVP_EncryptFinal_ex(ctx, ciphertext + bytes, &finalBytes) != 1) { - TraceEvent(SevWarn, "BlobCipher.EncryptFinalFailed") - .detail("BaseCipherId", textCipherKey->getBaseCipherId()) - .detail("EncryptDomainId", textCipherKey->getDomainId()); - throw encrypt_ops_error(); - } - if ((bytes + finalBytes) != plaintextLen) { - TraceEvent(SevWarn, "BlobCipher.EncryptUnexpectedCipherLen") - .detail("PlaintextLen", plaintextLen) - .detail("EncryptedBufLen", bytes + finalBytes); - throw encrypt_ops_error(); - } - return encrypted; -} - EncryptBlobCipherAes265Ctr::~EncryptBlobCipherAes265Ctr() { if (ctx != nullptr) { EVP_CIPHER_CTX_free(ctx); @@ -618,9 +693,10 @@ EncryptBlobCipherAes265Ctr::~EncryptBlobCipherAes265Ctr() { DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference tCipherKey, Reference hCipherKey, - const uint8_t* iv) + const uint8_t* iv, + BlobCipherMetrics::UsageType usageType) : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), - headerAuthTokenValidationDone(false), authTokensValidationDone(false) { + headerAuthTokenValidationDone(false), authTokensValidationDone(false), usageType(usageType) { if (ctx == nullptr) { throw encrypt_ops_error(); } @@ -640,24 +716,30 @@ void DecryptBlobCipherAes256Ctr::verifyHeaderAuthToken(const BlobCipherEncryptHe } ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + ASSERT(isEncryptHeaderAuthTokenAlgoValid((EncryptAuthTokenAlgo)header.flags.authTokenAlgo)); BlobCipherEncryptHeader headerCopy; memcpy(reinterpret_cast(&headerCopy), reinterpret_cast(&header), sizeof(BlobCipherEncryptHeader)); - memset(reinterpret_cast(&headerCopy.multiAuthTokens.headerAuthToken), 0, AUTH_TOKEN_SIZE); - StringRef computedHeaderAuthToken = computeAuthToken(reinterpret_cast(&headerCopy), - sizeof(BlobCipherEncryptHeader), - headerCipherKey->rawCipher(), - AES_256_KEY_LENGTH, - arena); - if (memcmp(&header.multiAuthTokens.headerAuthToken[0], computedHeaderAuthToken.begin(), AUTH_TOKEN_SIZE) != 0) { - TraceEvent(SevWarn, "BlobCipher.VerifyEncryptBlobHeaderAuthTokenMismatch") + memset(reinterpret_cast(&headerCopy.multiAuthTokens.headerAuthToken), 0, AUTH_TOKEN_MAX_SIZE); + uint8_t computedHeaderAuthToken[AUTH_TOKEN_MAX_SIZE]{}; + computeAuthToken({ { reinterpret_cast(&headerCopy), sizeof(BlobCipherEncryptHeader) } }, + headerCipherKey->rawCipher(), + AES_256_KEY_LENGTH, + &computedHeaderAuthToken[0], + (EncryptAuthTokenAlgo)header.flags.authTokenAlgo, + AUTH_TOKEN_MAX_SIZE); + + int authTokenSize = getEncryptHeaderAuthTokenSize(header.flags.authTokenAlgo); + ASSERT_LE(authTokenSize, AUTH_TOKEN_MAX_SIZE); + if (memcmp(&header.multiAuthTokens.headerAuthToken[0], &computedHeaderAuthToken[0], authTokenSize) != 0) { + TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeaderAuthTokenMismatch") .detail("HeaderVersion", header.flags.headerVersion) .detail("HeaderMode", header.flags.encryptMode) .detail("MultiAuthHeaderAuthToken", - StringRef(arena, 
&header.multiAuthTokens.headerAuthToken[0], AUTH_TOKEN_SIZE).toString()) - .detail("ComputedHeaderAuthToken", computedHeaderAuthToken.toString()); + StringRef(arena, &header.multiAuthTokens.headerAuthToken[0], AUTH_TOKEN_MAX_SIZE).toString()) + .detail("ComputedHeaderAuthToken", StringRef(computedHeaderAuthToken, AUTH_TOKEN_MAX_SIZE)); throw encrypt_header_authtoken_mismatch(); } @@ -667,27 +749,35 @@ void DecryptBlobCipherAes256Ctr::verifyHeaderAuthToken(const BlobCipherEncryptHe void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeader& header, - uint8_t* buff, Arena& arena) { // Header authToken not set for single auth-token mode. ASSERT(!headerAuthTokenValidationDone); // prepare the payload {cipherText + encryptionHeader} - memcpy(&buff[0], ciphertext, ciphertextLen); - memcpy(&buff[ciphertextLen], reinterpret_cast(&header), sizeof(BlobCipherEncryptHeader)); // ensure the 'authToken' is reset before computing the 'authentication token' - BlobCipherEncryptHeader* eHeader = (BlobCipherEncryptHeader*)(&buff[ciphertextLen]); - memset(reinterpret_cast(&eHeader->singleAuthToken), 0, 2 * AUTH_TOKEN_SIZE); + BlobCipherEncryptHeader headerCopy; + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + memset(reinterpret_cast(&headerCopy.singleAuthToken), 0, 2 * AUTH_TOKEN_MAX_SIZE); + uint8_t computed[AUTH_TOKEN_MAX_SIZE]; + computeAuthToken({ { ciphertext, ciphertextLen }, + { reinterpret_cast(&headerCopy), sizeof(BlobCipherEncryptHeader) } }, + headerCipherKey->rawCipher(), + AES_256_KEY_LENGTH, + &computed[0], + (EncryptAuthTokenAlgo)header.flags.authTokenAlgo, + AUTH_TOKEN_MAX_SIZE); - StringRef computed = computeAuthToken( - buff, ciphertextLen + sizeof(BlobCipherEncryptHeader), headerCipherKey->rawCipher(), AES_256_KEY_LENGTH, arena); - if (memcmp(&header.singleAuthToken.authToken[0], computed.begin(), AUTH_TOKEN_SIZE) != 0) { - TraceEvent(SevWarn, "BlobCipher.VerifyEncryptBlobHeaderAuthTokenMismatch") + int authTokenSize = getEncryptHeaderAuthTokenSize(header.flags.authTokenAlgo); + ASSERT_LE(authTokenSize, AUTH_TOKEN_MAX_SIZE); + if (memcmp(&header.singleAuthToken.authToken[0], &computed[0], authTokenSize) != 0) { + TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeaderAuthTokenMismatch") .detail("HeaderVersion", header.flags.headerVersion) .detail("HeaderMode", header.flags.encryptMode) .detail("SingleAuthToken", - StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString()) - .detail("ComputedSingleAuthToken", computed.toString()); + StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_MAX_SIZE).toString()) + .detail("ComputedSingleAuthToken", StringRef(computed, AUTH_TOKEN_MAX_SIZE)); throw encrypt_header_authtoken_mismatch(); } } @@ -695,25 +785,26 @@ void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciph void DecryptBlobCipherAes256Ctr::verifyHeaderMultiAuthToken(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeader& header, - uint8_t* buff, Arena& arena) { if (!headerAuthTokenValidationDone) { verifyHeaderAuthToken(header, arena); } - StringRef computedCipherTextAuthToken = - computeAuthToken(ciphertext, - ciphertextLen, - reinterpret_cast(&header.cipherTextDetails.salt), - sizeof(EncryptCipherRandomSalt), - arena); - if (memcmp(&header.multiAuthTokens.cipherTextAuthToken[0], computedCipherTextAuthToken.begin(), AUTH_TOKEN_SIZE) != + uint8_t 
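verifyHeaderSingleAuthToken in this hunk now recomputes the token over {ciphertext || header copy with the stored token zeroed} and compares the first authTokenSize bytes against what the header carries. A compact sketch of that verify step using HMAC-SHA256 and a toy header type; the real code dispatches on the header's authTokenAlgo through computeAuthToken, and error handling is trimmed here:

```cpp
#include <openssl/evp.h>
#include <openssl/hmac.h>

#include <cstdint>
#include <cstring>

// Toy header: only the fields needed to illustrate the verification step.
struct Header {
    uint8_t iv[16];
    uint8_t authToken[32];
};

// Recompute the MAC over {ciphertext || header-with-token-zeroed} and compare it
// with the token stored in the header, as verifyHeaderSingleAuthToken does above.
bool verifySingleAuthToken(const uint8_t* ciphertext, size_t ciphertextLen,
                           const Header& header,
                           const uint8_t* headerKey, size_t headerKeyLen) {
    Header copy = header;
    std::memset(copy.authToken, 0, sizeof(copy.authToken)); // token field must be zero while MAC-ing

    HMAC_CTX* ctx = HMAC_CTX_new();
    HMAC_Init_ex(ctx, headerKey, static_cast<int>(headerKeyLen), EVP_sha256(), nullptr);
    HMAC_Update(ctx, ciphertext, ciphertextLen);
    HMAC_Update(ctx, reinterpret_cast<const uint8_t*>(&copy), sizeof(copy));

    uint8_t computed[32];
    unsigned int len = 0;
    HMAC_Final(ctx, computed, &len);
    HMAC_CTX_free(ctx);

    return std::memcmp(computed, header.authToken, sizeof(header.authToken)) == 0;
}
```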
computedCipherTextAuthToken[AUTH_TOKEN_MAX_SIZE]; + // TOOD: Use HMAC_SHA encyrption authentication scheme as AES_CMAC needs minimum 16 bytes cipher key + computeAuthToken({ { ciphertext, ciphertextLen } }, + reinterpret_cast(&header.cipherTextDetails.salt), + sizeof(EncryptCipherRandomSalt), + &computedCipherTextAuthToken[0], + EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, + AUTH_TOKEN_MAX_SIZE); + if (memcmp(&header.multiAuthTokens.cipherTextAuthToken[0], &computedCipherTextAuthToken[0], AUTH_TOKEN_MAX_SIZE) != 0) { - TraceEvent(SevWarn, "BlobCipher.VerifyEncryptBlobHeaderAuthTokenMismatch") + TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeaderAuthTokenMismatch") .detail("HeaderVersion", header.flags.headerVersion) .detail("HeaderMode", header.flags.encryptMode) .detail("MultiAuthCipherTextAuthToken", - StringRef(arena, &header.multiAuthTokens.cipherTextAuthToken[0], AUTH_TOKEN_SIZE).toString()) - .detail("ComputedCipherTextAuthToken", computedCipherTextAuthToken.toString()); + StringRef(arena, &header.multiAuthTokens.cipherTextAuthToken[0], AUTH_TOKEN_MAX_SIZE).toString()) + .detail("ComputedCipherTextAuthToken", StringRef(computedCipherTextAuthToken, AUTH_TOKEN_MAX_SIZE)); throw encrypt_header_authtoken_mismatch(); } } @@ -721,13 +812,12 @@ void DecryptBlobCipherAes256Ctr::verifyHeaderMultiAuthToken(const uint8_t* ciphe void DecryptBlobCipherAes256Ctr::verifyAuthTokens(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeader& header, - uint8_t* buff, Arena& arena) { - if (header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) { - verifyHeaderSingleAuthToken(ciphertext, ciphertextLen, header, buff, arena); + if (header.flags.authTokenMode == EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) { + verifyHeaderSingleAuthToken(ciphertext, ciphertextLen, header, arena); } else { ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); - verifyHeaderMultiAuthToken(ciphertext, ciphertextLen, header, buff, arena); + verifyHeaderMultiAuthToken(ciphertext, ciphertextLen, header, arena); } authTokensValidationDone = true; @@ -736,13 +826,13 @@ void DecryptBlobCipherAes256Ctr::verifyAuthTokens(const uint8_t* ciphertext, void DecryptBlobCipherAes256Ctr::verifyEncryptHeaderMetadata(const BlobCipherEncryptHeader& header) { // validate header flag sanity if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION || - header.flags.encryptMode != ENCRYPT_CIPHER_MODE_AES_256_CTR || + header.flags.encryptMode != EncryptCipherMode::ENCRYPT_CIPHER_MODE_AES_256_CTR || !isEncryptHeaderAuthTokenModeValid((EncryptAuthTokenMode)header.flags.authTokenMode)) { - TraceEvent(SevWarn, "BlobCipher.VerifyEncryptBlobHeader") + TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeader") .detail("HeaderVersion", header.flags.headerVersion) .detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION) .detail("EncryptCipherMode", header.flags.encryptMode) - .detail("ExpectedCipherMode", ENCRYPT_CIPHER_MODE_AES_256_CTR) + .detail("ExpectedCipherMode", EncryptCipherMode::ENCRYPT_CIPHER_MODE_AES_256_CTR) .detail("EncryptHeaderAuthTokenMode", header.flags.authTokenMode); throw encrypt_header_metadata_mismatch(); } @@ -752,30 +842,32 @@ Reference DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphert const int ciphertextLen, const BlobCipherEncryptHeader& header, Arena& arena) { - CODE_PROBE(true, "Decrypting data with BlobCipher"); + double startTime = 0.0; + if 
(CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) { + startTime = timer_monotonic(); + } verifyEncryptHeaderMetadata(header); - if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && !headerCipherKey.isValid()) { - TraceEvent(SevWarn, "BlobCipher.DecryptInvalidHeaderCipherKey") + if (header.flags.authTokenMode != EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && + !headerCipherKey.isValid()) { + TraceEvent(SevWarn, "BlobCipherDecryptInvalidHeaderCipherKey") .detail("AuthTokenMode", header.flags.authTokenMode); throw encrypt_ops_error(); } - const int allocSize = header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE - ? ciphertextLen + AES_BLOCK_SIZE + sizeof(BlobCipherEncryptHeader) - : ciphertextLen + AES_BLOCK_SIZE; + const int allocSize = ciphertextLen + AES_BLOCK_SIZE; Reference decrypted = makeReference(allocSize, arena); - if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { - verifyAuthTokens(ciphertext, ciphertextLen, header, decrypted->begin(), arena); + if (header.flags.authTokenMode != EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { + verifyAuthTokens(ciphertext, ciphertextLen, header, arena); ASSERT(authTokensValidationDone); } uint8_t* plaintext = decrypted->begin(); int bytesDecrypted{ 0 }; if (!EVP_DecryptUpdate(ctx, plaintext, &bytesDecrypted, ciphertext, ciphertextLen)) { - TraceEvent(SevWarn, "BlobCipher.DecryptUpdateFailed") + TraceEvent(SevWarn, "BlobCipherDecryptUpdateFailed") .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) .detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId); throw encrypt_ops_error(); @@ -783,20 +875,33 @@ Reference DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphert int finalBlobBytes{ 0 }; if (EVP_DecryptFinal_ex(ctx, plaintext + bytesDecrypted, &finalBlobBytes) <= 0) { - TraceEvent(SevWarn, "BlobCipher.DecryptFinalFailed") + TraceEvent(SevWarn, "BlobCipherDecryptFinalFailed") .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) .detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId); throw encrypt_ops_error(); } if ((bytesDecrypted + finalBlobBytes) != ciphertextLen) { - TraceEvent(SevWarn, "BlobCipher.EncryptUnexpectedPlaintextLen") + TraceEvent(SevWarn, "BlobCipherEncryptUnexpectedPlaintextLen") .detail("CiphertextLen", ciphertextLen) .detail("DecryptedBufLen", bytesDecrypted + finalBlobBytes); throw encrypt_ops_error(); } decrypted->setLogicalSize(ciphertextLen); + + if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) { + BlobCipherMetrics::counters(usageType).decryptCPUTimeNS += int64_t((timer_monotonic() - startTime) * 1e9); + } + + CODE_PROBE(true, "BlobCipher data decryption"); + CODE_PROBE(header.flags.authTokenAlgo == EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE, + "Decryption authentication disabled"); + CODE_PROBE(header.flags.authTokenAlgo == EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, + "Decryption HMAC_SHA Auth token verification"); + CODE_PROBE(header.flags.authTokenAlgo == EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC, + "Decryption AES_CMAC Auth token verification"); + return decrypted; } @@ -820,32 +925,100 @@ HmacSha256DigestGen::~HmacSha256DigestGen() { } } -StringRef HmacSha256DigestGen::digest(const unsigned char* data, size_t len, Arena& arena) { - CODE_PROBE(true, "Digest generation"); - unsigned int digestLen = HMAC_size(ctx); - auto digest = new (arena) unsigned char[digestLen]; - if (HMAC_Update(ctx, data, len) != 1) { +unsigned int 
HmacSha256DigestGen::digest(const std::vector>& payload, + unsigned char* buf, + unsigned int bufLen) { + ASSERT_EQ(bufLen, HMAC_size(ctx)); + + for (const auto& p : payload) { + if (HMAC_Update(ctx, p.first, p.second) != 1) { + throw encrypt_ops_error(); + } + } + + unsigned int digestLen = 0; + if (HMAC_Final(ctx, buf, &digestLen) != 1) { throw encrypt_ops_error(); } - if (HMAC_Final(ctx, digest, &digestLen) != 1) { - throw encrypt_ops_error(); - } + CODE_PROBE(true, "HMAC_SHA Digest generation"); - return StringRef(arena, digest, digestLen); + return digestLen; } -StringRef computeAuthToken(const uint8_t* payload, - const int payloadLen, - const uint8_t* key, - const int keyLen, - Arena& arena) { - CODE_PROBE(true, "Auth token generation"); - HmacSha256DigestGen hmacGenerator(key, keyLen); - StringRef digest = hmacGenerator.digest(payload, payloadLen, arena); +// Aes256CtrCmacDigestGen methods +Aes256CmacDigestGen::Aes256CmacDigestGen(const unsigned char* key, size_t keylen) : ctx(CMAC_CTX_new()) { + ASSERT_EQ(keylen, AES_256_KEY_LENGTH); - ASSERT_GE(digest.size(), AUTH_TOKEN_SIZE); - return digest; + if (ctx == nullptr) { + throw encrypt_ops_error(); + } + if (!CMAC_Init(ctx, key, keylen, EVP_aes_256_cbc(), NULL)) { + throw encrypt_ops_error(); + } +} + +size_t Aes256CmacDigestGen::digest(const std::vector>& payload, + uint8_t* digest, + int digestlen) { + ASSERT(ctx != nullptr); + ASSERT_GE(digestlen, AUTH_TOKEN_AES_CMAC_SIZE); + + for (const auto& p : payload) { + if (!CMAC_Update(ctx, p.first, p.second)) { + throw encrypt_ops_error(); + } + } + size_t ret; + if (!CMAC_Final(ctx, digest, &ret)) { + throw encrypt_ops_error(); + } + + return ret; +} + +Aes256CmacDigestGen::~Aes256CmacDigestGen() { + if (ctx != nullptr) { + CMAC_CTX_free(ctx); + } +} + +void computeAuthToken(const std::vector>& payload, + const uint8_t* key, + const int keyLen, + unsigned char* digestBuf, + const EncryptAuthTokenAlgo algo, + unsigned int digestBufMaxSz) { + ASSERT_EQ(digestBufMaxSz, AUTH_TOKEN_MAX_SIZE); + ASSERT(isEncryptHeaderAuthTokenAlgoValid(algo)); + + int authTokenSz = getEncryptHeaderAuthTokenSize(algo); + ASSERT_LE(authTokenSz, AUTH_TOKEN_MAX_SIZE); + + if (algo == EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA) { + ASSERT_EQ(authTokenSz, AUTH_TOKEN_HMAC_SHA_SIZE); + + HmacSha256DigestGen hmacGenerator(key, keyLen); + unsigned int digestLen = hmacGenerator.digest(payload, digestBuf, authTokenSz); + + ASSERT_EQ(digestLen, authTokenSz); + } else if (algo == EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC) { + ASSERT_EQ(authTokenSz, AUTH_TOKEN_AES_CMAC_SIZE); + ASSERT_EQ(keyLen, AES_256_KEY_LENGTH); + + Aes256CmacDigestGen cmacGenerator(key, keyLen); + size_t digestLen = cmacGenerator.digest(payload, digestBuf, authTokenSz); + + ASSERT_EQ(digestLen, authTokenSz); + } else { + throw not_implemented(); + } +} + +EncryptAuthTokenMode getEncryptAuthTokenMode(const EncryptAuthTokenMode mode) { + // Override mode if authToken isn't enabled + return FLOW_KNOBS->ENCRYPT_HEADER_AUTH_TOKEN_ENABLED ? mode + : EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE; } // Only used to link unit tests @@ -863,7 +1036,7 @@ void forceLinkBlobCipherTests() {} // 6.1 cleanup cipherKeys by given encryptDomainId // 6.2. 
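computeAuthToken above dispatches on the configured algorithm: HMAC-SHA256 (32-byte token) or AES-CMAC (16-byte token) via OpenSSL's CMAC_* interface keyed with AES-256-CBC, which is what Aes256CmacDigestGen wraps. A small sketch of the CMAC branch (the CMAC_* API is deprecated in OpenSSL 3.0 but still available, matching the diff; error paths trimmed):

```cpp
#include <openssl/cmac.h>
#include <openssl/evp.h>

#include <cstdint>
#include <stdexcept>

// Compute an AES-CMAC tag (16 bytes) over `data` with a 32-byte key, mirroring
// Aes256CmacDigestGen in the diff above.
size_t aes256Cmac(const uint8_t key[32], const uint8_t* data, size_t len, uint8_t out[16]) {
    CMAC_CTX* ctx = CMAC_CTX_new();
    if (ctx == nullptr || CMAC_Init(ctx, key, 32, EVP_aes_256_cbc(), nullptr) != 1)
        throw std::runtime_error("CMAC_Init failed");
    if (CMAC_Update(ctx, data, len) != 1)
        throw std::runtime_error("CMAC_Update failed");
    size_t outLen = 0;
    if (CMAC_Final(ctx, out, &outLen) != 1) // outLen == AES block size == 16
        throw std::runtime_error("CMAC_Final failed");
    CMAC_CTX_free(ctx);
    return outLen;
}
```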
Cleanup all cached cipherKeys TEST_CASE("flow/BlobCipher") { - TraceEvent("BlobCipherTest.Start").log(); + TraceEvent("BlobCipherTestStart").log(); // Construct a dummy External Key Manager representation and populate with some keys class BaseCipher : public ReferenceCounted, NonCopyable { @@ -907,19 +1080,19 @@ TEST_CASE("flow/BlobCipher") { Reference cipherKeyCache = BlobCipherKeyCache::getInstance(); // validate getLatestCipherKey return empty when there's no cipher key - TraceEvent("BlobCipherTest.LatestKeyNotExists").log(); + TraceEvent("BlobCipherTestLatestKeyNotExists").log(); Reference latestKeyNonexists = cipherKeyCache->getLatestCipherKey(deterministicRandom()->randomInt(minDomainId, maxDomainId)); ASSERT(!latestKeyNonexists.isValid()); try { - cipherKeyCache->getLatestCipherKey(ENCRYPT_INVALID_DOMAIN_ID); + cipherKeyCache->getLatestCipherKey(INVALID_ENCRYPT_DOMAIN_ID); ASSERT(false); // shouldn't get here } catch (Error& e) { ASSERT_EQ(e.code(), error_code_encrypt_invalid_id); } // insert BlobCipher keys into BlobCipherKeyCache map and validate - TraceEvent("BlobCipherTest_InsertKeys").log(); + TraceEvent("BlobCipherTestInsertKeys").log(); for (auto& domainItr : domainKeyMap) { for (auto& baseKeyItr : domainItr.second) { Reference baseCipher = baseKeyItr.second; @@ -944,7 +1117,7 @@ TEST_CASE("flow/BlobCipher") { headerBaseCipher->refreshAt, headerBaseCipher->expireAt); - TraceEvent("BlobCipherTest.InsertKeysDone").log(); + TraceEvent("BlobCipherTestInsertKeysDone").log(); // validate the cipherKey lookups work as desired for (auto& domainItr : domainKeyMap) { @@ -963,7 +1136,7 @@ TEST_CASE("flow/BlobCipher") { ASSERT_NE(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()), 0); } } - TraceEvent("BlobCipherTest.LooksupDone").log(); + TraceEvent("BlobCipherTestLooksupDone").log(); // Ensure attemtping to insert existing cipherKey (identical) more than once is treated as a NOP try { @@ -977,7 +1150,7 @@ TEST_CASE("flow/BlobCipher") { } catch (Error& e) { throw; } - TraceEvent("BlobCipherTest.ReinsertIdempotentKeyDone").log(); + TraceEvent("BlobCipherTestReinsertIdempotentKeyDone").log(); // Ensure attemtping to insert an existing cipherKey (modified) fails with appropriate error try { @@ -999,7 +1172,7 @@ TEST_CASE("flow/BlobCipher") { throw; } } - TraceEvent("BlobCipherTest.ReinsertNonIdempotentKeyDone").log(); + TraceEvent("BlobCipherTestReinsertNonIdempotentKeyDone").log(); // Validate Encryption ops Reference cipherKey = cipherKeyCache->getLatestCipherKey(minDomainId); @@ -1015,22 +1188,28 @@ TEST_CASE("flow/BlobCipher") { BlobCipherEncryptHeader headerCopy; // validate basic encrypt followed by decrypt operation for AUTH_MODE_NONE { - TraceEvent("NoneAuthMode.Start").log(); + TraceEvent("NoneAuthModeStart"); - EncryptBlobCipherAes265Ctr encryptor( - cipherKey, Reference(), iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE); + EncryptBlobCipherAes265Ctr encryptor(cipherKey, + headerCipherKey, + iv, + AES_256_IV_LENGTH, + EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE, + BlobCipherMetrics::TEST); BlobCipherEncryptHeader header; Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); ASSERT_EQ(encrypted->getLogicalSize(), bufLen); ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0); ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); - ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR); - ASSERT_EQ(header.flags.authTokenMode, 
ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE); + ASSERT_EQ(header.flags.encryptMode, EncryptCipherMode::ENCRYPT_CIPHER_MODE_AES_256_CTR); + ASSERT_EQ(header.flags.authTokenMode, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE); - TraceEvent("BlobCipherTest.EncryptDone") + TraceEvent("BlobCipherTestEncryptDone") .detail("HeaderVersion", header.flags.headerVersion) .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("HeaderEncryptAuthTokenMode", header.flags.authTokenMode) + .detail("HeaderEncryptAuthTokenAlgo", header.flags.authTokenAlgo) .detail("DomainId", header.cipherTextDetails.encryptDomainId) .detail("BaseCipherId", header.cipherTextDetails.baseCipherId); @@ -1038,13 +1217,14 @@ TEST_CASE("flow/BlobCipher") { header.cipherTextDetails.baseCipherId, header.cipherTextDetails.salt); ASSERT(tCipherKeyKey->isEqual(cipherKey)); - DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, Reference(), &header.iv[0]); + DecryptBlobCipherAes256Ctr decryptor( + tCipherKeyKey, Reference(), &header.iv[0], BlobCipherMetrics::TEST); Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); ASSERT_EQ(decrypted->getLogicalSize(), bufLen); ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0); - TraceEvent("BlobCipherTest.DecryptDone").log(); + TraceEvent("BlobCipherTestDecryptDone"); // induce encryption header corruption - headerVersion corrupted memcpy(reinterpret_cast(&headerCopy), @@ -1053,7 +1233,8 @@ TEST_CASE("flow/BlobCipher") { headerCopy.flags.headerVersion += 1; try { encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); - DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, Reference(), header.iv); + DecryptBlobCipherAes256Ctr decryptor( + tCipherKeyKey, Reference(), header.iv, BlobCipherMetrics::TEST); decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); ASSERT(false); // error expected } catch (Error& e) { @@ -1069,7 +1250,8 @@ TEST_CASE("flow/BlobCipher") { headerCopy.flags.encryptMode += 1; try { encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); - DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, Reference(), header.iv); + DecryptBlobCipherAes256Ctr decryptor( + tCipherKeyKey, Reference(), header.iv, BlobCipherMetrics::TEST); decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); ASSERT(false); // error expected } catch (Error& e) { @@ -1085,22 +1267,29 @@ TEST_CASE("flow/BlobCipher") { memcpy(encrypted->begin(), &temp[0], bufLen); int tIdx = deterministicRandom()->randomInt(0, bufLen - 1); temp[tIdx] += 1; - DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, Reference(), header.iv); + DecryptBlobCipherAes256Ctr decryptor( + tCipherKeyKey, Reference(), header.iv, BlobCipherMetrics::TEST); decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena); } catch (Error& e) { // No authToken, hence, no corruption detection supported ASSERT(false); } - TraceEvent("NoneAuthMode.Done").log(); + TraceEvent("NoneAuthModeDone"); } // validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_SINGLE + // HMAC_SHA authToken algorithm { - TraceEvent("SingleAuthMode.Start").log(); + TraceEvent("SingleAuthModeHmacShaStart").log(); - EncryptBlobCipherAes265Ctr encryptor( - cipherKey, headerCipherKey, iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + EncryptBlobCipherAes265Ctr encryptor(cipherKey, + headerCipherKey, + iv, + AES_256_IV_LENGTH, + EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE, + 
EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, + BlobCipherMetrics::TEST); BlobCipherEncryptHeader header; Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); @@ -1108,15 +1297,18 @@ TEST_CASE("flow/BlobCipher") { ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0); ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR); - ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + ASSERT_EQ(header.flags.authTokenMode, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + ASSERT_EQ(header.flags.authTokenAlgo, EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA); - TraceEvent("BlobCipherTest.EncryptDone") + TraceEvent("BlobCipherTestEncryptDone") .detail("HeaderVersion", header.flags.headerVersion) .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("HeaderEncryptAuthTokenMode", header.flags.authTokenMode) + .detail("HeaderEncryptAuthTokenAlgo", header.flags.authTokenAlgo) .detail("DomainId", header.cipherTextDetails.encryptDomainId) .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) .detail("HeaderAuthToken", - StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString()); + StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_HMAC_SHA_SIZE).toString()); Reference tCipherKeyKey = cipherKeyCache->getCipherKey(header.cipherTextDetails.encryptDomainId, header.cipherTextDetails.baseCipherId, @@ -1125,13 +1317,13 @@ TEST_CASE("flow/BlobCipher") { header.cipherHeaderDetails.baseCipherId, header.cipherHeaderDetails.salt); ASSERT(tCipherKeyKey->isEqual(cipherKey)); - DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv); + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); ASSERT_EQ(decrypted->getLogicalSize(), bufLen); ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0); - TraceEvent("BlobCipherTest.DecryptDone").log(); + TraceEvent("BlobCipherTestDecryptDone"); // induce encryption header corruption - headerVersion corrupted encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); @@ -1140,7 +1332,7 @@ TEST_CASE("flow/BlobCipher") { sizeof(BlobCipherEncryptHeader)); headerCopy.flags.headerVersion += 1; try { - DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv); + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); ASSERT(false); // error expected } catch (Error& e) { @@ -1156,7 +1348,7 @@ TEST_CASE("flow/BlobCipher") { sizeof(BlobCipherEncryptHeader)); headerCopy.flags.encryptMode += 1; try { - DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv); + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); ASSERT(false); // error expected } catch (Error& e) { @@ -1170,10 +1362,10 @@ TEST_CASE("flow/BlobCipher") { memcpy(reinterpret_cast(&headerCopy), reinterpret_cast(&header), sizeof(BlobCipherEncryptHeader)); - int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1); + int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_HMAC_SHA_SIZE - 1); 
headerCopy.singleAuthToken.authToken[hIdx] += 1; try { - DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv); + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); ASSERT(false); // error expected } catch (Error& e) { @@ -1189,7 +1381,7 @@ TEST_CASE("flow/BlobCipher") { memcpy(encrypted->begin(), &temp[0], bufLen); int tIdx = deterministicRandom()->randomInt(0, bufLen - 1); temp[tIdx] += 1; - DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv); + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena); } catch (Error& e) { if (e.code() != error_code_encrypt_header_authtoken_mismatch) { @@ -1197,15 +1389,19 @@ TEST_CASE("flow/BlobCipher") { } } - TraceEvent("SingleAuthMode.Done").log(); + TraceEvent("SingleAuthModeHmacShaDone"); } - - // validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_MULTI + // AES_CMAC authToken algorithm { - TraceEvent("MultiAuthMode.Start").log(); + TraceEvent("SingleAuthModeAesCMacStart").log(); - EncryptBlobCipherAes265Ctr encryptor( - cipherKey, headerCipherKey, iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + EncryptBlobCipherAes265Ctr encryptor(cipherKey, + headerCipherKey, + iv, + AES_256_IV_LENGTH, + EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE, + EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC, + BlobCipherMetrics::TEST); BlobCipherEncryptHeader header; Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); @@ -1213,31 +1409,33 @@ TEST_CASE("flow/BlobCipher") { ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0); ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR); - ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + ASSERT_EQ(header.flags.authTokenMode, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + ASSERT_EQ(header.flags.authTokenAlgo, EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC); - TraceEvent("BlobCipherTest.EncryptDone") + TraceEvent("BlobCipherTestEncryptDone") .detail("HeaderVersion", header.flags.headerVersion) .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("HeaderEncryptAuthTokenMode", header.flags.authTokenMode) + .detail("HeaderEncryptAuthTokenAlgo", header.flags.authTokenAlgo) .detail("DomainId", header.cipherTextDetails.encryptDomainId) .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) .detail("HeaderAuthToken", - StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString()); + StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_AES_CMAC_SIZE).toString()); - Reference tCipherKey = cipherKeyCache->getCipherKey(header.cipherTextDetails.encryptDomainId, - header.cipherTextDetails.baseCipherId, - header.cipherTextDetails.salt); + Reference tCipherKeyKey = cipherKeyCache->getCipherKey(header.cipherTextDetails.encryptDomainId, + header.cipherTextDetails.baseCipherId, + header.cipherTextDetails.salt); Reference hCipherKey = cipherKeyCache->getCipherKey(header.cipherHeaderDetails.encryptDomainId, header.cipherHeaderDetails.baseCipherId, header.cipherHeaderDetails.salt); - - ASSERT(tCipherKey->isEqual(cipherKey)); - DecryptBlobCipherAes256Ctr 
decryptor(tCipherKey, hCipherKey, header.iv); + ASSERT(tCipherKeyKey->isEqual(cipherKey)); + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); ASSERT_EQ(decrypted->getLogicalSize(), bufLen); ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0); - TraceEvent("BlobCipherTest.DecryptDone").log(); + TraceEvent("BlobCipherTestDecryptDone").log(); // induce encryption header corruption - headerVersion corrupted encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); @@ -1246,7 +1444,7 @@ TEST_CASE("flow/BlobCipher") { sizeof(BlobCipherEncryptHeader)); headerCopy.flags.headerVersion += 1; try { - DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv); + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); ASSERT(false); // error expected } catch (Error& e) { @@ -1262,7 +1460,122 @@ TEST_CASE("flow/BlobCipher") { sizeof(BlobCipherEncryptHeader)); headerCopy.flags.encryptMode += 1; try { - DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv); + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - authToken mismatch + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_AES_CMAC_SIZE - 1); + headerCopy.singleAuthToken.authToken[hIdx] += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + // induce encrypted buffer payload corruption + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + uint8_t temp[bufLen]; + memcpy(encrypted->begin(), &temp[0], bufLen); + int tIdx = deterministicRandom()->randomInt(0, bufLen - 1); + temp[tIdx] += 1; + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena); + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + TraceEvent("SingleAuthModeAesCmacDone"); + } + + // validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_MULTI + // HMAC_SHA authToken algorithm + { + TraceEvent("MultiAuthModeHmacShaStart").log(); + + EncryptBlobCipherAes265Ctr encryptor(cipherKey, + headerCipherKey, + iv, + AES_256_IV_LENGTH, + EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI, + EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, + BlobCipherMetrics::TEST); + BlobCipherEncryptHeader header; + Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + + ASSERT_EQ(encrypted->getLogicalSize(), bufLen); + ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0); + 
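// The ASSERT_NE above confirms the ciphertext differs from the plaintext; the checks below confirm the header records AES-256-CTR encryption with AUTH_TOKEN_MODE_MULTI and the HMAC_SHA auth-token algorithm before the decrypt and corruption cases run. +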
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR); + ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + ASSERT_EQ(header.flags.authTokenAlgo, EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA); + + TraceEvent("BlobCipherTestEncryptDone") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("HeaderEncryptAuthTokenMode", header.flags.authTokenMode) + .detail("HeaderEncryptAuthTokenAlgo", header.flags.authTokenAlgo) + .detail("DomainId", header.cipherTextDetails.encryptDomainId) + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("HeaderAuthToken", + StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_HMAC_SHA_SIZE).toString()); + + Reference tCipherKey = cipherKeyCache->getCipherKey(header.cipherTextDetails.encryptDomainId, + header.cipherTextDetails.baseCipherId, + header.cipherTextDetails.salt); + Reference hCipherKey = cipherKeyCache->getCipherKey(header.cipherHeaderDetails.encryptDomainId, + header.cipherHeaderDetails.baseCipherId, + header.cipherHeaderDetails.salt); + + ASSERT(tCipherKey->isEqual(cipherKey)); + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + + ASSERT_EQ(decrypted->getLogicalSize(), bufLen); + ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0); + + TraceEvent("BlobCipherTestDecryptDone").log(); + + // induce encryption header corruption - headerVersion corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.headerVersion += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - encryptionMode corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.encryptMode += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); ASSERT(false); // error expected } catch (Error& e) { @@ -1276,10 +1589,10 @@ TEST_CASE("flow/BlobCipher") { memcpy(reinterpret_cast(&headerCopy), reinterpret_cast(&header), sizeof(BlobCipherEncryptHeader)); - int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1); + int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_HMAC_SHA_SIZE - 1); headerCopy.multiAuthTokens.cipherTextAuthToken[hIdx] += 1; try { - DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv); + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); ASSERT(false); // error expected } catch (Error& e) { @@ -1293,10 +1606,10 @@ TEST_CASE("flow/BlobCipher") { memcpy(reinterpret_cast(&headerCopy), 
reinterpret_cast(&header), sizeof(BlobCipherEncryptHeader)); - hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1); + hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_HMAC_SHA_SIZE - 1); headerCopy.multiAuthTokens.headerAuthToken[hIdx] += 1; try { - DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv); + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); ASSERT(false); // error expected } catch (Error& e) { @@ -1311,7 +1624,7 @@ TEST_CASE("flow/BlobCipher") { memcpy(encrypted->begin(), &temp[0], bufLen); int tIdx = deterministicRandom()->randomInt(0, bufLen - 1); temp[tIdx] += 1; - DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv); + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena); } catch (Error& e) { if (e.code() != error_code_encrypt_header_authtoken_mismatch) { @@ -1319,7 +1632,136 @@ TEST_CASE("flow/BlobCipher") { } } - TraceEvent("MultiAuthMode.Done").log(); + TraceEvent("MultiAuthModeHmacShaDone"); + } + // AES_CMAC authToken algorithm + { + TraceEvent("MultiAuthModeAesCmacStart"); + + EncryptBlobCipherAes265Ctr encryptor(cipherKey, + headerCipherKey, + iv, + AES_256_IV_LENGTH, + EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI, + EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC, + BlobCipherMetrics::TEST); + BlobCipherEncryptHeader header; + Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + + ASSERT_EQ(encrypted->getLogicalSize(), bufLen); + ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0); + ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR); + ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + ASSERT_EQ(header.flags.authTokenAlgo, EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC); + + TraceEvent("BlobCipherTestEncryptDone") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("HeaderEncryptAuthTokenMode", header.flags.authTokenMode) + .detail("HeaderEncryptAuthTokenAlgo", header.flags.authTokenAlgo) + .detail("DomainId", header.cipherTextDetails.encryptDomainId) + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("HeaderAuthToken", + StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_AES_CMAC_SIZE).toString()); + + Reference tCipherKey = cipherKeyCache->getCipherKey(header.cipherTextDetails.encryptDomainId, + header.cipherTextDetails.baseCipherId, + header.cipherTextDetails.salt); + Reference hCipherKey = cipherKeyCache->getCipherKey(header.cipherHeaderDetails.encryptDomainId, + header.cipherHeaderDetails.baseCipherId, + header.cipherHeaderDetails.salt); + + ASSERT(tCipherKey->isEqual(cipherKey)); + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + + ASSERT_EQ(decrypted->getLogicalSize(), bufLen); + ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0); + + TraceEvent("BlobCipherTestDecryptDone").log(); + + // induce encryption header corruption - headerVersion corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, 
&header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.headerVersion += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - encryptionMode corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.encryptMode += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - cipherText authToken mismatch + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_AES_CMAC_SIZE - 1); + headerCopy.multiAuthTokens.cipherTextAuthToken[hIdx] += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + // induce encryption header corruption - header authToken mismatch + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_AES_CMAC_SIZE - 1); + headerCopy.multiAuthTokens.headerAuthToken[hIdx] += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + uint8_t temp[bufLen]; + memcpy(encrypted->begin(), &temp[0], bufLen); + int tIdx = deterministicRandom()->randomInt(0, bufLen - 1); + temp[tIdx] += 1; + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena); + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + TraceEvent("MultiAuthModeAesCmacDone"); } // Validate dropping encryptDomainId cached keys @@ -1335,6 +1777,6 @@ TEST_CASE("flow/BlobCipher") { ASSERT(cachedKeys.empty()); } - TraceEvent("BlobCipherTest.Done").log(); + TraceEvent("BlobCipherTestDone"); return Void(); } diff --git a/fdbclient/BlobGranuleCommon.cpp b/fdbclient/BlobGranuleCommon.cpp new file mode 100644 index 0000000000..44f32bcb25 --- /dev/null +++ b/fdbclient/BlobGranuleCommon.cpp @@ -0,0 +1,45 @@ +/* + * BlobGranuleCommon.cpp + * + * This source 
file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/BlobGranuleCommon.h" + +BlobGranuleSummaryRef summarizeGranuleChunk(Arena& ar, const BlobGranuleChunkRef& chunk) { + BlobGranuleSummaryRef summary; + ASSERT(chunk.snapshotFile.present()); + ASSERT(chunk.snapshotVersion != invalidVersion); + ASSERT(chunk.includedVersion >= chunk.snapshotVersion); + ASSERT(chunk.newDeltas.empty()); + + if (chunk.tenantPrefix.present()) { + summary.keyRange = KeyRangeRef(ar, chunk.keyRange.removePrefix(chunk.tenantPrefix.get())); + } else { + summary.keyRange = KeyRangeRef(ar, chunk.keyRange); + } + + summary.snapshotVersion = chunk.snapshotVersion; + summary.snapshotSize = chunk.snapshotFile.get().length; + summary.deltaVersion = chunk.includedVersion; + summary.deltaSize = 0; + for (auto& it : chunk.deltaFiles) { + summary.deltaSize += it.length; + } + + return summary; +} \ No newline at end of file diff --git a/fdbclient/BlobGranuleFiles.cpp b/fdbclient/BlobGranuleFiles.cpp index 0850b67974..3747824437 100644 --- a/fdbclient/BlobGranuleFiles.cpp +++ b/fdbclient/BlobGranuleFiles.cpp @@ -20,6 +20,7 @@ #include "fdbclient/BlobGranuleFiles.h" +#include "fdbclient/BlobCipher.h" #include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/ClientKnobs.h" #include "fdbclient/CommitTransaction.h" @@ -27,10 +28,11 @@ #include "fdbclient/SystemData.h" // for allKeys unit test - could remove #include "flow/Arena.h" -#include "flow/BlobCipher.h" #include "flow/CompressionUtils.h" #include "flow/DeterministicRandom.h" +#include "flow/EncryptUtils.h" #include "flow/IRandom.h" +#include "flow/Knobs.h" #include "flow/Trace.h" #include "flow/serialize.h" #include "flow/UnitTest.h" @@ -60,21 +62,6 @@ uint16_t MIN_SUPPORTED_BG_FORMAT_VERSION = 1; const uint8_t SNAPSHOT_FILE_TYPE = 'S'; const uint8_t DELTA_FILE_TYPE = 'D'; -static int getDefaultCompressionLevel(CompressionFilter filter) { - if (filter == CompressionFilter::NONE) { - return -1; -#ifdef ZLIB_LIB_SUPPORTED - } else if (filter == CompressionFilter::GZIP) { - // opt for high speed compression, larger levels have a high cpu cost and not much compression ratio - // improvement, according to benchmarks - return 1; -#endif - } else { - ASSERT(false); - return -1; - } -} - // Deltas in key order // For key-ordered delta files, the format for both sets and range clears is that you store boundaries ordered by key. 
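The hunks below drop the file-local getDefaultCompressionLevel() deleted above in favor of a shared CompressionUtils::getDefaultCompressionLevel(). A minimal sketch of such a helper, assuming it simply mirrors the deleted logic (the actual CompressionUtils implementation is not part of this patch and may differ):

namespace CompressionUtils {
// Sketch only: mirrors the removed file-local helper.
inline int getDefaultCompressionLevel(CompressionFilter filter) {
    if (filter == CompressionFilter::NONE) {
        return -1; // no compression requested, level is unused
#ifdef ZLIB_LIB_SUPPORTED
    } else if (filter == CompressionFilter::GZIP) {
        // favor speed: higher gzip levels cost CPU for little extra compression ratio
        return 1;
#endif
    } else {
        ASSERT(false);
        return -1;
    }
}
} // namespace CompressionUtils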
@@ -300,11 +287,13 @@ struct IndexBlockRef { TraceEvent(SevDebug, "IndexBlockEncrypt_Before").detail("Chksum", chksum); } - EncryptBlobCipherAes265Ctr encryptor(eKeys.textCipherKey, - eKeys.headerCipherKey, - cipherKeysCtx.ivRef.begin(), - AES_256_IV_LENGTH, - ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + EncryptBlobCipherAes265Ctr encryptor( + eKeys.textCipherKey, + eKeys.headerCipherKey, + cipherKeysCtx.ivRef.begin(), + AES_256_IV_LENGTH, + getEncryptAuthTokenMode(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE), + BlobCipherMetrics::BLOB_GRANULE); Value serializedBuff = ObjectWriter::toValue(block, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); BlobCipherEncryptHeader header; buffer = encryptor.encrypt(serializedBuff.contents().begin(), serializedBuff.contents().size(), &header, arena) @@ -332,7 +321,8 @@ struct IndexBlockRef { validateEncryptionHeaderDetails(eKeys, header, cipherKeysCtx.ivRef); - DecryptBlobCipherAes256Ctr decryptor(eKeys.textCipherKey, eKeys.headerCipherKey, cipherKeysCtx.ivRef.begin()); + DecryptBlobCipherAes256Ctr decryptor( + eKeys.textCipherKey, eKeys.headerCipherKey, cipherKeysCtx.ivRef.begin(), BlobCipherMetrics::BLOB_GRANULE); StringRef decrypted = decryptor.decrypt(idxRef.buffer.begin(), idxRef.buffer.size(), header, arena)->toStringRef(); @@ -421,11 +411,13 @@ struct IndexBlobGranuleFileChunkRef { TraceEvent(SevDebug, "BlobChunkEncrypt_Before").detail("Chksum", chksum); } - EncryptBlobCipherAes265Ctr encryptor(eKeys.textCipherKey, - eKeys.headerCipherKey, - cipherKeysCtx.ivRef.begin(), - AES_256_IV_LENGTH, - ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + EncryptBlobCipherAes265Ctr encryptor( + eKeys.textCipherKey, + eKeys.headerCipherKey, + cipherKeysCtx.ivRef.begin(), + AES_256_IV_LENGTH, + getEncryptAuthTokenMode(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE), + BlobCipherMetrics::BLOB_GRANULE); BlobCipherEncryptHeader header; chunkRef.buffer = encryptor.encrypt(chunkRef.buffer.begin(), chunkRef.buffer.size(), &header, arena)->toStringRef(); @@ -454,7 +446,8 @@ struct IndexBlobGranuleFileChunkRef { validateEncryptionHeaderDetails(eKeys, header, cipherKeysCtx.ivRef); - DecryptBlobCipherAes256Ctr decryptor(eKeys.textCipherKey, eKeys.headerCipherKey, cipherKeysCtx.ivRef.begin()); + DecryptBlobCipherAes256Ctr decryptor( + eKeys.textCipherKey, eKeys.headerCipherKey, cipherKeysCtx.ivRef.begin(), BlobCipherMetrics::BLOB_GRANULE); StringRef decrypted = decryptor.decrypt(chunkRef.buffer.begin(), chunkRef.buffer.size(), header, arena)->toStringRef(); @@ -471,8 +464,10 @@ struct IndexBlobGranuleFileChunkRef { const CompressionFilter compFilter, Arena& arena) { chunkRef.compressionFilter = compFilter; - chunkRef.buffer = CompressionUtils::compress( - chunkRef.compressionFilter.get(), chunk.contents(), getDefaultCompressionLevel(compFilter), arena); + chunkRef.buffer = CompressionUtils::compress(chunkRef.compressionFilter.get(), + chunk.contents(), + CompressionUtils::getDefaultCompressionLevel(compFilter), + arena); if (BG_ENCRYPT_COMPRESS_DEBUG) { XXH64_hash_t chunkChksum = XXH3_64bits(chunk.contents().begin(), chunk.contents().size()); @@ -816,10 +811,6 @@ static Standalone> loadSnapshotFile( ASSERT(file.indexBlockRef.block.children.size() >= 2); - // TODO: refactor this out of delta tree - // int commonPrefixLen = commonPrefixLength(index.dataBlockOffsets.front().first, - // index.dataBlockOffsets.back().first); - // find range of blocks needed to read ChildBlockPointerRef* currentBlock = file.findStartBlock(keyRange.begin); @@ -1088,65 
+1079,6 @@ ParsedDeltaBoundaryRef deltaAtVersion(const DeltaBoundaryRef& delta, Version beg } } -void applyDeltasSorted(const Standalone>& sortedDeltas, - bool startClear, - std::map& dataMap) { - if (sortedDeltas.empty() && !startClear) { - return; - } - - // sorted merge of 2 iterators - bool prevClear = startClear; - auto deltaIt = sortedDeltas.begin(); - auto snapshotIt = dataMap.begin(); - - while (deltaIt != sortedDeltas.end() && snapshotIt != dataMap.end()) { - if (deltaIt->key < snapshotIt->first) { - // Delta is lower than snapshot. Insert new row, if the delta is a set. Ignore point clear and noop - if (deltaIt->isSet()) { - snapshotIt = dataMap.insert(snapshotIt, { deltaIt->key, deltaIt->value }); - snapshotIt++; - } - prevClear = deltaIt->clearAfter; - deltaIt++; - } else if (snapshotIt->first < deltaIt->key) { - // Snapshot is lower than delta. Erase the current entry if the previous delta was a clearAfter - if (prevClear) { - snapshotIt = dataMap.erase(snapshotIt); - } else { - snapshotIt++; - } - } else { - // Delta and snapshot are for the same key. The delta is newer, so if it is a set, update the value, else if - // it's a clear, delete the value (ignore noop) - if (deltaIt->isSet()) { - snapshotIt->second = deltaIt->value; - } else if (deltaIt->isClear()) { - snapshotIt = dataMap.erase(snapshotIt); - } - if (!deltaIt->isClear()) { - snapshotIt++; - } - prevClear = deltaIt->clearAfter; - deltaIt++; - } - } - // Either we are out of deltas or out of snapshots. - // if snapshot remaining and prevClear last delta set, clear the rest of the map - if (prevClear && snapshotIt != dataMap.end()) { - CODE_PROBE(true, "last delta range cleared end of snapshot"); - dataMap.erase(snapshotIt, dataMap.end()); - } - // Apply remaining sets from delta, with no remaining snapshot - while (deltaIt != sortedDeltas.end()) { - if (deltaIt->isSet()) { - CODE_PROBE(true, "deltas past end of snapshot"); - snapshotIt = dataMap.insert(snapshotIt, { deltaIt->key, deltaIt->value }); - } - deltaIt++; - } -} - // The arena owns the BoundaryDeltaRef struct data but the StringRef pointers point to data in deltaData, to avoid extra // copying Standalone> loadChunkedDeltaFile(const Standalone& fileNameRef, @@ -1169,10 +1101,6 @@ Standalone> loadChunkedDeltaFile(const Standal ASSERT(file.indexBlockRef.block.children.size() >= 2); - // TODO: refactor this out of delta tree - // int commonPrefixLen = commonPrefixLength(index.dataBlockOffsets.front().first, - // index.dataBlockOffsets.back().first); - // find range of blocks needed to read ChildBlockPointerRef* currentBlock = file.findStartBlock(keyRange.begin); @@ -1181,7 +1109,8 @@ Standalone> loadChunkedDeltaFile(const Standal return deltas; } - // TODO: could cpu optimize first block a bit more by seeking right to start + // FIXME: shared prefix for key comparison + // FIXME: could cpu optimize first block a bit more by seeking right to start bool lastBlock = false; bool prevClearAfter = false; while (!lastBlock) { @@ -1565,12 +1494,23 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, return mergeDeltaStreams(chunk, streams, startClears); } +struct GranuleLoadFreeHandle : NonCopyable, ReferenceCounted { + const ReadBlobGranuleContext* granuleContext; + int64_t loadId; + + GranuleLoadFreeHandle(const ReadBlobGranuleContext* granuleContext, int64_t loadId) + : granuleContext(granuleContext), loadId(loadId) {} + + ~GranuleLoadFreeHandle() { granuleContext->free_load_f(loadId, granuleContext->userContext); } +}; + struct GranuleLoadIds { 
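// Load IDs returned by the client's start_load_f callback for one granule's snapshot and delta files.
// Each ID is also wrapped in a reference-counted GranuleLoadFreeHandle (above) so that free_load_f runs
// exactly once per load, including when an error unwinds before the results are consumed.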
Optional snapshotId; std::vector deltaIds; + std::vector> freeHandles; }; -static void startLoad(const ReadBlobGranuleContext granuleContext, +static void startLoad(const ReadBlobGranuleContext* granuleContext, const BlobGranuleChunkRef& chunk, GranuleLoadIds& loadIds) { @@ -1580,12 +1520,13 @@ static void startLoad(const ReadBlobGranuleContext granuleContext, // FIXME: remove when we implement file multiplexing ASSERT(chunk.snapshotFile.get().offset == 0); ASSERT(chunk.snapshotFile.get().length == chunk.snapshotFile.get().fullFileLength); - loadIds.snapshotId = granuleContext.start_load_f(snapshotFname.c_str(), - snapshotFname.size(), - chunk.snapshotFile.get().offset, - chunk.snapshotFile.get().length, - chunk.snapshotFile.get().fullFileLength, - granuleContext.userContext); + loadIds.snapshotId = granuleContext->start_load_f(snapshotFname.c_str(), + snapshotFname.size(), + chunk.snapshotFile.get().offset, + chunk.snapshotFile.get().length, + chunk.snapshotFile.get().fullFileLength, + granuleContext->userContext); + loadIds.freeHandles.push_back(makeReference(granuleContext, loadIds.snapshotId.get())); } loadIds.deltaIds.reserve(chunk.deltaFiles.size()); for (int deltaFileIdx = 0; deltaFileIdx < chunk.deltaFiles.size(); deltaFileIdx++) { @@ -1593,13 +1534,14 @@ static void startLoad(const ReadBlobGranuleContext granuleContext, // FIXME: remove when we implement file multiplexing ASSERT(chunk.deltaFiles[deltaFileIdx].offset == 0); ASSERT(chunk.deltaFiles[deltaFileIdx].length == chunk.deltaFiles[deltaFileIdx].fullFileLength); - int64_t deltaLoadId = granuleContext.start_load_f(deltaFName.c_str(), - deltaFName.size(), - chunk.deltaFiles[deltaFileIdx].offset, - chunk.deltaFiles[deltaFileIdx].length, - chunk.deltaFiles[deltaFileIdx].fullFileLength, - granuleContext.userContext); + int64_t deltaLoadId = granuleContext->start_load_f(deltaFName.c_str(), + deltaFName.size(), + chunk.deltaFiles[deltaFileIdx].offset, + chunk.deltaFiles[deltaFileIdx].length, + chunk.deltaFiles[deltaFileIdx].fullFileLength, + granuleContext->userContext); loadIds.deltaIds.push_back(deltaLoadId); + loadIds.freeHandles.push_back(makeReference(granuleContext, deltaLoadId)); } } @@ -1607,7 +1549,8 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone loadAndMaterializeBlobGranules(const Standalone 1 - for (int i = 0; i < parallelism - 1 && i < files.size(); i++) { - startLoad(granuleContext, files[i], loadIds[i]); - } + int64_t inputBytes = 0; + int64_t outputBytes = 0; try { + // Kick off first file reads if parallelism > 1 + for (int i = 0; i < parallelism - 1 && i < files.size(); i++) { + startLoad(&granuleContext, files[i], loadIds[i]); + } RangeResult results; for (int chunkIdx = 0; chunkIdx < files.size(); chunkIdx++) { // Kick off files for this granule if parallelism == 1, or future granule if parallelism > 1 if (chunkIdx + parallelism - 1 < files.size()) { - startLoad(granuleContext, files[chunkIdx + parallelism - 1], loadIds[chunkIdx + parallelism - 1]); + startLoad(&granuleContext, files[chunkIdx + parallelism - 1], loadIds[chunkIdx + parallelism - 1]); } RangeResult chunkRows; @@ -1642,9 +1586,11 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone(blob_granule_file_load_error()); } + inputBytes += snapshotData.get().size(); } - StringRef deltaData[files[chunkIdx].deltaFiles.size()]; + // +1 to avoid UBSAN variable length array of size zero + StringRef deltaData[files[chunkIdx].deltaFiles.size() + 1]; for (int i = 0; i < files[chunkIdx].deltaFiles.size(); i++) { deltaData[i] = 
StringRef(granuleContext.get_load_f(loadIds[chunkIdx].deltaIds[i], granuleContext.userContext), @@ -1653,22 +1599,25 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone(blob_granule_file_load_error()); } + inputBytes += deltaData[i].size(); } + inputBytes += files[chunkIdx].newDeltas.expectedSize(); + // materialize rows from chunk chunkRows = materializeBlobGranule(files[chunkIdx], keyRange, beginVersion, readVersion, snapshotData, deltaData); + outputBytes += chunkRows.expectedSize(); + results.arena().dependsOn(chunkRows.arena()); results.append(results.arena(), chunkRows.begin(), chunkRows.size()); - if (loadIds[chunkIdx].snapshotId.present()) { - granuleContext.free_load_f(loadIds[chunkIdx].snapshotId.get(), granuleContext.userContext); - } - for (int i = 0; i < loadIds[chunkIdx].deltaIds.size(); i++) { - granuleContext.free_load_f(loadIds[chunkIdx].deltaIds[i], granuleContext.userContext); - } + // free once done by forcing FreeHandles to trigger + loadIds[chunkIdx].freeHandles.clear(); } + stats.inputBytes = inputBytes; + stats.outputBytes = outputBytes; return ErrorOr(results); } catch (Error& e) { return ErrorOr(e); @@ -1723,23 +1672,13 @@ TEST_CASE("/blobgranule/files/applyDelta") { printf("Testing blob granule delta applying\n"); Arena a; - // do this 2 phase arena creation of string refs instead of LiteralStringRef because there is no char* StringRef - // constructor, and valgrind might complain if the stringref data isn't in the arena - std::string sk_a = "A"; - std::string sk_ab = "AB"; - std::string sk_b = "B"; - std::string sk_c = "C"; - std::string sk_z = "Z"; - std::string sval1 = "1"; - std::string sval2 = "2"; - - StringRef k_a = StringRef(a, sk_a); - StringRef k_ab = StringRef(a, sk_ab); - StringRef k_b = StringRef(a, sk_b); - StringRef k_c = StringRef(a, sk_c); - StringRef k_z = StringRef(a, sk_z); - StringRef val1 = StringRef(a, sval1); - StringRef val2 = StringRef(a, sval2); + StringRef k_a = StringRef(a, "A"_sr); + StringRef k_ab = StringRef(a, "AB"_sr); + StringRef k_b = StringRef(a, "B"_sr); + StringRef k_c = StringRef(a, "C"_sr); + StringRef k_z = StringRef(a, "Z"_sr); + StringRef val1 = StringRef(a, "1"_sr); + StringRef val2 = StringRef(a, "2"_sr); std::map data; data.insert({ k_a, val1 }); @@ -2033,7 +1972,7 @@ struct KeyValueGen { sharedPrefix = sharedPrefix.substr(0, sharedPrefixLen) + "_"; targetValueLength = deterministicRandom()->randomExp(0, 12); allRange = KeyRangeRef(StringRef(sharedPrefix), - sharedPrefix.size() == 0 ? LiteralStringRef("\xff") : strinc(StringRef(sharedPrefix))); + sharedPrefix.size() == 0 ? 
"\xff"_sr : strinc(StringRef(sharedPrefix))); if (deterministicRandom()->coinflip()) { clearFrequency = 0.0; @@ -2068,11 +2007,7 @@ struct KeyValueGen { cipherKeys = getCipherKeysCtx(ar); } if (deterministicRandom()->coinflip()) { -#ifdef ZLIB_LIB_SUPPORTED - compressFilter = CompressionFilter::GZIP; -#else - compressFilter = CompressionFilter::NONE; -#endif + compressFilter = CompressionUtils::getRandomFilter(); } } @@ -2208,7 +2143,6 @@ Standalone genSnapshot(KeyValueGen& kvGen, int targetDataBytes) while (totalDataBytes < targetDataBytes) { Optional key = kvGen.newKey(); if (!key.present()) { - CODE_PROBE(true, "snapshot unit test keyspace full"); break; } StringRef value = kvGen.value(); @@ -2253,10 +2187,8 @@ TEST_CASE("/blobgranule/files/validateEncryptionCompression") { BlobGranuleCipherKeysCtx cipherKeys = getCipherKeysCtx(ar); std::vector encryptionModes = { false, true }; std::vector> compressionModes; - compressionModes.push_back({}); -#ifdef ZLIB_LIB_SUPPORTED - compressionModes.push_back(CompressionFilter::GZIP); -#endif + compressionModes.insert( + compressionModes.end(), CompressionUtils::supportedFilters.begin(), CompressionUtils::supportedFilters.end()); std::vector snapshotValues; for (bool encryptionMode : encryptionModes) { @@ -2353,9 +2285,9 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") { } checkSnapshotEmpty(serialized, normalKeys.begin, data.front().key, kvGen.cipherKeys); - checkSnapshotEmpty(serialized, normalKeys.begin, LiteralStringRef("\x00"), kvGen.cipherKeys); + checkSnapshotEmpty(serialized, normalKeys.begin, "\x00"_sr, kvGen.cipherKeys); checkSnapshotEmpty(serialized, keyAfter(data.back().key), normalKeys.end, kvGen.cipherKeys); - checkSnapshotEmpty(serialized, LiteralStringRef("\xfe"), normalKeys.end, kvGen.cipherKeys); + checkSnapshotEmpty(serialized, "\xfe"_sr, normalKeys.end, kvGen.cipherKeys); fmt::print("Snapshot format test done!\n"); @@ -2384,7 +2316,6 @@ void checkDeltaRead(const KeyValueGen& kvGen, std::string filename = randomBGFilename( deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUniqueID(), readVersion, ".delta"); Standalone chunk; - // TODO need to add cipher keys meta chunk.deltaFiles.emplace_back_deep( chunk.arena(), filename, 0, serialized->size(), serialized->size(), kvGen.cipherKeys); chunk.keyRange = kvGen.allRange; @@ -2441,7 +2372,6 @@ static std::tuple randomizeKeyAndVersions(const KeyV } } - // TODO randomize begin and read version to sometimes +/- 1 and readRange begin and end to keyAfter sometimes return { readRange, beginVersion, readVersion }; } @@ -2665,7 +2595,11 @@ TEST_CASE("/blobgranule/files/granuleReadUnitTest") { serializedDeltaFiles, inMemoryDeltas); - for (int i = 0; i < std::min(100, 5 + snapshotData.size() * deltaData.size()); i++) { + // prevent overflow by doing min before multiply + int maxRuns = 100; + int snapshotAndDeltaSize = 5 + std::min(maxRuns, snapshotData.size()) * std::min(maxRuns, deltaData.size()); + int lim = std::min(maxRuns, snapshotAndDeltaSize); + for (int i = 0; i < lim; i++) { auto params = randomizeKeyAndVersions(kvGen, deltaData); fmt::print("Partial test {0}: [{1} - {2}) @ {3} - {4}\n", i, @@ -2968,9 +2902,8 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") { std::vector encryptionModes = { false, true }; std::vector> compressionModes; compressionModes.push_back({}); -#ifdef ZLIB_LIB_SUPPORTED - compressionModes.push_back(CompressionFilter::GZIP); -#endif + compressionModes.insert( + compressionModes.end(), CompressionUtils::supportedFilters.begin(), 
CompressionUtils::supportedFilters.end()); std::vector runNames = { "logical" }; std::vector> snapshotMetrics; @@ -3000,6 +2933,10 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") { if (!chunk && compressionFilter.present()) { continue; } + if (compressionFilter.present() && CompressionFilter::NONE == compressionFilter.get()) { + continue; + } + std::string name; if (!chunk) { name = "old"; @@ -3072,11 +3009,15 @@ TEST_CASE("!/blobgranule/files/benchFromFiles") { if (!chunk && encrypt) { continue; } + Optional keys = encrypt ? cipherKeys : Optional(); for (auto& compressionFilter : compressionModes) { if (!chunk && compressionFilter.present()) { continue; } + if (compressionFilter.present() && CompressionFilter::NONE == compressionFilter.get()) { + continue; + } std::string name; if (!chunk) { name = "old"; diff --git a/fdbclient/BlobGranuleReader.actor.cpp b/fdbclient/BlobGranuleReader.actor.cpp index e0f627a9da..583da353f7 100644 --- a/fdbclient/BlobGranuleReader.actor.cpp +++ b/fdbclient/BlobGranuleReader.actor.cpp @@ -31,13 +31,6 @@ #include "fdbclient/FDBTypes.h" #include "flow/actorcompiler.h" // This must be the last #include. -// TODO more efficient data structure besides std::map? PTree is unnecessary since this isn't versioned, but some other -// sorted thing could work. And if it used arenas it'd probably be more efficient with allocations, since everything -// else is in 1 arena and discarded at the end. - -// TODO could refactor the file reading code from here and the delta file function into another actor, -// then this part would also be testable? but meh - ACTOR Future> readFile(Reference bstoreProvider, BlobFilePointerRef f) { try { state Arena arena; @@ -140,3 +133,66 @@ ACTOR Future readBlobGranules(BlobGranuleFileRequest request, return Void(); } + +// Return true if a given range is fully covered by blob chunks +bool isRangeFullyCovered(KeyRange range, Standalone> blobChunks) { + std::vector blobRanges; + for (const BlobGranuleChunkRef& chunk : blobChunks) { + blobRanges.push_back(chunk.keyRange); + } + + return range.isCovered(blobRanges); +} + +void testAddChunkRange(KeyRef begin, KeyRef end, Standalone>& chunks) { + BlobGranuleChunkRef chunk; + chunk.keyRange = KeyRangeRef(begin, end); + chunks.push_back(chunks.arena(), chunk); +} + +TEST_CASE("/fdbserver/blobgranule/isRangeCoveredByBlob") { + Standalone> chunks; + // chunk1 key_a1 - key_a9 + testAddChunkRange("key_a1"_sr, "key_a9"_sr, chunks); + // chunk2 key_b1 - key_b9 + testAddChunkRange("key_b1"_sr, "key_b9"_sr, chunks); + + // check empty range. not covered + { ASSERT(isRangeFullyCovered(KeyRangeRef(), chunks) == false); } + + // check empty chunks. 
not covered + { + Standalone> empyChunks; + ASSERT(isRangeFullyCovered(KeyRangeRef(), empyChunks) == false); + } + + // check '' to \xff + { ASSERT(isRangeFullyCovered(KeyRangeRef(""_sr, "\xff"_sr), chunks) == false); } + + // check {key_a1, key_a9} + { ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_a9"_sr), chunks)); } + + // check {key_a1, key_a3} + { ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_a3"_sr), chunks)); } + + // check {key_a0, key_a3} + { ASSERT(isRangeFullyCovered(KeyRangeRef("key_a0"_sr, "key_a3"_sr), chunks) == false); } + + // check {key_a5, key_b2} + { + auto range = KeyRangeRef("key_a5"_sr, "key_b5"_sr); + ASSERT(isRangeFullyCovered(range, chunks) == false); + ASSERT(range.begin == "key_a5"_sr); + ASSERT(range.end == "key_b5"_sr); + } + + // check continued chunks + { + Standalone> continuedChunks; + testAddChunkRange("key_a1"_sr, "key_a9"_sr, continuedChunks); + testAddChunkRange("key_a9"_sr, "key_b1"_sr, continuedChunks); + testAddChunkRange("key_b1"_sr, "key_b9"_sr, continuedChunks); + ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks) == false); + } + return Void(); +} diff --git a/fdbclient/BuildFlags.h.in b/fdbclient/BuildFlags.h.in index 6f94c540f8..b55c7e55dd 100644 --- a/fdbclient/BuildFlags.h.in +++ b/fdbclient/BuildFlags.h.in @@ -33,6 +33,9 @@ #define C_VERSION_MINOR 0 #endif +const char* kDate = __DATE__; +const char* kTime = __TIME__; + // FDB info. const std::string kGitHash = "@CURRENT_GIT_VERSION_WNL@"; const std::string kFdbVersion = "@FDB_VERSION@"; @@ -43,7 +46,7 @@ const std::string kArch = "@CMAKE_SYSTEM@"; const std::string kCompiler = "@CMAKE_CXX_COMPILER_ID@"; // Library versions. -const std::string kBoostVersion = "@Boost_LIB_VERSION@"; +const std::string kBoostVersion = BOOST_LIB_VERSION; // Build info and flags. 
const std::string kCMakeVersion = "@CMAKE_VERSION@"; @@ -61,6 +64,9 @@ std::string jsonBuildInformation() { json_spirit::mValue json; JSONDoc doc(json); + doc.create("build_date") = kDate; + doc.create("build_time") = kTime; + doc.create("git_hash") = kGitHash; doc.create("fdb_version") = kFdbVersion; diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index 2953a360e7..0a477a5b8a 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -64,7 +64,7 @@ endif() if(WITH_AWS_BACKUP) - add_compile_definitions(BUILD_AWS_BACKUP) + add_compile_definitions(WITH_AWS_BACKUP) include(awssdk) endif() diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 4700fa547c..b15f7c9583 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -81,6 +81,7 @@ void ClientKnobs::initialize(Randomize randomize) { init( CHANGE_FEED_CACHE_SIZE, 100000 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_SIZE = 1; init( CHANGE_FEED_POP_TIMEOUT, 10.0 ); init( CHANGE_FEED_STREAM_MIN_BYTES, 1e4 ); if( randomize && BUGGIFY ) CHANGE_FEED_STREAM_MIN_BYTES = 1; + init( CHANGE_FEED_START_INTERVAL, 10.0 ); init( MAX_BATCH_SIZE, 1000 ); if( randomize && BUGGIFY ) MAX_BATCH_SIZE = 1; init( GRV_BATCH_TIMEOUT, 0.005 ); if( randomize && BUGGIFY ) GRV_BATCH_TIMEOUT = 0.1; @@ -263,12 +264,13 @@ void ClientKnobs::initialize(Randomize randomize) { init( MAX_TAGS_PER_TRANSACTION, 5 ); init( MAX_TRANSACTION_TAG_LENGTH, 16 ); init( COMMIT_SAMPLE_COST, 100 ); if( randomize && BUGGIFY ) COMMIT_SAMPLE_COST = 10; - init( WRITE_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096; init( INCOMPLETE_SHARD_PLUS, 4096 ); init( READ_TAG_SAMPLE_RATE, 0.01 ); if( randomize && BUGGIFY ) READ_TAG_SAMPLE_RATE = 1.0; // Communicated to clients from cluster init( TAG_THROTTLE_SMOOTHING_WINDOW, 2.0 ); init( TAG_THROTTLE_RECHECK_INTERVAL, 5.0 ); if( randomize && BUGGIFY ) TAG_THROTTLE_RECHECK_INTERVAL = 0.0; init( TAG_THROTTLE_EXPIRATION_INTERVAL, 60.0 ); if( randomize && BUGGIFY ) TAG_THROTTLE_EXPIRATION_INTERVAL = 1.0; + init( WRITE_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096; + init( READ_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096; // busyness reporting init( BUSYNESS_SPIKE_START_THRESHOLD, 0.100 ); @@ -276,7 +278,7 @@ void ClientKnobs::initialize(Randomize randomize) { // Blob granules init( BG_MAX_GRANULE_PARALLELISM, 10 ); - init( BG_TOO_MANY_GRANULES, 1000 ); + init( BG_TOO_MANY_GRANULES, 10000 ); init( CHANGE_QUORUM_BAD_STATE_RETRY_TIMES, 3 ); init( CHANGE_QUORUM_BAD_STATE_RETRY_DELAY, 2.0 ); @@ -289,7 +291,9 @@ void ClientKnobs::initialize(Randomize randomize) { init( METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK, 5 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK = 1; init( METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY, 1.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY = deterministicRandom()->random01() * 60; init( METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT, 10.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT = 1 + deterministicRandom()->random01() * 59; + init( TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); + init( ENABLE_ENCRYPTION_CPU_TIME_LOGGING, false ); // clang-format on } diff --git a/fdbclient/DatabaseBackupAgent.actor.cpp b/fdbclient/DatabaseBackupAgent.actor.cpp index 80d4a16cc9..548f03c46c 
100644 --- a/fdbclient/DatabaseBackupAgent.actor.cpp +++ b/fdbclient/DatabaseBackupAgent.actor.cpp @@ -37,11 +37,11 @@ #include "flow/actorcompiler.h" // has to be last include -const Key DatabaseBackupAgent::keyAddPrefix = LiteralStringRef("add_prefix"); -const Key DatabaseBackupAgent::keyRemovePrefix = LiteralStringRef("remove_prefix"); -const Key DatabaseBackupAgent::keyRangeVersions = LiteralStringRef("range_versions"); -const Key DatabaseBackupAgent::keyCopyStop = LiteralStringRef("copy_stop"); -const Key DatabaseBackupAgent::keyDatabasesInSync = LiteralStringRef("databases_in_sync"); +const Key DatabaseBackupAgent::keyAddPrefix = "add_prefix"_sr; +const Key DatabaseBackupAgent::keyRemovePrefix = "remove_prefix"_sr; +const Key DatabaseBackupAgent::keyRangeVersions = "range_versions"_sr; +const Key DatabaseBackupAgent::keyCopyStop = "copy_stop"_sr; +const Key DatabaseBackupAgent::keyDatabasesInSync = "databases_in_sync"_sr; const int DatabaseBackupAgent::LATEST_DR_VERSION = 1; DatabaseBackupAgent::DatabaseBackupAgent() @@ -75,14 +75,13 @@ DatabaseBackupAgent::DatabaseBackupAgent(Database src) class DRConfig { public: DRConfig(UID uid = UID()) - : uid(uid), - configSpace(uidPrefixKey(LiteralStringRef("uid->config/").withPrefix(databaseBackupPrefixRange.begin), uid)) {} + : uid(uid), configSpace(uidPrefixKey("uid->config/"_sr.withPrefix(databaseBackupPrefixRange.begin), uid)) {} DRConfig(Reference task) : DRConfig(BinaryReader::fromStringRef(task->params[BackupAgentBase::keyConfigLogUid], Unversioned())) {} - KeyBackedBinaryValue rangeBytesWritten() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue rangeBytesWritten() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedBinaryValue logBytesWritten() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue logBytesWritten() { return configSpace.pack(__FUNCTION__sr); } void clear(Reference tr) { tr->clear(configSpace.range()); } @@ -137,7 +136,7 @@ struct BackupRangeTaskFunc : TaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam bytesWritten() { return LiteralStringRef(__FUNCTION__); } + static TaskParam bytesWritten() { return __FUNCTION__sr; } } Params; static const Key keyAddBackupRangeTasks; @@ -203,7 +202,7 @@ struct BackupRangeTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } ACTOR static Future _execute(Database cx, @@ -405,10 +404,10 @@ struct BackupRangeTaskFunc : TaskFuncBase { break; if (backupVersions.get()[versionLoc + 1].key == - (removePrefix == StringRef() ? normalKeys.end : strinc(removePrefix))) { + (removePrefix == StringRef() ? allKeys.end : strinc(removePrefix))) { tr->clear(KeyRangeRef( backupVersions.get()[versionLoc].key.removePrefix(removePrefix).withPrefix(addPrefix), - addPrefix == StringRef() ? normalKeys.end : strinc(addPrefix))); + addPrefix == StringRef() ? 
allKeys.end : strinc(addPrefix))); } else { tr->clear(KeyRangeRef(backupVersions.get()[versionLoc].key, backupVersions.get()[versionLoc + 1].key) @@ -536,9 +535,9 @@ struct BackupRangeTaskFunc : TaskFuncBase { return Void(); } }; -StringRef BackupRangeTaskFunc::name = LiteralStringRef("dr_backup_range"); -const Key BackupRangeTaskFunc::keyAddBackupRangeTasks = LiteralStringRef("addBackupRangeTasks"); -const Key BackupRangeTaskFunc::keyBackupRangeBeginKey = LiteralStringRef("backupRangeBeginKey"); +StringRef BackupRangeTaskFunc::name = "dr_backup_range"_sr; +const Key BackupRangeTaskFunc::keyAddBackupRangeTasks = "addBackupRangeTasks"_sr; +const Key BackupRangeTaskFunc::keyBackupRangeBeginKey = "backupRangeBeginKey"_sr; REGISTER_TASKFUNC(BackupRangeTaskFunc); struct FinishFullBackupTaskFunc : TaskFuncBase { @@ -588,7 +587,7 @@ struct FinishFullBackupTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -606,7 +605,7 @@ struct FinishFullBackupTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef FinishFullBackupTaskFunc::name = LiteralStringRef("dr_finish_full_backup"); +StringRef FinishFullBackupTaskFunc::name = "dr_finish_full_backup"_sr; REGISTER_TASKFUNC(FinishFullBackupTaskFunc); struct EraseLogRangeTaskFunc : TaskFuncBase { @@ -683,7 +682,7 @@ struct EraseLogRangeTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } ACTOR static Future _finish(Reference tr, @@ -697,7 +696,7 @@ struct EraseLogRangeTaskFunc : TaskFuncBase { return Void(); } }; -StringRef EraseLogRangeTaskFunc::name = LiteralStringRef("dr_erase_log_range"); +StringRef EraseLogRangeTaskFunc::name = "dr_erase_log_range"_sr; REGISTER_TASKFUNC(EraseLogRangeTaskFunc); struct CopyLogRangeTaskFunc : TaskFuncBase { @@ -705,7 +704,7 @@ struct CopyLogRangeTaskFunc : TaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam bytesWritten() { return LiteralStringRef(__FUNCTION__); } + static TaskParam bytesWritten() { return __FUNCTION__sr; } } Params; static const Key keyNextBeginVersion; @@ -958,7 +957,7 @@ struct CopyLogRangeTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } ACTOR static Future _finish(Reference tr, @@ -989,8 +988,8 @@ struct CopyLogRangeTaskFunc : TaskFuncBase { return Void(); } }; -StringRef CopyLogRangeTaskFunc::name = LiteralStringRef("dr_copy_log_range"); -const Key CopyLogRangeTaskFunc::keyNextBeginVersion = LiteralStringRef("nextBeginVersion"); +StringRef CopyLogRangeTaskFunc::name = "dr_copy_log_range"_sr; +const Key CopyLogRangeTaskFunc::keyNextBeginVersion = "nextBeginVersion"_sr; REGISTER_TASKFUNC(CopyLogRangeTaskFunc); struct CopyLogsTaskFunc : TaskFuncBase { @@ -1125,7 +1124,7 @@ struct CopyLogsTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -1143,7 +1142,7 @@ struct CopyLogsTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; 
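The DatabaseBackupAgent hunks above are a mechanical migration from LiteralStringRef("...") to the "..."_sr literal suffix, and from LiteralStringRef(__FUNCTION__) to __FUNCTION__sr. Here is a self-contained sketch of what such a suffix looks like; the real StringRef type and the __FUNCTION__sr helper live in flow and differ in detail.

```cpp
#include <cstddef>
#include <cstdio>

// Simplified stand-in for flow's StringRef (assumption: the real type is richer).
struct StringRefSketch {
    const char* data;
    size_t len;
};

// A user-defined literal lets "add_prefix"_sr capture the length at compile time,
// which is what LiteralStringRef() previously did via sizeof on the char array.
constexpr StringRefSketch operator""_sr(const char* s, size_t n) {
    return StringRefSketch{ s, n };
}

// flow also provides a __FUNCTION__sr helper (roughly: wrap __FUNCTION__ and its
// length) so task parameter keys can be named after the enclosing function.

int main() {
    constexpr auto key = "add_prefix"_sr;
    std::printf("%zu\n", key.len); // prints 10
    return 0;
}
```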
-StringRef CopyLogsTaskFunc::name = LiteralStringRef("dr_copy_logs"); +StringRef CopyLogsTaskFunc::name = "dr_copy_logs"_sr; REGISTER_TASKFUNC(CopyLogsTaskFunc); struct FinishedFullBackupTaskFunc : TaskFuncBase { @@ -1235,7 +1234,7 @@ struct FinishedFullBackupTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } ACTOR static Future _finish(Reference tr, @@ -1283,8 +1282,8 @@ struct FinishedFullBackupTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef FinishedFullBackupTaskFunc::name = LiteralStringRef("dr_finished_full_backup"); -const Key FinishedFullBackupTaskFunc::keyInsertTask = LiteralStringRef("insertTask"); +StringRef FinishedFullBackupTaskFunc::name = "dr_finished_full_backup"_sr; +const Key FinishedFullBackupTaskFunc::keyInsertTask = "insertTask"_sr; REGISTER_TASKFUNC(FinishedFullBackupTaskFunc); struct CopyDiffLogsTaskFunc : TaskFuncBase { @@ -1396,7 +1395,7 @@ struct CopyDiffLogsTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -1414,7 +1413,7 @@ struct CopyDiffLogsTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef CopyDiffLogsTaskFunc::name = LiteralStringRef("dr_copy_diff_logs"); +StringRef CopyDiffLogsTaskFunc::name = "dr_copy_diff_logs"_sr; REGISTER_TASKFUNC(CopyDiffLogsTaskFunc); // Skip unneeded EraseLogRangeTaskFunc in 5.1 @@ -1446,7 +1445,7 @@ struct SkipOldEraseLogRangeTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef SkipOldEraseLogRangeTaskFunc::name = LiteralStringRef("dr_skip_legacy_task"); +StringRef SkipOldEraseLogRangeTaskFunc::name = "dr_skip_legacy_task"_sr; REGISTER_TASKFUNC(SkipOldEraseLogRangeTaskFunc); REGISTER_TASKFUNC_ALIAS(SkipOldEraseLogRangeTaskFunc, db_erase_log_range); @@ -1456,7 +1455,7 @@ struct OldCopyLogRangeTaskFunc : TaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam bytesWritten() { return LiteralStringRef(__FUNCTION__); } + static TaskParam bytesWritten() { return __FUNCTION__sr; } } Params; static const Key keyNextBeginVersion; @@ -1652,7 +1651,7 @@ struct OldCopyLogRangeTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } ACTOR static Future _finish(Reference tr, @@ -1683,8 +1682,8 @@ struct OldCopyLogRangeTaskFunc : TaskFuncBase { return Void(); } }; -StringRef OldCopyLogRangeTaskFunc::name = LiteralStringRef("db_copy_log_range"); -const Key OldCopyLogRangeTaskFunc::keyNextBeginVersion = LiteralStringRef("nextBeginVersion"); +StringRef OldCopyLogRangeTaskFunc::name = "db_copy_log_range"_sr; +const Key OldCopyLogRangeTaskFunc::keyNextBeginVersion = "nextBeginVersion"_sr; REGISTER_TASKFUNC(OldCopyLogRangeTaskFunc); struct AbortOldBackupTaskFunc : TaskFuncBase { @@ -1753,7 +1752,7 @@ struct AbortOldBackupTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -1771,7 +1770,7 @@ struct AbortOldBackupTaskFunc : TaskFuncBase { 
return _finish(tr, tb, fb, task); }; }; -StringRef AbortOldBackupTaskFunc::name = LiteralStringRef("dr_abort_legacy_backup"); +StringRef AbortOldBackupTaskFunc::name = "dr_abort_legacy_backup"_sr; REGISTER_TASKFUNC(AbortOldBackupTaskFunc); REGISTER_TASKFUNC_ALIAS(AbortOldBackupTaskFunc, db_backup_range); REGISTER_TASKFUNC_ALIAS(AbortOldBackupTaskFunc, db_finish_full_backup); @@ -1834,13 +1833,16 @@ struct CopyDiffLogsUpgradeTaskFunc : TaskFuncBase { return Void(); } - if (backupRanges.size() == 1) { + if (backupRanges.size() == 1 || isDefaultBackup(backupRanges)) { RangeResult existingDestUidValues = wait(srcTr->getRange( KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY)); bool found = false; + KeyRangeRef targetRange = + (backupRanges.size() == 1) ? backupRanges[0] : getDefaultBackupSharedRange(); for (auto it : existingDestUidValues) { - if (BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), - IncludeVersion()) == backupRanges[0]) { + KeyRange uidRange = BinaryReader::fromStringRef( + it.key.removePrefix(destUidLookupPrefix), IncludeVersion()); + if (uidRange == targetRange) { if (destUidValue != it.value) { // existing backup/DR is running return Void(); @@ -1856,7 +1858,7 @@ struct CopyDiffLogsUpgradeTaskFunc : TaskFuncBase { } srcTr->set( - BinaryWriter::toValue(backupRanges[0], IncludeVersion(ProtocolVersion::withSharedMutations())) + BinaryWriter::toValue(targetRange, IncludeVersion(ProtocolVersion::withSharedMutations())) .withPrefix(destUidLookupPrefix), destUidValue); } @@ -1918,7 +1920,7 @@ struct CopyDiffLogsUpgradeTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef CopyDiffLogsUpgradeTaskFunc::name = LiteralStringRef("db_copy_diff_logs"); +StringRef CopyDiffLogsUpgradeTaskFunc::name = "db_copy_diff_logs"_sr; REGISTER_TASKFUNC(CopyDiffLogsUpgradeTaskFunc); struct BackupRestorableTaskFunc : TaskFuncBase { @@ -2031,7 +2033,7 @@ struct BackupRestorableTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -2049,7 +2051,7 @@ struct BackupRestorableTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef BackupRestorableTaskFunc::name = LiteralStringRef("dr_backup_restorable"); +StringRef BackupRestorableTaskFunc::name = "dr_backup_restorable"_sr; REGISTER_TASKFUNC(BackupRestorableTaskFunc); struct StartFullBackupTaskFunc : TaskFuncBase { @@ -2078,24 +2080,29 @@ struct StartFullBackupTaskFunc : TaskFuncBase { srcTr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); // Initialize destUid - if (backupRanges.size() == 1) { + if (backupRanges.size() == 1 || isDefaultBackup(backupRanges)) { RangeResult existingDestUidValues = wait(srcTr->getRange( KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY)); + KeyRangeRef targetRange = + (backupRanges.size() == 1) ? 
backupRanges[0] : getDefaultBackupSharedRange(); bool found = false; for (auto it : existingDestUidValues) { - if (BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), - IncludeVersion()) == backupRanges[0]) { + KeyRange uidRange = BinaryReader::fromStringRef( + it.key.removePrefix(destUidLookupPrefix), IncludeVersion()); + if (uidRange == targetRange) { destUidValue = it.value; found = true; + CODE_PROBE(targetRange == getDefaultBackupSharedRange(), + "DR mutation sharing with default backup"); break; } } if (!found) { destUidValue = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned()); - srcTr->set(BinaryWriter::toValue(backupRanges[0], - IncludeVersion(ProtocolVersion::withSharedMutations())) - .withPrefix(destUidLookupPrefix), - destUidValue); + srcTr->set( + BinaryWriter::toValue(targetRange, IncludeVersion(ProtocolVersion::withSharedMutations())) + .withPrefix(destUidLookupPrefix), + destUidValue); } } @@ -2281,7 +2288,7 @@ struct StartFullBackupTaskFunc : TaskFuncBase { task->params[BackupAgentBase::keyConfigBackupRanges] = keyConfigBackupRanges; task->params[BackupAgentBase::keyTagName] = tagName; task->params[DatabaseBackupAgent::keyDatabasesInSync] = - backupAction == DatabaseBackupAgent::PreBackupAction::NONE ? LiteralStringRef("t") : LiteralStringRef("f"); + backupAction == DatabaseBackupAgent::PreBackupAction::NONE ? "t"_sr : "f"_sr; if (!waitFor) { return taskBucket->addTask(tr, @@ -2301,7 +2308,7 @@ struct StartFullBackupTaskFunc : TaskFuncBase { .get(logUid) .pack(BackupAgentBase::keyFolderId), task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -2319,7 +2326,7 @@ struct StartFullBackupTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef StartFullBackupTaskFunc::name = LiteralStringRef("dr_start_full_backup"); +StringRef StartFullBackupTaskFunc::name = "dr_start_full_backup"_sr; REGISTER_TASKFUNC(StartFullBackupTaskFunc); } // namespace dbBackup @@ -2625,7 +2632,7 @@ public: int64_t startCount = 0; state Key mapPrefix = logUidValue.withPrefix(applyMutationsKeyVersionMapRange.begin); - Key mapEnd = normalKeys.end.withPrefix(mapPrefix); + Key mapEnd = allKeys.end.withPrefix(mapPrefix); tr->set(logUidValue.withPrefix(applyMutationsAddPrefixRange.begin), addPrefix); tr->set(logUidValue.withPrefix(applyMutationsRemovePrefixRange.begin), removePrefix); tr->set(logUidValue.withPrefix(applyMutationsKeyVersionCountRange.begin), StringRef((uint8_t*)&startCount, 8)); @@ -3061,6 +3068,9 @@ public: loop { try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + wait(success(tr->getReadVersion())); // get the read version before getting a version from the source // database to prevent the time differential from going negative @@ -3072,9 +3082,6 @@ public: state UID logUid = wait(backupAgent->getLogUid(tr, tagName)); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - state Future> fPaused = tr->get(backupAgent->taskBucket->getPauseKey()); state Future fErrorValues = errorLimit > 0 diff --git a/fdbclient/DatabaseConfiguration.cpp b/fdbclient/DatabaseConfiguration.cpp index f19782346a..76fded095c 100644 --- a/fdbclient/DatabaseConfiguration.cpp +++ b/fdbclient/DatabaseConfiguration.cpp @@ -19,6 +19,7 @@ */ #include "fdbclient/DatabaseConfiguration.h" +#include 
"fdbclient/FDBTypes.h" #include "fdbclient/SystemData.h" #include "flow/ITrace.h" #include "flow/Trace.h" @@ -53,6 +54,7 @@ void DatabaseConfiguration::resetInternal() { storageMigrationType = StorageMigrationType::DEFAULT; blobGranulesEnabled = false; tenantMode = TenantMode::DISABLED; + encryptionAtRestMode = EncryptionAtRestMode::DISABLED; } int toInt(ValueRef const& v) { @@ -64,6 +66,16 @@ void parse(int* i, ValueRef const& v) { *i = atoi(v.toString().c_str()); } +void parse(int64_t* i, ValueRef const& v) { + // FIXME: Sanity checking + *i = atoll(v.toString().c_str()); +} + +void parse(double* i, ValueRef const& v) { + // FIXME: Sanity checking + *i = atof(v.toString().c_str()); +} + void parseReplicationPolicy(Reference* policy, ValueRef const& v) { BinaryReader reader(v, IncludeVersion()); serializeReplicationPolicy(reader, *policy); @@ -213,7 +225,8 @@ bool DatabaseConfiguration::isValid() const { (perpetualStorageWiggleSpeed == 0 || perpetualStorageWiggleSpeed == 1) && isValidPerpetualStorageWiggleLocality(perpetualStorageWiggleLocality) && storageMigrationType != StorageMigrationType::UNSET && tenantMode >= TenantMode::DISABLED && - tenantMode < TenantMode::END)) { + tenantMode < TenantMode::END && encryptionAtRestMode >= EncryptionAtRestMode::DISABLED && + encryptionAtRestMode < EncryptionAtRestMode::END)) { return false; } std::set dcIds; @@ -413,6 +426,7 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const { result["storage_migration_type"] = storageMigrationType.toString(); result["blob_granules_enabled"] = (int32_t)blobGranulesEnabled; result["tenant_mode"] = tenantMode.toString(); + result["encryption_at_rest_mode"] = encryptionAtRestMode.toString(); return result; } @@ -546,38 +560,38 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) { KeyRef ck = key.removePrefix(configKeysPrefix); int type; - if (ck == LiteralStringRef("initialized")) { + if (ck == "initialized"_sr) { initialized = true; - } else if (ck == LiteralStringRef("commit_proxies")) { + } else if (ck == "commit_proxies"_sr) { commitProxyCount = toInt(value); if (commitProxyCount == -1) overwriteProxiesCount(); - } else if (ck == LiteralStringRef("grv_proxies")) { + } else if (ck == "grv_proxies"_sr) { grvProxyCount = toInt(value); if (grvProxyCount == -1) overwriteProxiesCount(); - } else if (ck == LiteralStringRef("resolvers")) { + } else if (ck == "resolvers"_sr) { parse(&resolverCount, value); - } else if (ck == LiteralStringRef("logs")) { + } else if (ck == "logs"_sr) { parse(&desiredTLogCount, value); - } else if (ck == LiteralStringRef("log_replicas")) { + } else if (ck == "log_replicas"_sr) { parse(&tLogReplicationFactor, value); tLogWriteAntiQuorum = std::min(tLogWriteAntiQuorum, tLogReplicationFactor / 2); - } else if (ck == LiteralStringRef("log_anti_quorum")) { + } else if (ck == "log_anti_quorum"_sr) { parse(&tLogWriteAntiQuorum, value); if (tLogReplicationFactor > 0) { tLogWriteAntiQuorum = std::min(tLogWriteAntiQuorum, tLogReplicationFactor / 2); } - } else if (ck == LiteralStringRef("storage_replicas")) { + } else if (ck == "storage_replicas"_sr) { parse(&storageTeamSize, value); - } else if (ck == LiteralStringRef("tss_count")) { + } else if (ck == "tss_count"_sr) { parse(&desiredTSSCount, value); - } else if (ck == LiteralStringRef("log_version")) { + } else if (ck == "log_version"_sr) { parse((&type), value); type = std::max((int)TLogVersion::MIN_RECRUITABLE, type); type = std::min((int)TLogVersion::MAX_SUPPORTED, type); tLogVersion = 
(TLogVersion::Version)type; - } else if (ck == LiteralStringRef("log_engine")) { + } else if (ck == "log_engine"_sr) { parse((&type), value); tLogDataStoreType = (KeyValueStoreType::StoreType)type; // TODO: Remove this once Redwood works as a log engine @@ -588,61 +602,63 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) { if (tLogDataStoreType == KeyValueStoreType::MEMORY_RADIXTREE) { tLogDataStoreType = KeyValueStoreType::SSD_BTREE_V2; } - } else if (ck == LiteralStringRef("log_spill")) { + } else if (ck == "log_spill"_sr) { parse((&type), value); tLogSpillType = (TLogSpillType::SpillType)type; - } else if (ck == LiteralStringRef("storage_engine")) { + } else if (ck == "storage_engine"_sr) { parse((&type), value); storageServerStoreType = (KeyValueStoreType::StoreType)type; - } else if (ck == LiteralStringRef("tss_storage_engine")) { + } else if (ck == "tss_storage_engine"_sr) { parse((&type), value); testingStorageServerStoreType = (KeyValueStoreType::StoreType)type; - } else if (ck == LiteralStringRef("auto_commit_proxies")) { + } else if (ck == "auto_commit_proxies"_sr) { parse(&autoCommitProxyCount, value); - } else if (ck == LiteralStringRef("auto_grv_proxies")) { + } else if (ck == "auto_grv_proxies"_sr) { parse(&autoGrvProxyCount, value); - } else if (ck == LiteralStringRef("auto_resolvers")) { + } else if (ck == "auto_resolvers"_sr) { parse(&autoResolverCount, value); - } else if (ck == LiteralStringRef("auto_logs")) { + } else if (ck == "auto_logs"_sr) { parse(&autoDesiredTLogCount, value); - } else if (ck == LiteralStringRef("storage_replication_policy")) { + } else if (ck == "storage_replication_policy"_sr) { parseReplicationPolicy(&storagePolicy, value); - } else if (ck == LiteralStringRef("log_replication_policy")) { + } else if (ck == "log_replication_policy"_sr) { parseReplicationPolicy(&tLogPolicy, value); - } else if (ck == LiteralStringRef("log_routers")) { + } else if (ck == "log_routers"_sr) { parse(&desiredLogRouterCount, value); - } else if (ck == LiteralStringRef("remote_logs")) { + } else if (ck == "remote_logs"_sr) { parse(&remoteDesiredTLogCount, value); - } else if (ck == LiteralStringRef("remote_log_replicas")) { + } else if (ck == "remote_log_replicas"_sr) { parse(&remoteTLogReplicationFactor, value); - } else if (ck == LiteralStringRef("remote_log_policy")) { + } else if (ck == "remote_log_policy"_sr) { parseReplicationPolicy(&remoteTLogPolicy, value); - } else if (ck == LiteralStringRef("backup_worker_enabled")) { + } else if (ck == "backup_worker_enabled"_sr) { parse((&type), value); backupWorkerEnabled = (type != 0); - } else if (ck == LiteralStringRef("usable_regions")) { + } else if (ck == "usable_regions"_sr) { parse(&usableRegions, value); - } else if (ck == LiteralStringRef("repopulate_anti_quorum")) { + } else if (ck == "repopulate_anti_quorum"_sr) { parse(&repopulateRegionAntiQuorum, value); - } else if (ck == LiteralStringRef("regions")) { + } else if (ck == "regions"_sr) { parse(®ions, value); - } else if (ck == LiteralStringRef("perpetual_storage_wiggle")) { + } else if (ck == "perpetual_storage_wiggle"_sr) { parse(&perpetualStorageWiggleSpeed, value); - } else if (ck == LiteralStringRef("perpetual_storage_wiggle_locality")) { + } else if (ck == "perpetual_storage_wiggle_locality"_sr) { if (!isValidPerpetualStorageWiggleLocality(value.toString())) { return false; } perpetualStorageWiggleLocality = value.toString(); - } else if (ck == LiteralStringRef("storage_migration_type")) { + } else if (ck == 
"storage_migration_type"_sr) { parse((&type), value); storageMigrationType = (StorageMigrationType::MigrationType)type; - } else if (ck == LiteralStringRef("tenant_mode")) { + } else if (ck == "tenant_mode"_sr) { tenantMode = TenantMode::fromValue(value); - } else if (ck == LiteralStringRef("proxies")) { + } else if (ck == "proxies"_sr) { overwriteProxiesCount(); - } else if (ck == LiteralStringRef("blob_granules_enabled")) { + } else if (ck == "blob_granules_enabled"_sr) { parse((&type), value); blobGranulesEnabled = (type != 0); + } else if (ck == "encryption_at_rest_mode"_sr) { + encryptionAtRestMode = EncryptionAtRestMode::fromValue(value); } else { return false; } diff --git a/fdbclient/FDBAWSCredentialsProvider.cpp b/fdbclient/FDBAWSCredentialsProvider.cpp index ceef03f935..304b1fcd2d 100644 --- a/fdbclient/FDBAWSCredentialsProvider.cpp +++ b/fdbclient/FDBAWSCredentialsProvider.cpp @@ -22,7 +22,7 @@ #include "fdbclient/FDBAWSCredentialsProvider.h" #include "fdbclient/Tracing.h" -#ifdef BUILD_AWS_BACKUP +#ifdef WITH_AWS_BACKUP // You're supposed to call AWS::ShutdownAPI(options); once done // But we want this to live for the lifetime of the process, so we don't do that diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index f579ce51a5..3a18359f84 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -21,14 +21,27 @@ #include "fmt/format.h" #include "fdbclient/BackupAgent.actor.h" #include "fdbclient/BackupContainer.h" +#include "fdbclient/BlobCipher.h" #include "fdbclient/DatabaseContext.h" +#include "fdbclient/GetEncryptCipherKeys.actor.h" +#include "fdbclient/JsonBuilder.h" +#include "fdbclient/KeyBackedTypes.h" #include "fdbclient/Knobs.h" #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/RestoreInterface.h" #include "fdbclient/Status.h" #include "fdbclient/SystemData.h" -#include "fdbclient/KeyBackedTypes.h" -#include "fdbclient/JsonBuilder.h" +#include "fdbclient/Tenant.h" +#include "fdbclient/TenantEntryCache.actor.h" + +#include "flow/Arena.h" +#include "flow/CodeProbe.h" +#include "flow/EncryptUtils.h" +#include "flow/network.h" +#include "flow/ObjectSerializer.h" +#include "flow/ProtocolVersion.h" +#include "flow/serialize.h" +#include "flow/Trace.h" #include #include @@ -36,10 +49,15 @@ #include "flow/IAsyncFile.h" #include "flow/genericactors.actor.h" #include "flow/Hash3.h" +#include "flow/xxhash.h" + +#include #include #include #include #include +#include +#include #include "flow/actorcompiler.h" // This must be the last #include. @@ -47,7 +65,7 @@ FDB_DEFINE_BOOLEAN_PARAM(IncrementalBackupOnly); FDB_DEFINE_BOOLEAN_PARAM(OnlyApplyMutationLogs); #define SevFRTestInfo SevVerbose -//#define SevFRTestInfo SevInfo +// #define SevFRTestInfo SevInfo static std::string boolToYesOrNo(bool val) { return val ? 
std::string("Yes") : std::string("No"); @@ -90,7 +108,7 @@ std::string secondsToTimeFormat(int64_t seconds) { return format("%lld second(s)", seconds); } -const Key FileBackupAgent::keyLastRestorable = LiteralStringRef("last_restorable"); +const Key FileBackupAgent::keyLastRestorable = "last_restorable"_sr; // For convenience typedef FileBackupAgent::ERestoreState ERestoreState; @@ -98,19 +116,19 @@ typedef FileBackupAgent::ERestoreState ERestoreState; StringRef FileBackupAgent::restoreStateText(ERestoreState id) { switch (id) { case ERestoreState::UNITIALIZED: - return LiteralStringRef("unitialized"); + return "unitialized"_sr; case ERestoreState::QUEUED: - return LiteralStringRef("queued"); + return "queued"_sr; case ERestoreState::STARTING: - return LiteralStringRef("starting"); + return "starting"_sr; case ERestoreState::RUNNING: - return LiteralStringRef("running"); + return "running"_sr; case ERestoreState::COMPLETED: - return LiteralStringRef("completed"); + return "completed"_sr; case ERestoreState::ABORTED: - return LiteralStringRef("aborted"); + return "aborted"_sr; default: - return LiteralStringRef("Unknown"); + return "Unknown"_sr; } } @@ -139,41 +157,37 @@ public: RestoreConfig(UID uid = UID()) : KeyBackedConfig(fileRestorePrefixRange.begin, uid) {} RestoreConfig(Reference task) : KeyBackedConfig(fileRestorePrefixRange.begin, task) {} - KeyBackedProperty stateEnum() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty stateEnum() { return configSpace.pack(__FUNCTION__sr); } Future stateText(Reference tr) { return map(stateEnum().getD(tr), [](ERestoreState s) -> StringRef { return FileBackupAgent::restoreStateText(s); }); } - KeyBackedProperty addPrefix() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty removePrefix() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty onlyApplyMutationLogs() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty inconsistentSnapshotOnly() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty addPrefix() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty removePrefix() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty onlyApplyMutationLogs() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty inconsistentSnapshotOnly() { return configSpace.pack(__FUNCTION__sr); } // XXX: Remove restoreRange() once it is safe to remove. 
It has been changed to restoreRanges - KeyBackedProperty restoreRange() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty> restoreRanges() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } - KeyBackedProperty batchFuture() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty beginVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty restoreVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty firstConsistentVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty restoreRange() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty> restoreRanges() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty batchFuture() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty beginVersion() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty restoreVersion() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty firstConsistentVersion() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty> sourceContainer() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty> sourceContainer() { return configSpace.pack(__FUNCTION__sr); } // Get the source container as a bare URL, without creating a container instance - KeyBackedProperty sourceContainerURL() { return configSpace.pack(LiteralStringRef("sourceContainer")); } + KeyBackedProperty sourceContainerURL() { return configSpace.pack("sourceContainer"_sr); } // Total bytes written by all log and range restore tasks. - KeyBackedBinaryValue bytesWritten() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue bytesWritten() { return configSpace.pack(__FUNCTION__sr); } // File blocks that have had tasks created for them by the Dispatch task - KeyBackedBinaryValue filesBlocksDispatched() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue filesBlocksDispatched() { return configSpace.pack(__FUNCTION__sr); } // File blocks whose tasks have finished - KeyBackedBinaryValue fileBlocksFinished() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue fileBlocksFinished() { return configSpace.pack(__FUNCTION__sr); } // Total number of files in the fileMap - KeyBackedBinaryValue fileCount() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue fileCount() { return configSpace.pack(__FUNCTION__sr); } // Total number of file blocks in the fileMap - KeyBackedBinaryValue fileBlockCount() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue fileBlockCount() { return configSpace.pack(__FUNCTION__sr); } Future> getRestoreRangesOrDefault(Reference tr) { return getRestoreRangesOrDefault_impl(this, tr); @@ -216,7 +230,7 @@ public: }; typedef KeyBackedSet FileSetT; - FileSetT fileSet() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + FileSetT fileSet() { return configSpace.pack(__FUNCTION__sr); } Future isRunnable(Reference tr) { return map(stateEnum().getD(tr), [](ERestoreState s) -> bool { @@ -458,8 +472,445 @@ Value makePadding(int size) { return pad.substr(0, size); } +struct IRangeFileWriter { +public: + virtual Future padEnd(bool final) = 0; + + virtual Future writeKV(Key k, Value v) = 0; + + virtual Future writeKey(Key k) = 0; + + virtual Future finish() = 0; + + virtual ~IRangeFileWriter() {} +}; + +struct 
SnapshotFileBackupEncryptionKeys { + Reference textCipherKey; + EncryptCipherDomainName textDomain; + Reference headerCipherKey; + StringRef ivRef; +}; + // File Format handlers. -// Both Range and Log formats are designed to be readable starting at any 1MB boundary +// Both Range and Log formats are designed to be readable starting at any BACKUP_RANGEFILE_BLOCK_SIZE boundary +// so they can be read in parallel. +// +// Writer instances must be kept alive while any member actors are in progress. +// +// EncryptedRangeFileWriter must be used as follows: +// 1 - writeKey(key) the queried key range begin +// 2 - writeKV(k, v) each kv pair to restore +// 3 - writeKey(key) the queried key range end +// 4 - finish() +// +// EncryptedRangeFileWriter will insert the required padding, header, and extra +// end/begin keys around the 1MB boundaries as needed. +// +// Example: +// The range a-z is queries and returns c-j which covers 3 blocks across 2 tenants. +// The client code writes keys in this sequence: +// t1a t1c t1d t1e t1f t1g t2h t2i t2j t2z +// +// H = header P = padding a...z = keys v = value | = block boundary +// +// Encoded file: H t1a t1cv t1dv t1ev P | H t1e t1ev t1fv t1gv t2 P | H t2 t2hv t2iv t2jv t2z +// Decoded in blocks yields: +// Block 1: range [t1a, t1e) with kv pairs t1cv, t1dv +// Block 2: range [t1e, t2) with kv pairs t1ev, t1fv, t1gv +// Block 3: range [t2, t2z) with kv pairs t2hv, t2iv, t2jv +// +// NOTE: All blocks except for the final block will have one last +// value which will not be used. This isn't actually a waste since +// if the next KV pair wouldn't fit within the block after the value +// then the space after the final key to the next 1MB boundary would +// just be padding anyway. +// +// NOTE: For the EncryptedRangeFileWriter blocks will be split either on the BACKUP_RANGEFILE_BLOCK_SIZE boundary or +// when a new tenant id is encountered. 
If a block is split for crossing tenant boundaries then the last key will be +// truncated to just the tenant prefix and the value will be empty (to avoid having sensitive data of one tenant be +// encrypted with a key for a different tenant) +struct EncryptedRangeFileWriter : public IRangeFileWriter { + struct Options { + constexpr static FileIdentifier file_identifier = 3152016; + + // TODO: Compression is not currently supported so this should always be false + bool compressionEnabled = false; + + Options() {} + + template + void serialize(Ar& ar) { + serializer(ar, compressionEnabled); + } + }; + + EncryptedRangeFileWriter(Database cx, + Arena* arena, + Reference> tenantCache, + Reference file = Reference(), + int blockSize = 0, + Options options = Options()) + : cx(cx), arena(arena), tenantCache(tenantCache), file(file), blockSize(blockSize), blockEnd(0), + fileVersion(BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION), options(options) { + buffer = makeString(blockSize); + wPtr = mutateString(buffer); + } + + static void validateEncryptionHeader(Reference headerCipherKey, + Reference textCipherKey, + BlobCipherEncryptHeader& header) { + // Validate encryption header 'cipherHeader' details + if (!(header.cipherHeaderDetails.baseCipherId == headerCipherKey->getBaseCipherId() && + header.cipherHeaderDetails.encryptDomainId == headerCipherKey->getDomainId() && + header.cipherHeaderDetails.salt == headerCipherKey->getSalt())) { + TraceEvent(SevWarn, "EncryptionHeader_CipherHeaderMismatch") + .detail("HeaderDomainId", headerCipherKey->getDomainId()) + .detail("ExpectedHeaderDomainId", header.cipherHeaderDetails.encryptDomainId) + .detail("HeaderBaseCipherId", headerCipherKey->getBaseCipherId()) + .detail("ExpectedHeaderBaseCipherId", header.cipherHeaderDetails.baseCipherId) + .detail("HeaderSalt", headerCipherKey->getSalt()) + .detail("ExpectedHeaderSalt", header.cipherHeaderDetails.salt); + throw encrypt_header_metadata_mismatch(); + } + + // Validate encryption text 'cipherText' details sanity + if (!(header.cipherTextDetails.baseCipherId == textCipherKey->getBaseCipherId() && + header.cipherTextDetails.encryptDomainId == textCipherKey->getDomainId() && + header.cipherTextDetails.salt == textCipherKey->getSalt())) { + TraceEvent(SevWarn, "EncryptionHeader_CipherTextMismatch") + .detail("TextDomainId", textCipherKey->getDomainId()) + .detail("ExpectedTextDomainId", header.cipherTextDetails.encryptDomainId) + .detail("TextBaseCipherId", textCipherKey->getBaseCipherId()) + .detail("ExpectedTextBaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("TextSalt", textCipherKey->getSalt()) + .detail("ExpectedTextSalt", header.cipherTextDetails.salt); + throw encrypt_header_metadata_mismatch(); + } + } + + ACTOR static Future decryptImpl(Database cx, + StringRef headerS, + const uint8_t* dataP, + int64_t dataLen, + Arena* arena) { + Reference const> dbInfo = cx->clientInfo; + state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS); + TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, header, BlobCipherMetrics::BACKUP)); + ASSERT(cipherKeys.cipherHeaderKey.isValid() && cipherKeys.cipherTextKey.isValid()); + validateEncryptionHeader(cipherKeys.cipherHeaderKey, cipherKeys.cipherTextKey, header); + DecryptBlobCipherAes256Ctr decryptor( + cipherKeys.cipherTextKey, cipherKeys.cipherHeaderKey, header.iv, BlobCipherMetrics::BACKUP); + return decryptor.decrypt(dataP, dataLen, header, *arena)->toStringRef(); + } + + static Future decrypt(Database cx, + 
StringRef headerS, + const uint8_t* dataP, + int64_t dataLen, + Arena* arena) { + return decryptImpl(cx, headerS, dataP, dataLen, arena); + } + + ACTOR static Future> refreshKey(EncryptedRangeFileWriter* self, + EncryptCipherDomainId domainId, + EncryptCipherDomainName domainName) { + Reference const> dbInfo = self->cx->clientInfo; + TextAndHeaderCipherKeys cipherKeys = + wait(getLatestEncryptCipherKeysForDomain(dbInfo, domainId, domainName, BlobCipherMetrics::BACKUP)); + return cipherKeys.cipherTextKey; + } + + ACTOR static Future encrypt(EncryptedRangeFileWriter* self) { + ASSERT(self->cipherKeys.headerCipherKey.isValid() && self->cipherKeys.textCipherKey.isValid()); + // Ensure that the keys we got are still valid before flushing the block + if (self->cipherKeys.headerCipherKey->isExpired() || self->cipherKeys.headerCipherKey->needsRefresh()) { + Reference cipherKey = + wait(refreshKey(self, self->cipherKeys.headerCipherKey->getDomainId(), FDB_ENCRYPT_HEADER_DOMAIN_NAME)); + self->cipherKeys.headerCipherKey = cipherKey; + } + if (self->cipherKeys.textCipherKey->isExpired() || self->cipherKeys.textCipherKey->needsRefresh()) { + Reference cipherKey = + wait(refreshKey(self, self->cipherKeys.textCipherKey->getDomainId(), self->cipherKeys.textDomain)); + self->cipherKeys.textCipherKey = cipherKey; + } + EncryptBlobCipherAes265Ctr encryptor(self->cipherKeys.textCipherKey, + self->cipherKeys.headerCipherKey, + self->cipherKeys.ivRef.begin(), + AES_256_IV_LENGTH, + ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE, + BlobCipherMetrics::BACKUP); + Arena arena; + int64_t payloadSize = self->wPtr - self->dataPayloadStart; + auto encryptedData = encryptor.encrypt(self->dataPayloadStart, payloadSize, self->encryptHeader, arena); + + // re-write encrypted data to buffer + std::memcpy(self->dataPayloadStart, encryptedData->begin(), payloadSize); + return Void(); + } + + ACTOR static Future updateEncryptionKeysCtx(EncryptedRangeFileWriter* self, + KeyRef key, + Reference> cache) { + state std::pair curTenantInfo = wait(getEncryptionDomainDetails(key, cache)); + state Reference const> dbInfo = self->cx->clientInfo; + + // Get text and header cipher key + TextAndHeaderCipherKeys textAndHeaderCipherKeys = wait(getLatestEncryptCipherKeysForDomain( + dbInfo, curTenantInfo.first, curTenantInfo.second, BlobCipherMetrics::BACKUP)); + self->cipherKeys.textCipherKey = textAndHeaderCipherKeys.cipherTextKey; + self->cipherKeys.textDomain = curTenantInfo.second; + self->cipherKeys.headerCipherKey = textAndHeaderCipherKeys.cipherHeaderKey; + + // Set ivRef + self->cipherKeys.ivRef = makeString(AES_256_IV_LENGTH, *self->arena); + deterministicRandom()->randomBytes(mutateString(self->cipherKeys.ivRef), AES_256_IV_LENGTH); + return Void(); + } + + // Returns the number of bytes that have been written to the buffer + static int64_t currentBufferSize(EncryptedRangeFileWriter* self) { return self->wPtr - self->buffer.begin(); } + + static int64_t expectedFileSize(EncryptedRangeFileWriter* self) { + // Return what has already been written to file plus the size of the current buffer + // which indicates how many bytes the file will contain once the buffer is written + return self->file->size() + currentBufferSize(self); + } + + static void copyToBuffer(EncryptedRangeFileWriter* self, const void* src, size_t size) { + if (size > 0) { + std::memcpy(self->wPtr, src, size); + self->wPtr += size; + ASSERT(currentBufferSize(self) <= self->blockSize); + } + } + + static void appendStringRefWithLenToBuffer(EncryptedRangeFileWriter* self, 
StringRef* s) { + // Append the string length followed by the string to the buffer + uint32_t lenBuf = bigEndian32((uint32_t)s->size()); + copyToBuffer(self, &lenBuf, sizeof(lenBuf)); + copyToBuffer(self, s->begin(), s->size()); + } + + static bool isSystemKey(KeyRef key) { return key.size() && key[0] == systemKeys.begin[0]; } + + ACTOR static Future> getEncryptionDomainDetailsImpl( + KeyRef key, + Reference> tenantCache) { + if (isSystemKey(key)) { + return std::make_pair(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME); + } + if (key.size() < TENANT_PREFIX_SIZE) { + return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME); + } + KeyRef tenantPrefix = KeyRef(key.begin(), TENANT_PREFIX_SIZE); + state int64_t tenantId = TenantMapEntry::prefixToId(tenantPrefix); + Optional> payload = wait(tenantCache->getById(tenantId)); + if (payload.present()) { + return std::make_pair(tenantId, payload.get().name); + } + return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME); + } + + static Future> getEncryptionDomainDetails( + KeyRef key, + Reference> tenantCache) { + return getEncryptionDomainDetailsImpl(key, tenantCache); + } + + // Handles the first block and internal blocks. Ends current block if needed. + // The final flag is used in simulation to pad the file's final block to a whole block size + ACTOR static Future newBlock(EncryptedRangeFileWriter* self, + int bytesNeeded, + KeyRef lastKey, + bool writeValue, + bool final = false) { + // Write padding to finish current block if needed + int bytesLeft = self->blockEnd - expectedFileSize(self); + ASSERT(bytesLeft >= 0); + if (bytesLeft > 0) { + state Value paddingFFs = makePadding(bytesLeft); + copyToBuffer(self, paddingFFs.begin(), bytesLeft); + } + + if (expectedFileSize(self) > 0) { + // write buffer to file since block is finished + ASSERT(currentBufferSize(self) == self->blockSize); + wait(encrypt(self)); + wait(self->file->append(self->buffer.begin(), self->blockSize)); + + // reset write pointer to beginning of StringRef + self->wPtr = mutateString(self->buffer); + } + + if (final) { + ASSERT(g_network->isSimulated()); + return Void(); + } + + // Set new blockEnd + self->blockEnd += self->blockSize; + + // write Header + copyToBuffer(self, (uint8_t*)&self->fileVersion, sizeof(self->fileVersion)); + + // write options struct + Value serialized = + ObjectWriter::toValue(self->options, IncludeVersion(ProtocolVersion::withEncryptedSnapshotBackupFile())); + appendStringRefWithLenToBuffer(self, &serialized); + + // leave space for encryption header + self->encryptHeader = (BlobCipherEncryptHeader*)self->wPtr; + self->wPtr += BlobCipherEncryptHeader::headerSize; + self->dataPayloadStart = self->wPtr; + + // If this is NOT the first block then write duplicate stuff needed from last block + if (self->blockEnd > self->blockSize) { + appendStringRefWithLenToBuffer(self, &lastKey); + appendStringRefWithLenToBuffer(self, &self->lastKey); + if (writeValue) { + appendStringRefWithLenToBuffer(self, &self->lastValue); + } + } + + // There must now be room in the current block for bytesNeeded or the block size is too small + if (expectedFileSize(self) + bytesNeeded > self->blockEnd) { + throw backup_bad_block_size(); + } + + return Void(); + } + + Future padEnd(bool final) { + if (expectedFileSize(this) > 0) { + return newBlock(this, 0, StringRef(), true, final); + } + return Void(); + } + + // Ends the current block if necessary based on bytesNeeded. 
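getEncryptionDomainDetailsImpl() above chooses the encryption domain for a key in three steps: system keyspace keys use the system domain, keys too short to carry a tenant prefix use the default domain, and everything else uses the tenant id decoded from the key's prefix, falling back to the default domain when the tenant cache does not know the id. A standalone sketch of that decision; the constants and prefix width below are placeholders standing in for the real values from EncryptUtils.h and Tenant.h.

```cpp
#include <cstddef>
#include <cstdint>
#include <string>

// Placeholder constants; the real ids and TENANT_PREFIX_SIZE come from FDB headers.
constexpr int64_t kSystemDomainId = -2;
constexpr int64_t kDefaultDomainId = -1;
constexpr size_t kTenantPrefixSize = 8; // tenant prefix is a fixed-width encoded id

int64_t pickEncryptDomain(const std::string& key, bool tenantKnown, int64_t tenantId) {
    // 1. Anything in the system keyspace ('\xff'...) uses the system domain.
    if (!key.empty() && static_cast<unsigned char>(key[0]) == 0xFF)
        return kSystemDomainId;
    // 2. Keys shorter than a tenant prefix cannot belong to a tenant.
    if (key.size() < kTenantPrefixSize)
        return kDefaultDomainId;
    // 3. Otherwise use the tenant decoded from the prefix, if the cache knows it.
    return tenantKnown ? tenantId : kDefaultDomainId;
}
```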
+ ACTOR static Future newBlockIfNeeded(EncryptedRangeFileWriter* self, int bytesNeeded) { + if (expectedFileSize(self) + bytesNeeded > self->blockEnd) { + wait(newBlock(self, bytesNeeded, self->lastKey, true)); + } + return Void(); + } + + ACTOR static Future handleTenantBondary(EncryptedRangeFileWriter* self, + Key k, + Value v, + bool writeValue, + std::pair curKeyTenantInfo) { + state KeyRef endKey = k; + // If we are crossing a boundary with a key that has a tenant prefix then truncate it + if (curKeyTenantInfo.first != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID && + curKeyTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) { + endKey = StringRef(k.begin(), TENANT_PREFIX_SIZE); + } + state ValueRef newValue = StringRef(); + self->lastKey = k; + self->lastValue = v; + appendStringRefWithLenToBuffer(self, &endKey); + appendStringRefWithLenToBuffer(self, &newValue); + wait(newBlock(self, 0, endKey, writeValue)); + wait(updateEncryptionKeysCtx(self, self->lastKey, self->tenantCache)); + return Void(); + } + + ACTOR static Future finishCurTenantBlockStartNewIfNeeded(EncryptedRangeFileWriter* self, + Key k, + Value v, + bool writeValue) { + // Don't want to start a new block if the current key or previous key is empty + if (self->lastKey.size() == 0 || k.size() == 0) { + return false; + } + state std::pair curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self->tenantCache)); + state std::pair prevKeyTenantInfo = + wait(getEncryptionDomainDetails(self->lastKey, self->tenantCache)); + // crossing tenant boundaries so finish the current block using only the tenant prefix of the new key + if (curKeyTenantInfo.first != prevKeyTenantInfo.first) { + CODE_PROBE(true, "crossed tenant boundaries"); + wait(handleTenantBondary(self, k, v, writeValue, curKeyTenantInfo)); + return true; + } + return false; + } + + // Start a new block if needed, then write the key and value + ACTOR static Future writeKV_impl(EncryptedRangeFileWriter* self, Key k, Value v) { + if (!self->cipherKeys.headerCipherKey.isValid() || !self->cipherKeys.textCipherKey.isValid()) { + wait(updateEncryptionKeysCtx(self, k, self->tenantCache)); + } + state int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size(); + wait(newBlockIfNeeded(self, toWrite)); + bool createdNewBlock = wait(finishCurTenantBlockStartNewIfNeeded(self, k, v, true)); + if (createdNewBlock) { + return Void(); + } + appendStringRefWithLenToBuffer(self, &k); + appendStringRefWithLenToBuffer(self, &v); + self->lastKey = k; + self->lastValue = v; + return Void(); + } + + Future writeKV(Key k, Value v) { return writeKV_impl(this, k, v); } + + // Write begin key or end key. + ACTOR static Future writeKey_impl(EncryptedRangeFileWriter* self, Key k) { + // TODO (Nim): Is it possible to write empty begin and end keys? 
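Tying the writer methods above together, the write path follows the four-step contract documented at the top of this hunk: begin key, kv pairs, end key, finish. The following is flow-style pseudocode driving any IRangeFileWriter through that sequence; the surrounding actor and type boilerplate is assumed rather than taken from the patch.

```cpp
// Sketch only: drives an IRangeFileWriter through the documented 4-step sequence.
ACTOR static Future<Void> writeQueriedRange(IRangeFileWriter* writer,
                                            Standalone<VectorRef<KeyValueRef>> kvs,
                                            Key begin,
                                            Key end) {
    wait(writer->writeKey(begin)); // 1 - queried range begin
    state int i = 0;
    for (; i < kvs.size(); ++i) {
        wait(writer->writeKV(kvs[i].key, kvs[i].value)); // 2 - each kv pair to back up
    }
    wait(writer->writeKey(end)); // 3 - queried range end
    wait(writer->finish()); // 4 - flush any buffered (and, if enabled, encrypted) bytes
    return Void();
}
```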
+ if (k.size() > 0 && + (!self->cipherKeys.headerCipherKey.isValid() || !self->cipherKeys.textCipherKey.isValid())) { + wait(updateEncryptionKeysCtx(self, k, self->tenantCache)); + } + + // Need to account for extra "empty" value being written in the case of crossing tenant boundaries + int toWrite = sizeof(uint32_t) + k.size() + sizeof(uint32_t); + wait(newBlockIfNeeded(self, toWrite)); + bool createdNewBlock = wait(finishCurTenantBlockStartNewIfNeeded(self, k, StringRef(), false)); + if (createdNewBlock) { + return Void(); + } + appendStringRefWithLenToBuffer(self, &k); + self->lastKey = k; + return Void(); + } + + Future writeKey(Key k) { return writeKey_impl(this, k); } + + ACTOR static Future finish_impl(EncryptedRangeFileWriter* self) { + // Write any outstanding bytes to the file + if (currentBufferSize(self) > 0) { + wait(encrypt(self)); + wait(self->file->append(self->buffer.begin(), currentBufferSize(self))); + } + return Void(); + } + + Future finish() { return finish_impl(this); } + + Database cx; + Arena* arena; + Reference> tenantCache; + Reference file; + int blockSize; + +private: + Standalone buffer; + uint8_t* wPtr; + BlobCipherEncryptHeader* encryptHeader; + uint8_t* dataPayloadStart; + int64_t blockEnd; + uint32_t fileVersion; + Options options; + Key lastKey; + Key lastValue; + SnapshotFileBackupEncryptionKeys cipherKeys; +}; + +// File Format handlers. +// Both Range and Log formats are designed to be readable starting at any BACKUP_RANGEFILE_BLOCK_SIZE boundary // so they can be read in parallel. // // Writer instances must be kept alive while any member actors are in progress. @@ -468,6 +919,7 @@ Value makePadding(int size) { // 1 - writeKey(key) the queried key range begin // 2 - writeKV(k, v) each kv pair to restore // 3 - writeKey(key) the queried key range end +// 4 - finish() // // RangeFileWriter will insert the required padding, header, and extra // end/begin keys around the 1MB boundaries as needed. @@ -490,7 +942,7 @@ Value makePadding(int size) { // if the next KV pair wouldn't fit within the block after the value // then the space after the final key to the next 1MB boundary would // just be padding anyway. -struct RangeFileWriter { +struct RangeFileWriter : public IRangeFileWriter { RangeFileWriter(Reference file = Reference(), int blockSize = 0) : file(file), blockSize(blockSize), blockEnd(0), fileVersion(BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {} @@ -530,10 +982,10 @@ struct RangeFileWriter { } // Used in simulation only to create backup file sizes which are an integer multiple of the block size - Future padEnd() { + Future padEnd(bool final) { ASSERT(g_network->isSimulated()); if (file->size() > 0) { - return newBlock(this, 0, true); + return newBlock(this, 0, final); } return Void(); } @@ -568,6 +1020,8 @@ struct RangeFileWriter { Future writeKey(Key k) { return writeKey_impl(this, k); } + Future finish() { return Void(); } + Reference file; int blockSize; @@ -578,9 +1032,49 @@ private: Key lastValue; }; +ACTOR static Future decodeKVPairs(StringRefReader* reader, + Standalone>* results, + bool encryptedBlock, + Optional cx, + Reference> tenantCache) { + // Read begin key, if this fails then block was invalid. + state uint32_t kLen = reader->consumeNetworkUInt32(); + state const uint8_t* k = reader->consume(kLen); + results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); + + // Read kv pairs and end key + while (1) { + // Read a key. 
+ kLen = reader->consumeNetworkUInt32(); + k = reader->consume(kLen); + + // If eof reached or first value len byte is 0xFF then a valid block end was reached. + if (reader->eof() || *reader->rptr == 0xFF) { + results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); + break; + } + + // Read a value, which must exist or the block is invalid + uint32_t vLen = reader->consumeNetworkUInt32(); + const uint8_t* v = reader->consume(vLen); + results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen))); + + // If eof reached or first byte of next key len is 0xFF then a valid block end was reached. + if (reader->eof() || *reader->rptr == 0xFF) + break; + } + + // Make sure any remaining bytes in the block are 0xFF + for (auto b : reader->remainder()) + if (b != 0xFF) + throw restore_corrupted_data_padding(); + return Void(); +} + ACTOR Future>> decodeRangeFileBlock(Reference file, int64_t offset, - int len) { + int len, + Optional cx) { state Standalone buf = makeString(len); int rLen = wait(file->read(mutateString(buf), len, offset)); if (rLen != len) @@ -588,48 +1082,44 @@ ACTOR Future>> decodeRangeFileBlock(Reference< simulateBlobFailure(); - Standalone> results({}, buf.arena()); + state Standalone> results({}, buf.arena()); state StringRefReader reader(buf, restore_corrupted_data()); + state Arena arena; try { - // Read header, currently only decoding BACKUP_AGENT_SNAPSHOT_FILE_VERSION - if (reader.consume() != BACKUP_AGENT_SNAPSHOT_FILE_VERSION) + // Read header, currently only decoding BACKUP_AGENT_SNAPSHOT_FILE_VERSION or + // BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION + int32_t file_version = reader.consume(); + if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) { + wait(decodeKVPairs(&reader, &results, false, cx, Reference>())); + } else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) { + CODE_PROBE(true, "decoding encrypted block"); + ASSERT(cx.present()); + // decode options struct + uint32_t optionsLen = reader.consumeNetworkUInt32(); + const uint8_t* o = reader.consume(optionsLen); + StringRef optionsStringRef = StringRef(o, optionsLen); + EncryptedRangeFileWriter::Options options = + ObjectReader::fromStringRef(optionsStringRef, IncludeVersion()); + ASSERT(!options.compressionEnabled); + + // read encryption header + const uint8_t* headerStart = reader.consume(BlobCipherEncryptHeader::headerSize); + StringRef header = StringRef(headerStart, BlobCipherEncryptHeader::headerSize); + const uint8_t* dataPayloadStart = headerStart + BlobCipherEncryptHeader::headerSize; + // calculate the total bytes read up to (and including) the header + int64_t bytesRead = sizeof(int32_t) + sizeof(uint32_t) + optionsLen + BlobCipherEncryptHeader::headerSize; + // get the size of the encrypted payload and decrypt it + int64_t dataLen = len - bytesRead; + StringRef decryptedData = + wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena())); + reader = StringRefReader(decryptedData, restore_corrupted_data()); + Reference> tenantCache = makeReference>(cx.get()); + wait(decodeKVPairs(&reader, &results, true, cx, tenantCache)); + } else { throw restore_unsupported_file_version(); - - // Read begin key, if this fails then block was invalid. - uint32_t kLen = reader.consumeNetworkUInt32(); - const uint8_t* k = reader.consume(kLen); - results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); - - // Read kv pairs and end key - while (1) { - // Read a key. 
- kLen = reader.consumeNetworkUInt32(); - k = reader.consume(kLen); - - // If eof reached or first value len byte is 0xFF then a valid block end was reached. - if (reader.eof() || *reader.rptr == 0xFF) { - results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); - break; - } - - // Read a value, which must exist or the block is invalid - uint32_t vLen = reader.consumeNetworkUInt32(); - const uint8_t* v = reader.consume(vLen); - results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen))); - - // If eof reached or first byte of next key len is 0xFF then a valid block end was reached. - if (reader.eof() || *reader.rptr == 0xFF) - break; } - - // Make sure any remaining bytes in the block are 0xFF - for (auto b : reader.remainder()) - if (b != 0xFF) - throw restore_corrupted_data_padding(); - return results; - } catch (Error& e) { TraceEvent(SevWarn, "FileRestoreDecodeRangeFileBlockFailed") .error(e) @@ -775,8 +1265,7 @@ ACTOR static Future abortFiveZeroBackup(FileBackupAgent* backupAgent, state Subspace statusSpace = backupAgent->subspace.get(BackupAgentBase::keyStates).get(uid.toString()); state Subspace globalConfig = backupAgent->subspace.get(BackupAgentBase::keyConfig).get(uid.toString()); - state Subspace newConfigSpace = - uidPrefixKey(LiteralStringRef("uid->config/").withPrefix(fileBackupPrefixRange.begin), uid); + state Subspace newConfigSpace = uidPrefixKey("uid->config/"_sr.withPrefix(fileBackupPrefixRange.begin), uid); Optional statusStr = wait(tr->get(statusSpace.pack(FileBackupAgent::keyStateStatus))); state EBackupState status = @@ -816,8 +1305,6 @@ struct AbortFiveZeroBackupTask : TaskFuncBase { state FileBackupAgent backupAgent; state std::string tagName = task->params[BackupAgentBase::keyConfigBackupTag].toString(); - CODE_PROBE(true, "Canceling old backup task"); - TraceEvent(SevInfo, "FileBackupCancelOldTask") .detail("Task", task->params[Task::reservedTaskParamKeyType]) .detail("TagName", tagName); @@ -847,7 +1334,7 @@ struct AbortFiveZeroBackupTask : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef AbortFiveZeroBackupTask::name = LiteralStringRef("abort_legacy_backup"); +StringRef AbortFiveZeroBackupTask::name = "abort_legacy_backup"_sr; REGISTER_TASKFUNC(AbortFiveZeroBackupTask); REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_diff_logs); REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_log_range); @@ -902,8 +1389,6 @@ struct AbortFiveOneBackupTask : TaskFuncBase { state BackupConfig config(task); state std::string tagName = wait(config.tag().getOrThrow(tr)); - CODE_PROBE(true, "Canceling 5.1 backup task"); - TraceEvent(SevInfo, "FileBackupCancelFiveOneTask") .detail("Task", task->params[Task::reservedTaskParamKeyType]) .detail("TagName", tagName); @@ -933,7 +1418,7 @@ struct AbortFiveOneBackupTask : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef AbortFiveOneBackupTask::name = LiteralStringRef("abort_legacy_backup_5.2"); +StringRef AbortFiveOneBackupTask::name = "abort_legacy_backup_5.2"_sr; REGISTER_TASKFUNC(AbortFiveOneBackupTask); REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_range); REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_dispatch_ranges); @@ -972,7 +1457,7 @@ ACTOR static Future addBackupTask(StringRef name, } wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } // Clears the backup ID from "backupStartedKey" to pause backup workers. 
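The decodeKVPairs()/decodeRangeFileBlock() changes above preserve the original record layout inside each block: a 32-bit big-endian length followed by the bytes, repeated for keys and values, with 0xFF bytes padding the block out to the next boundary. Below is a standalone sketch of reading that layout; the helper is hypothetical and not FDB's StringRefReader.

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// Reads one "len32 (big-endian) + bytes" record from buf at pos; advances pos.
static std::string readLenPrefixed(const std::vector<uint8_t>& buf, size_t& pos) {
    if (pos + 4 > buf.size())
        throw std::runtime_error("corrupt block");
    uint32_t len = (uint32_t(buf[pos]) << 24) | (uint32_t(buf[pos + 1]) << 16) |
                   (uint32_t(buf[pos + 2]) << 8) | uint32_t(buf[pos + 3]);
    pos += 4;
    if (pos + len > buf.size())
        throw std::runtime_error("corrupt block");
    std::string out(buf.begin() + pos, buf.begin() + pos + len);
    pos += len;
    return out;
}

// Decodes begin key, kv pairs, and end key until end of buffer or a 0xFF padding
// byte is seen, mirroring the block structure described in the patch comments.
static std::vector<std::pair<std::string, std::string>> decodeBlockPayload(
    const std::vector<uint8_t>& buf) {
    std::vector<std::pair<std::string, std::string>> kvs;
    size_t pos = 0;
    kvs.emplace_back(readLenPrefixed(buf, pos), ""); // begin key, no value
    while (true) {
        std::string k = readLenPrefixed(buf, pos);
        if (pos >= buf.size() || buf[pos] == 0xFF) { // end key reached
            kvs.emplace_back(std::move(k), "");
            break;
        }
        kvs.emplace_back(std::move(k), readLenPrefixed(buf, pos));
        if (pos >= buf.size() || buf[pos] == 0xFF)
            break;
    }
    // Everything after the last record must be 0xFF padding.
    for (; pos < buf.size(); ++pos)
        if (buf[pos] != 0xFF)
            throw std::runtime_error("corrupt padding");
    return kvs;
}
```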
@@ -1044,9 +1529,9 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam beginKey() { return LiteralStringRef(__FUNCTION__); } - static TaskParam endKey() { return LiteralStringRef(__FUNCTION__); } - static TaskParam addBackupRangeTasks() { return LiteralStringRef(__FUNCTION__); } + static TaskParam beginKey() { return __FUNCTION__sr; } + static TaskParam endKey() { return __FUNCTION__sr; } + static TaskParam addBackupRangeTasks() { return __FUNCTION__sr; } } Params; std::string toString(Reference task) const override { @@ -1071,8 +1556,8 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { return _finish(tr, tb, fb, task); }; - // Finish (which flushes/syncs) the file, and then in a single transaction, make some range backup progress durable. - // This means: + // Finish (which flushes/syncs) the file, and then in a single transaction, make some range backup progress + // durable. This means: // - increment the backup config's range bytes written // - update the range file map // - update the task begin key @@ -1180,8 +1665,8 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { .detail("EndKey", Params.endKey().get(task).printable()) .detail("TaskKey", task->key.printable()); - // When a key range task saves the last chunk of progress and then the executor dies, when the task continues - // its beginKey and endKey will be equal but there is no work to be done. + // When a key range task saves the last chunk of progress and then the executor dies, when the task + // continues its beginKey and endKey will be equal but there is no work to be done. if (beginKey == endKey) return Void(); @@ -1194,8 +1679,8 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { } // Read everything from beginKey to endKey, write it to an output file, run the output file processor, and - // then set on_done. If we are still writing after X seconds, end the output file and insert a new backup_range - // task for the remainder. + // then set on_done. If we are still writing after X seconds, end the output file and insert a new + // backup_range task for the remainder. state Reference outFile; state Version outVersion = invalidVersion; state Key lastKey; @@ -1210,11 +1695,13 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { Terminator::True, AccessSystemKeys::True, LockAware::True); - state RangeFileWriter rangeFile; + state std::unique_ptr rangeFile; state BackupConfig backup(task); + state Arena arena; + state Reference> tenantCache = makeReference>(cx); - // Don't need to check keepRunning(task) here because we will do that while finishing each output file, but if - // bc is false then clearly the backup is no longer in progress + // Don't need to check keepRunning(task) here because we will do that while finishing each output file, but + // if bc is false then clearly the backup is no longer in progress state Reference bc = wait(backup.backupContainer().getD(cx.getReference())); if (!bc) { return Void(); @@ -1222,6 +1709,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { state bool done = false; state int64_t nrKeys = 0; + state bool encryptionEnabled = false; loop { state RangeResultWithVersion values; @@ -1236,17 +1724,20 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { throw; } - // If we've seen a new read version OR hit the end of the stream, then if we were writing a file finish it. + // If we've seen a new read version OR hit the end of the stream, then if we were writing a file finish + // it. 
if (values.second != outVersion || done) { if (outFile) { CODE_PROBE(outVersion != invalidVersion, "Backup range task wrote multiple versions"); state Key nextKey = done ? endKey : keyAfter(lastKey); - wait(rangeFile.writeKey(nextKey)); + wait(rangeFile->writeKey(nextKey)); if (BUGGIFY) { - wait(rangeFile.padEnd()); + wait(rangeFile->padEnd(true)); } + wait(rangeFile->finish()); + bool usedFile = wait( finishRangeFile(outFile, cx, task, taskBucket, KeyRangeRef(beginKey, nextKey), outVersion)); TraceEvent("FileBackupWroteRangeFile") @@ -1269,8 +1760,8 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { // Start writing a new file after verifying this task should keep running as of a new read version // (which must be >= outVersion) outVersion = values.second; - // block size must be at least large enough for 3 max size keys and 2 max size values + overhead so 250k - // conservatively. + // block size must be at least large enough for 3 max size keys and 2 max size values + overhead so + // 250k conservatively. state int blockSize = BUGGIFY ? deterministicRandom()->randomInt(250e3, 4e6) : CLIENT_KNOBS->BACKUP_RANGEFILE_BLOCK_SIZE; state Version snapshotBeginVersion; @@ -1284,6 +1775,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { wait(taskBucket->keepRunning(tr, task) && storeOrThrow(snapshotBeginVersion, backup.snapshotBeginVersion().get(tr)) && + storeOrThrow(encryptionEnabled, backup.enableSnapshotBackupEncryption().get(tr)) && store(snapshotRangeFileCount, backup.snapshotRangeFileCount().getD(tr))); break; @@ -1296,16 +1788,22 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { wait(bc->writeRangeFile(snapshotBeginVersion, snapshotRangeFileCount, outVersion, blockSize)); outFile = f; + encryptionEnabled = encryptionEnabled && cx->clientInfo->get().isEncryptionEnabled; // Initialize range file writer and write begin key - rangeFile = RangeFileWriter(outFile, blockSize); - wait(rangeFile.writeKey(beginKey)); + if (encryptionEnabled) { + CODE_PROBE(true, "using encrypted snapshot file writer"); + rangeFile = std::make_unique(cx, &arena, tenantCache, outFile, blockSize); + } else { + rangeFile = std::make_unique(outFile, blockSize); + } + wait(rangeFile->writeKey(beginKey)); } // write kvData to file, update lastKey and key count if (values.first.size() != 0) { state size_t i = 0; for (; i < values.first.size(); ++i) { - wait(rangeFile.writeKV(values.first[i].key, values.first[i].value)); + wait(rangeFile->writeKV(values.first[i].key, values.first[i].value)); } lastKey = values.first.back().key; nrKeys += values.first.size(); @@ -1370,7 +1868,6 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { Reference futureBucket, Reference task) { state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); - if (Params.addBackupRangeTasks().get(task)) { wait(startBackupRangeInternal(tr, taskBucket, futureBucket, task, taskFuture)); } else { @@ -1389,7 +1886,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { return Void(); } }; -StringRef BackupRangeTaskFunc::name = LiteralStringRef("file_backup_write_range_5.2"); +StringRef BackupRangeTaskFunc::name = "file_backup_write_range_5.2"_sr; REGISTER_TASKFUNC(BackupRangeTaskFunc); struct BackupSnapshotDispatchTask : BackupTaskFuncBase { @@ -1398,11 +1895,11 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { static struct { // Set by Execute, used by Finish - static TaskParam shardsBehind() { return LiteralStringRef(__FUNCTION__); } + static TaskParam shardsBehind() { return __FUNCTION__sr; } // Set 
by Execute, used by Finish - static TaskParam snapshotFinished() { return LiteralStringRef(__FUNCTION__); } + static TaskParam snapshotFinished() { return __FUNCTION__sr; } // Set by Execute, used by Finish - static TaskParam nextDispatchVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam nextDispatchVersion() { return __FUNCTION__sr; } } Params; StringRef getName() const override { return name; }; @@ -1455,12 +1952,12 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { state double startTime = timer(); state Reference tr(new ReadYourWritesTransaction(cx)); - // The shard map will use 3 values classes. Exactly SKIP, exactly DONE, then any number >= NOT_DONE_MIN which - // will mean not done. This is to enable an efficient coalesce() call to squash adjacent ranges which are not - // yet finished to enable efficiently finding random database shards which are not done. + // The shard map will use 3 values classes. Exactly SKIP, exactly DONE, then any number >= NOT_DONE_MIN + // which will mean not done. This is to enable an efficient coalesce() call to squash adjacent ranges which + // are not yet finished to enable efficiently finding random database shards which are not done. state int notDoneSequence = NOT_DONE_MIN; - state KeyRangeMap shardMap(notDoneSequence++, normalKeys.end); - state Key beginKey = normalKeys.begin; + state KeyRangeMap shardMap(notDoneSequence++); + state Key beginKey = allKeys.begin; // Read all shard boundaries and add them to the map loop { @@ -1469,7 +1966,7 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { tr->setOption(FDBTransactionOptions::LOCK_AWARE); state Future>> shardBoundaries = - getBlockOfShards(tr, beginKey, normalKeys.end, CLIENT_KNOBS->TOO_MANY); + getBlockOfShards(tr, beginKey, allKeys.end, CLIENT_KNOBS->TOO_MANY); wait(success(shardBoundaries) && taskBucket->keepRunning(tr, task)); if (shardBoundaries.get().size() == 0) @@ -1514,7 +2011,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { store(latestSnapshotEndVersion, config.latestSnapshotEndVersion().get(tr)) && store(recentReadVersion, tr->getReadVersion()) && taskBucket->keepRunning(tr, task)); - // If the snapshot batch future key does not exist, this is the first execution of this dispatch task so + // If the snapshot batch future key does not exist, this is the first execution of this dispatch + // task so // - create and set the snapshot batch future key // - initialize the batch size to 0 // - initialize the target snapshot end version if it is not yet set @@ -1526,7 +2024,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { config.snapshotBatchSize().set(tr, snapshotBatchSize.get()); // The dispatch of this batch can take multiple separate executions if the executor fails - // so store a completion key for the dispatch finish() to set when dispatching the batch is done. + // so store a completion key for the dispatch finish() to set when dispatching the batch is + // done. state TaskCompletionKey dispatchCompletionKey = TaskCompletionKey::joinWith(snapshotBatchFuture); // this is a bad hack - but flow doesn't work well with lambda functions and caputring // state variables... 
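The dispatch logic above partitions the keyspace into a shard map whose values fall into three classes (exactly SKIP, exactly DONE, and anything >= NOT_DONE_MIN meaning not done), then coalesces adjacent not-done ranges so that random shards still needing a snapshot can be found cheaply. A minimal standalone sketch of that idea follows, using a std::map boundary map in place of KeyRangeMap; the names ShardMap and coalesceNotDone and the concrete SKIP/DONE/NOT_DONE_MIN values are assumptions of the sketch, not the real constants.

#include <cassert>
#include <iterator>
#include <map>
#include <string>

enum : int { SKIP = 0, DONE = 1, NOT_DONE_MIN = 2 };

using ShardMap = std::map<std::string, int>; // begin key -> value for [key, next begin key)

// Merge adjacent ranges that are both "not done" (value >= NOT_DONE_MIN) so that
// randomly picking a not-done shard later has fewer entries to scan. SKIP and DONE
// entries are left untouched.
void coalesceNotDone(ShardMap& m) {
    for (auto it = m.begin(); it != m.end();) {
        auto next = std::next(it);
        if (next != m.end() && it->second >= NOT_DONE_MIN && next->second >= NOT_DONE_MIN)
            m.erase(next); // the next range merges into the current one
        else
            ++it;
    }
}

int main() {
    ShardMap m;
    // Give each undone shard its own sequence number (>= NOT_DONE_MIN) so ranges do
    // not merge until we explicitly coalesce them.
    int notDoneSequence = NOT_DONE_MIN;
    for (std::string k : { "a", "b", "c", "d" })
        m[k] = notDoneSequence++; // shards [a,b) [b,c) [c,d) [d,end)
    m["c"] = DONE;                // one shard already dispatched
    coalesceNotDone(m);
    // [a,b) and [b,c) collapse into one not-done range; the DONE shard survives.
    assert(m.size() == 3 && m.count("b") == 0);
    return 0;
}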
@@ -1552,7 +2051,7 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { // Read all dispatched ranges state std::vector> dispatchBoundaries; tr->reset(); - beginKey = normalKeys.begin; + beginKey = allKeys.begin; loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); @@ -1560,7 +2059,7 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { state Future bounds = config.snapshotRangeDispatchMap().getRange( - tr, beginKey, keyAfter(normalKeys.end), CLIENT_KNOBS->TOO_MANY); + tr, beginKey, keyAfter(allKeys.end), CLIENT_KNOBS->TOO_MANY); wait(success(bounds) && taskBucket->keepRunning(tr, task) && store(recentReadVersion, tr->getReadVersion())); @@ -1601,8 +2100,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { // If this was the end of a dispatched range if (!boundary.second) { - // Ensure that the dispatched boundaries exist AND set all shard ranges in the dispatched range to - // DONE. + // Ensure that the dispatched boundaries exist AND set all shard ranges in the dispatched range + // to DONE. RangeMap::Ranges shardRanges = shardMap.modify(KeyRangeRef(lastKey, boundary.first)); iShard = shardRanges.begin(); @@ -1623,7 +2122,7 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { // Set anything outside the backup ranges to SKIP. We can use insert() here instead of modify() // because it's OK to delete shard boundaries in the skipped ranges. if (backupRanges.size() > 0) { - shardMap.insert(KeyRangeRef(normalKeys.begin, backupRanges.front().begin), SKIP); + shardMap.insert(KeyRangeRef(allKeys.begin, backupRanges.front().begin), SKIP); wait(yield()); for (i = 0; i < backupRanges.size() - 1; ++i) { @@ -1631,7 +2130,7 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { wait(yield()); } - shardMap.insert(KeyRangeRef(backupRanges.back().end, normalKeys.end), SKIP); + shardMap.insert(KeyRangeRef(backupRanges.back().end, allKeys.end), SKIP); wait(yield()); } @@ -1652,7 +2151,7 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { } // Coalesce the shard map to make random selection below more efficient. - shardMap.coalesce(normalKeys); + shardMap.coalesce(allKeys); wait(yield()); // In this context "all" refers to all of the shards relevant for this particular backup @@ -1684,10 +2183,10 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { nextDispatchVersion = recentReadVersion + CLIENT_KNOBS->CORE_VERSIONSPERSECOND * CLIENT_KNOBS->BACKUP_SNAPSHOT_DISPATCH_INTERVAL_SEC; - // If nextDispatchVersion is greater than snapshotTargetEndVersion (which could be in the past) then just use - // the greater of recentReadVersion or snapshotTargetEndVersion. Any range tasks created in this dispatch will - // be scheduled at a random time between recentReadVersion and nextDispatchVersion, - // so nextDispatchVersion shouldn't be less than recentReadVersion. + // If nextDispatchVersion is greater than snapshotTargetEndVersion (which could be in the past) then just + // use the greater of recentReadVersion or snapshotTargetEndVersion. Any range tasks created in this + // dispatch will be scheduled at a random time between recentReadVersion and nextDispatchVersion, so + // nextDispatchVersion shouldn't be less than recentReadVersion. 
if (nextDispatchVersion > snapshotTargetEndVersion) nextDispatchVersion = std::max(recentReadVersion, snapshotTargetEndVersion); @@ -1707,12 +2206,12 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { state int countShardsToDispatch = std::max(0, countExpectedShardsDone - countShardsDone); // Calculate the number of shards that would have been dispatched by a normal (on-schedule) - // BackupSnapshotDispatchTask given the dispatch window and the start and expected-end versions of the current - // snapshot. + // BackupSnapshotDispatchTask given the dispatch window and the start and expected-end versions of the + // current snapshot. int64_t dispatchWindow = nextDispatchVersion - recentReadVersion; - // If the scheduled snapshot interval is 0 (such as for initial, as-fast-as-possible snapshot) then all shards - // are considered late + // If the scheduled snapshot interval is 0 (such as for initial, as-fast-as-possible snapshot) then all + // shards are considered late int countShardsExpectedPerNormalWindow; if (snapshotScheduledVersionInterval == 0) { countShardsExpectedPerNormalWindow = 0; @@ -1723,8 +2222,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { (double(dispatchWindow) / snapshotScheduledVersionInterval) * countAllShards; } - // The number of shards 'behind' the snapshot is the count of how may additional shards beyond normal are being - // dispatched, if any. + // The number of shards 'behind' the snapshot is the count of how may additional shards beyond normal are + // being dispatched, if any. int countShardsBehind = std::max(0, countShardsToDispatch + snapshotBatchSize.get() - countShardsExpectedPerNormalWindow); Params.shardsBehind().set(task, countShardsBehind); @@ -1802,8 +2301,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { wait(store(snapshotBatchSize.get(), config.snapshotBatchSize().getOrThrow(tr)) && waitForAll(beginReads) && waitForAll(endReads) && taskBucket->keepRunning(tr, task)); - // Snapshot batch size should be either oldBatchSize or newBatchSize. If new, this transaction is - // already done. + // Snapshot batch size should be either oldBatchSize or newBatchSize. If new, this transaction + // is already done. if (snapshotBatchSize.get() == newBatchSize) { break; } else { @@ -1841,8 +2340,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { } Version scheduledVersion = invalidVersion; - // If the next dispatch version is in the future, choose a random version at which to start - // the new task. + // If the next dispatch version is in the future, choose a random version at which to + // start the new task. if (nextDispatchVersion > recentReadVersion) scheduledVersion = recentReadVersion + deterministicRandom()->random01() * (nextDispatchVersion - recentReadVersion); @@ -1868,8 +2367,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { .detail("BeginKey", range.begin.printable()) .detail("EndKey", range.end.printable()); } else { - // This shouldn't happen because if the transaction was already done or if another execution - // of this task is making progress it should have been detected above. + // This shouldn't happen because if the transaction was already done or if another + // execution of this task is making progress it should have been detected above. ASSERT(false); } } @@ -1901,9 +2400,9 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { } // This function is just a wrapper for BackupSnapshotManifest::addTask() which is defined below. 
- // The BackupSnapshotDispatchTask and BackupSnapshotManifest tasks reference each other so in order to keep their - // execute and finish phases defined together inside their class definitions this wrapper is declared here but - // defined after BackupSnapshotManifest is defined. + // The BackupSnapshotDispatchTask and BackupSnapshotManifest tasks reference each other so in order to keep + // their execute and finish phases defined together inside their class definitions this wrapper is declared here + // but defined after BackupSnapshotManifest is defined. static Future addSnapshotManifestTask(Reference tr, Reference taskBucket, Reference parentTask, @@ -1936,9 +2435,9 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { state Reference snapshotFinishedFuture = task->getDoneFuture(futureBucket); - // If the snapshot is finished, the next task is to write a snapshot manifest, otherwise it's another snapshot - // dispatch task. In either case, the task should wait for snapshotBatchFuture. The snapshot done key, passed to - // the current task, is also passed on. + // If the snapshot is finished, the next task is to write a snapshot manifest, otherwise it's another + // snapshot dispatch task. In either case, the task should wait for snapshotBatchFuture. The snapshot done + // key, passed to the current task, is also passed on. if (Params.snapshotFinished().getOrDefault(task, false)) { wait(success(addSnapshotManifestTask( tr, taskBucket, task, TaskCompletionKey::signal(snapshotFinishedFuture), snapshotBatchFuture))); @@ -1960,7 +2459,7 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { return Void(); } }; -StringRef BackupSnapshotDispatchTask::name = LiteralStringRef("file_backup_dispatch_ranges_5.2"); +StringRef BackupSnapshotDispatchTask::name = "file_backup_dispatch_ranges_5.2"_sr; REGISTER_TASKFUNC(BackupSnapshotDispatchTask); struct BackupLogRangeTaskFunc : BackupTaskFuncBase { @@ -1968,10 +2467,10 @@ struct BackupLogRangeTaskFunc : BackupTaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam addBackupLogRangeTasks() { return LiteralStringRef(__FUNCTION__); } - static TaskParam fileSize() { return LiteralStringRef(__FUNCTION__); } - static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } - static TaskParam endVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam addBackupLogRangeTasks() { return __FUNCTION__sr; } + static TaskParam fileSize() { return __FUNCTION__sr; } + static TaskParam beginVersion() { return __FUNCTION__sr; } + static TaskParam endVersion() { return __FUNCTION__sr; } } Params; StringRef getName() const override { return name; }; @@ -2032,10 +2531,10 @@ struct BackupLogRangeTaskFunc : BackupTaskFuncBase { Key destUidValue = wait(config.destUidValue().getOrThrow(tr)); // Get the set of key ranges that hold mutations for (beginVersion, endVersion). They will be queried in - // parallel below and there is a limit on how many we want to process in a single BackupLogRangeTask so if that - // limit is exceeded then set the addBackupLogRangeTasks boolean in Params and stop, signalling the finish() - // step to break up the (beginVersion, endVersion) range into smaller intervals which are then processed by - // individual BackupLogRangeTasks. 
+ // parallel below and there is a limit on how many we want to process in a single BackupLogRangeTask so if + // that limit is exceeded then set the addBackupLogRangeTasks boolean in Params and stop, signalling the + // finish() step to break up the (beginVersion, endVersion) range into smaller intervals which are then + // processed by individual BackupLogRangeTasks. state Standalone> ranges = getLogRanges(beginVersion, endVersion, destUidValue); if (ranges.size() > CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES) { Params.addBackupLogRangeTasks().set(task, true); @@ -2049,9 +2548,9 @@ struct BackupLogRangeTaskFunc : BackupTaskFuncBase { state Reference outFile = wait(bc->writeLogFile(beginVersion, endVersion, blockSize)); state LogFileWriter logFile(outFile, blockSize); - // Query all key ranges covering (beginVersion, endVersion) in parallel, writing their results to the results - // promise stream as they are received. Note that this means the records read from the results stream are not - // likely to be in increasing Version order. + // Query all key ranges covering (beginVersion, endVersion) in parallel, writing their results to the + // results promise stream as they are received. Note that this means the records read from the results + // stream are not likely to be in increasing Version order. state PromiseStream results; state std::vector> rc; @@ -2060,7 +2559,7 @@ struct BackupLogRangeTaskFunc : BackupTaskFuncBase { readCommitted(cx, results, lock, range, Terminator::False, AccessSystemKeys::True, LockAware::True)); } - state Future sendEOS = map(errorOr(waitForAll(rc)), [=](ErrorOr const& result) { + state Future sendEOS = map(errorOr(waitForAll(rc)), [=](ErrorOr const& result) mutable { if (result.isError()) results.sendError(result.getError()); else @@ -2199,7 +2698,7 @@ struct BackupLogRangeTaskFunc : BackupTaskFuncBase { } }; -StringRef BackupLogRangeTaskFunc::name = LiteralStringRef("file_backup_write_logs_5.2"); +StringRef BackupLogRangeTaskFunc::name = "file_backup_write_logs_5.2"_sr; REGISTER_TASKFUNC(BackupLogRangeTaskFunc); // This task stopped being used in 6.2, however the code remains here to handle upgrades. 
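BackupLogRangeTaskFunc above fans several parallel readCommitted() queries into one PromiseStream and, via sendEOS, forwards either the first error or a single end-of-stream once every reader has finished, which is why records do not arrive in increasing version order. The following standalone C++ sketch (not Flow code) mimics that fan-in pattern with threads and a mutex-guarded queue; ResultStream and all other names are hypothetical stand-ins for PromiseStream and the actors involved.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <optional>
#include <queue>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>

struct ResultStream {
    std::mutex m;
    std::condition_variable cv;
    std::queue<std::string> q;
    bool endOfStream = false;
    std::optional<std::string> error;

    void send(std::string v) {
        { std::lock_guard<std::mutex> g(m); q.push(std::move(v)); }
        cv.notify_one();
    }
    void sendEndOfStream() {
        { std::lock_guard<std::mutex> g(m); endOfStream = true; }
        cv.notify_one();
    }
    void sendError(std::string e) {
        { std::lock_guard<std::mutex> g(m); error = std::move(e); }
        cv.notify_one();
    }
    // Returns a record, nullopt on end-of-stream, or throws on a forwarded error.
    std::optional<std::string> next() {
        std::unique_lock<std::mutex> g(m);
        cv.wait(g, [&] { return !q.empty() || endOfStream || error; });
        if (!q.empty()) { auto v = std::move(q.front()); q.pop(); return v; }
        if (error) throw std::runtime_error(*error);
        return std::nullopt;
    }
};

int main() {
    ResultStream results;
    std::vector<std::thread> readers;
    // Each "reader" stands in for one readCommitted() over one mutation-log key range.
    for (int r = 0; r < 3; ++r)
        readers.emplace_back([&, r] { results.send("range " + std::to_string(r) + " data"); });
    // Equivalent of sendEOS: once every reader is done, send a single end-of-stream.
    std::thread eos([&] { for (auto& t : readers) t.join(); results.sendEndOfStream(); });

    while (auto rec = results.next()) // records may arrive in any order
        std::cout << *rec << "\n";
    eos.join();
    return 0;
}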
@@ -2209,9 +2708,9 @@ struct EraseLogRangeTaskFunc : BackupTaskFuncBase { StringRef getName() const override { return name; }; static struct { - static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } - static TaskParam endVersion() { return LiteralStringRef(__FUNCTION__); } - static TaskParam destUidValue() { return LiteralStringRef(__FUNCTION__); } + static TaskParam beginVersion() { return __FUNCTION__sr; } + static TaskParam endVersion() { return __FUNCTION__sr; } + static TaskParam destUidValue() { return __FUNCTION__sr; } } Params; ACTOR static Future addTask(Reference tr, @@ -2230,7 +2729,8 @@ struct EraseLogRangeTaskFunc : BackupTaskFuncBase { BackupConfig(logUid), waitFor, [=](Reference task) { - Params.beginVersion().set(task, 1); // FIXME: remove in 6.X, only needed for 5.2 backward compatibility + Params.beginVersion().set(task, + 1); // FIXME: remove in 6.X, only needed for 5.2 backward compatibility Params.endVersion().set(task, endVersion); Params.destUidValue().set(task, destUidValue); }, @@ -2274,7 +2774,7 @@ struct EraseLogRangeTaskFunc : BackupTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef EraseLogRangeTaskFunc::name = LiteralStringRef("file_backup_erase_logs_5.2"); +StringRef EraseLogRangeTaskFunc::name = "file_backup_erase_logs_5.2"_sr; REGISTER_TASKFUNC(EraseLogRangeTaskFunc); struct BackupLogsDispatchTask : BackupTaskFuncBase { @@ -2282,8 +2782,8 @@ struct BackupLogsDispatchTask : BackupTaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam prevBeginVersion() { return LiteralStringRef(__FUNCTION__); } - static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam prevBeginVersion() { return __FUNCTION__sr; } + static TaskParam beginVersion() { return __FUNCTION__sr; } } Params; ACTOR static Future _finish(Reference tr, @@ -2352,8 +2852,8 @@ struct BackupLogsDispatchTask : BackupTaskFuncBase { state int priority = latestSnapshotEndVersion.present() ? 
1 : 0; if (!partitionedLog.present() || !partitionedLog.get()) { - // Add the initial log range task to read/copy the mutations and the next logs dispatch task which will run - // after this batch is done + // Add the initial log range task to read/copy the mutations and the next logs dispatch task which will + // run after this batch is done wait(success(BackupLogRangeTaskFunc::addTask(tr, taskBucket, task, @@ -2444,7 +2944,7 @@ struct BackupLogsDispatchTask : BackupTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef BackupLogsDispatchTask::name = LiteralStringRef("file_backup_dispatch_logs_5.2"); +StringRef BackupLogsDispatchTask::name = "file_backup_dispatch_logs_5.2"_sr; REGISTER_TASKFUNC(BackupLogsDispatchTask); struct FileBackupFinishedTask : BackupTaskFuncBase { @@ -2504,14 +3004,14 @@ struct FileBackupFinishedTask : BackupTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef FileBackupFinishedTask::name = LiteralStringRef("file_backup_finished_5.2"); +StringRef FileBackupFinishedTask::name = "file_backup_finished_5.2"_sr; REGISTER_TASKFUNC(FileBackupFinishedTask); struct BackupSnapshotManifest : BackupTaskFuncBase { static StringRef name; static constexpr uint32_t version = 1; static struct { - static TaskParam endVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam endVersion() { return __FUNCTION__sr; } } Params; ACTOR static Future _execute(Database cx, @@ -2523,8 +3023,8 @@ struct BackupSnapshotManifest : BackupTaskFuncBase { state Reference tr(new ReadYourWritesTransaction(cx)); - // Read the entire range file map into memory, then walk it backwards from its last entry to produce a list of - // non overlapping key range files + // Read the entire range file map into memory, then walk it backwards from its last entry to produce a list + // of non overlapping key range files state std::map localmap; state Key startKey; state int batchSize = BUGGIFY ? 1 : 1000000; @@ -2588,10 +3088,11 @@ struct BackupSnapshotManifest : BackupTaskFuncBase { totalBytes += r.fileSize; // Jump to file that either ends where this file begins or has the greatest end that is less than - // the begin of this file. In other words find the map key that is <= begin of this file. To do this - // find the first end strictly greater than begin and then back up one. + // the begin of this file. In other words find the map key that is <= begin of this file. To do + // this find the first end strictly greater than begin and then back up one. 
i = localmap.upper_bound(i->second.begin); - // If we get begin then we're done, there are no more ranges that end at or before the last file's begin + // If we get begin then we're done, there are no more ranges that end at or before the last file's + // begin if (i == localmap.begin()) break; --i; @@ -2693,7 +3194,7 @@ struct BackupSnapshotManifest : BackupTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef BackupSnapshotManifest::name = LiteralStringRef("file_backup_write_snapshot_manifest_5.2"); +StringRef BackupSnapshotManifest::name = "file_backup_write_snapshot_manifest_5.2"_sr; REGISTER_TASKFUNC(BackupSnapshotManifest); Future BackupSnapshotDispatchTask::addSnapshotManifestTask(Reference tr, @@ -2709,7 +3210,7 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam beginVersion() { return __FUNCTION__sr; } } Params; ACTOR static Future _execute(Database cx, @@ -2839,8 +3340,8 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase { wait(success(BackupLogsDispatchTask::addTask( tr, taskBucket, task, 1, 0, beginVersion, TaskCompletionKey::joinWith(backupFinished)))); - // If a clean stop is requested, the log and snapshot tasks will quit after the backup is restorable, then the - // following task will clean up and set the completed state. + // If a clean stop is requested, the log and snapshot tasks will quit after the backup is restorable, then + // the following task will clean up and set the completed state. wait(success( FileBackupFinishedTask::addTask(tr, taskBucket, task, TaskCompletionKey::noSignal(), backupFinished))); @@ -2879,7 +3380,7 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef StartFullBackupTaskFunc::name = LiteralStringRef("file_backup_start_5.2"); +StringRef StartFullBackupTaskFunc::name = "file_backup_start_5.2"_sr; REGISTER_TASKFUNC(StartFullBackupTaskFunc); struct RestoreCompleteTaskFunc : RestoreTaskFuncBase { @@ -2895,13 +3396,14 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase { // Clear the file map now since it could be huge. restore.fileSet().clear(tr); - // TODO: Validate that the range version map has exactly the restored ranges in it. This means that for any - // restore operation the ranges to restore must be within the backed up ranges, otherwise from the restore - // perspective it will appear that some key ranges were missing and so the backup set is incomplete and the - // restore has failed. This validation cannot be done currently because Restore only supports a single restore - // range but backups can have many ranges. + // TODO: Validate that the range version map has exactly the restored ranges in it. This means that for + // any restore operation the ranges to restore must be within the backed up ranges, otherwise from the + // restore perspective it will appear that some key ranges were missing and so the backup set is incomplete + // and the restore has failed. This validation cannot be done currently because Restore only supports a + // single restore range but backups can have many ranges. - // Clear the applyMutations stuff, including any unapplied mutations from versions beyond the restored version. + // Clear the applyMutations stuff, including any unapplied mutations from versions beyond the restored + // version. 
restore.clearApplyMutationsKeys(tr); wait(taskBucket->finish(tr, task)); @@ -2926,7 +3428,7 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase { } wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } static StringRef name; @@ -2946,14 +3448,14 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef RestoreCompleteTaskFunc::name = LiteralStringRef("restore_complete"); +StringRef RestoreCompleteTaskFunc::name = "restore_complete"_sr; REGISTER_TASKFUNC(RestoreCompleteTaskFunc); struct RestoreFileTaskFuncBase : RestoreTaskFuncBase { struct InputParams { - static TaskParam inputFile() { return LiteralStringRef(__FUNCTION__); } - static TaskParam readOffset() { return LiteralStringRef(__FUNCTION__); } - static TaskParam readLen() { return LiteralStringRef(__FUNCTION__); } + static TaskParam inputFile() { return __FUNCTION__sr; } + static TaskParam readOffset() { return __FUNCTION__sr; } + static TaskParam readLen() { return __FUNCTION__sr; } } Params; std::string toString(Reference task) const override { @@ -2968,8 +3470,8 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { static struct : InputParams { // The range of data that the (possibly empty) data represented, which is set if it intersects the target // restore range - static TaskParam originalFileRange() { return LiteralStringRef(__FUNCTION__); } - static TaskParam> originalFileRanges() { return LiteralStringRef(__FUNCTION__); } + static TaskParam originalFileRange() { return __FUNCTION__sr; } + static TaskParam> originalFileRanges() { return __FUNCTION__sr; } static std::vector getOriginalFileRanges(Reference task) { if (originalFileRanges().exists(task)) { @@ -3038,7 +3540,8 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { } state Reference inFile = wait(bc.get()->readFile(rangeFile.fileName)); - state Standalone> blockData = wait(decodeRangeFileBlock(inFile, readOffset, readLen)); + state Standalone> blockData = + wait(decodeRangeFileBlock(inFile, readOffset, readLen, cx)); // First and last key are the range for this file state KeyRange fileRange = KeyRangeRef(blockData.front().key, blockData.back().key); @@ -3065,16 +3568,16 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { state VectorRef data = blockData.slice(rangeStart, rangeEnd); // Shrink file range to be entirely within restoreRange and translate it to the new prefix - // First, use the untranslated file range to create the shrunk original file range which must be used in the - // kv range version map for applying mutations + // First, use the untranslated file range to create the shrunk original file range which must be used in + // the kv range version map for applying mutations state KeyRange originalFileRange = KeyRangeRef(std::max(fileRange.begin, restoreRange.begin), std::min(fileRange.end, restoreRange.end)); originalFileRanges.push_back(originalFileRange); // Now shrink and translate fileRange Key fileEnd = std::min(fileRange.end, restoreRange.end); - if (fileEnd == (removePrefix.get() == StringRef() ? normalKeys.end : strinc(removePrefix.get()))) { - fileEnd = addPrefix.get() == StringRef() ? normalKeys.end : strinc(addPrefix.get()); + if (fileEnd == (removePrefix.get() == StringRef() ? allKeys.end : strinc(removePrefix.get()))) { + fileEnd = addPrefix.get() == StringRef() ? 
allKeys.end : strinc(addPrefix.get()); } else { fileEnd = fileEnd.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()); } @@ -3112,7 +3615,6 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { : data[start].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()), (iend == end) ? fileRange.end : data[iend].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get())); - tr->clear(trRange); for (; i < iend; ++i) { @@ -3218,7 +3720,7 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { } wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } static StringRef name; @@ -3238,7 +3740,7 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef RestoreRangeTaskFunc::name = LiteralStringRef("restore_range_data"); +StringRef RestoreRangeTaskFunc::name = "restore_range_data"_sr; REGISTER_TASKFUNC(RestoreRangeTaskFunc); // Decodes a mutation log key, which contains (hash, commitVersion, chunkNumber) and @@ -3333,6 +3835,14 @@ bool AccumulatedMutations::matchesAnyRange(const std::vector& ranges) std::vector mutations = decodeMutationLogValue(serializedMutations); for (auto& m : mutations) { for (auto& r : ranges) { + if (m.type == MutationRef::Encrypted) { + // TODO: In order to filter out encrypted mutations that are not relevant to the + // target range, they would have to be decrypted here in order to check relevance + // below, however the staged mutations would still need to remain encrypted for + // staging into the destination database. Without decrypting, we must assume that + // some data could match the range and return true here. + return true; + } if (m.type == MutationRef::ClearRange) { if (r.intersects(KeyRangeRef(m.param1, m.param2))) { return true; @@ -3428,8 +3938,8 @@ struct RestoreLogDataTaskFunc : RestoreFileTaskFuncBase { state Standalone> dataOriginal = wait(decodeMutationLogFileBlock(inFile, readOffset, readLen)); - // Filter the KV pairs extracted from the log file block to remove any records known to not be needed for this - // restore based on the restore range set. + // Filter the KV pairs extracted from the log file block to remove any records known to not be needed for + // this restore based on the restore range set. state std::vector dataFiltered = filterLogMutationKVPairs(dataOriginal, ranges); state int start = 0; @@ -3503,8 +4013,8 @@ struct RestoreLogDataTaskFunc : RestoreFileTaskFuncBase { state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); - // TODO: Check to see if there is a leak in the FutureBucket since an invalid task (validation key fails) will - // never set its taskFuture. + // TODO: Check to see if there is a leak in the FutureBucket since an invalid task (validation key fails) + // will never set its taskFuture. 
wait(taskFuture->set(tr, taskBucket) && taskBucket->finish(tr, task)); return Void(); @@ -3532,7 +4042,7 @@ struct RestoreLogDataTaskFunc : RestoreFileTaskFuncBase { } wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } Future execute(Database cx, @@ -3548,7 +4058,7 @@ struct RestoreLogDataTaskFunc : RestoreFileTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef RestoreLogDataTaskFunc::name = LiteralStringRef("restore_log_data"); +StringRef RestoreLogDataTaskFunc::name = "restore_log_data"_sr; REGISTER_TASKFUNC(RestoreLogDataTaskFunc); struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { @@ -3557,11 +4067,11 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { StringRef getName() const override { return name; }; static struct { - static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } - static TaskParam beginFile() { return LiteralStringRef(__FUNCTION__); } - static TaskParam beginBlock() { return LiteralStringRef(__FUNCTION__); } - static TaskParam batchSize() { return LiteralStringRef(__FUNCTION__); } - static TaskParam remainingInBatch() { return LiteralStringRef(__FUNCTION__); } + static TaskParam beginVersion() { return __FUNCTION__sr; } + static TaskParam beginFile() { return __FUNCTION__sr; } + static TaskParam beginBlock() { return __FUNCTION__sr; } + static TaskParam batchSize() { return __FUNCTION__sr; } + static TaskParam remainingInBatch() { return __FUNCTION__sr; } } Params; ACTOR static Future _finish(Reference tr, @@ -3612,8 +4122,8 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { } state std::string beginFile = Params.beginFile().getOrDefault(task); - // Get a batch of files. We're targeting batchSize blocks being dispatched so query for batchSize files (each - // of which is 0 or more blocks). + // Get a batch of files. We're targeting batchSize blocks being dispatched so query for batchSize files + // (each of which is 0 or more blocks). state int taskBatchSize = BUGGIFY ? 1 : CLIENT_KNOBS->RESTORE_DISPATCH_ADDTASK_SIZE; state RestoreConfig::FileSetT::RangeResultType files = wait(restore.fileSet().getRange( tr, Optional({ beginVersion, beginFile }), {}, taskBatchSize)); @@ -3635,8 +4145,8 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { // If there were no files to load then this batch is done and restore is almost done. if (files.results.size() == 0) { - // If adding to existing batch then blocks could be in progress so create a new Dispatch task that waits for - // them to finish + // If adding to existing batch then blocks could be in progress so create a new Dispatch task that waits + // for them to finish if (addingToExistingBatch) { // Setting next begin to restoreVersion + 1 so that any files in the file map at the restore version // won't be dispatched again. @@ -3686,8 +4196,8 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { .detail("Decision", "restore_complete") .detail("TaskInstance", THIS_ADDR); } else { - // Applying of mutations is not yet finished so wait a small amount of time and then re-add this same - // task. + // Applying of mutations is not yet finished so wait a small amount of time and then re-add this + // same task. 
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); wait(success(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, beginVersion, "", 0, batchSize))); @@ -3720,9 +4230,9 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { for (; i < files.results.size(); ++i) { RestoreConfig::RestoreFile& f = files.results[i]; - // Here we are "between versions" (prior to adding the first block of the first file of a new version) so - // this is an opportunity to end the current dispatch batch (which must end on a version boundary) if the - // batch size has been reached or exceeded + // Here we are "between versions" (prior to adding the first block of the first file of a new version) + // so this is an opportunity to end the current dispatch batch (which must end on a version boundary) if + // the batch size has been reached or exceeded if (f.version != endVersion && remainingInBatch <= 0) { // Next start will be at the first version after endVersion at the first file first block ++endVersion; @@ -3784,13 +4294,13 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { .detail("TaskInstance", THIS_ADDR); } - // If no blocks were dispatched then the next dispatch task should run now and be joined with the allPartsDone - // future + // If no blocks were dispatched then the next dispatch task should run now and be joined with the + // allPartsDone future if (blocksDispatched == 0) { std::string decision; - // If no files were dispatched either then the batch size wasn't large enough to catch all of the files at - // the next lowest non-dispatched version, so increase the batch size. + // If no files were dispatched either then the batch size wasn't large enough to catch all of the files + // at the next lowest non-dispatched version, so increase the batch size. if (i == 0) { batchSize *= 2; decision = "increased_batch_size"; @@ -3833,13 +4343,13 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { restore.filesBlocksDispatched().atomicOp(tr, blocksDispatched, MutationRef::Type::AddValue); // If beginFile is not empty then we had to stop in the middle of a version (possibly within a file) so we - // cannot end the batch here because we do not know if we got all of the files and blocks from the last version - // queued, so make sure remainingInBatch is at least 1. + // cannot end the batch here because we do not know if we got all of the files and blocks from the last + // version queued, so make sure remainingInBatch is at least 1. if (!beginFile.empty()) remainingInBatch = std::max(1, remainingInBatch); - // If more blocks need to be dispatched in this batch then add a follow-on task that is part of the allPartsDone - // group which will won't wait to run and will add more block tasks. + // If more blocks need to be dispatched in this batch then add a follow-on task that is part of the + // allPartsDone group which will won't wait to run and will add more block tasks. 
if (remainingInBatch > 0) addTaskFutures.push_back(RestoreDispatchTaskFunc::addTask(tr, taskBucket, @@ -3916,7 +4426,7 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { } wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } Future execute(Database cx, @@ -3932,7 +4442,7 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef RestoreDispatchTaskFunc::name = LiteralStringRef("restore_dispatch"); +StringRef RestoreDispatchTaskFunc::name = "restore_dispatch"_sr; REGISTER_TASKFUNC(RestoreDispatchTaskFunc); ACTOR Future restoreStatus(Reference tr, Key tagName) { @@ -4030,7 +4540,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam firstVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam firstVersion() { return __FUNCTION__sr; } } Params; // Find all files needed for the restore and save them in the RestoreConfig for the task. @@ -4096,7 +4606,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { .detail("RestoreVersion", restoreVersion) .detail("Dest", destVersion); if (destVersion <= restoreVersion) { - CODE_PROBE(true, "Forcing restored cluster to higher version"); + CODE_PROBE(true, "Forcing restored cluster to higher version", probe::decoration::rare); tr->set(minRequiredCommitVersionKey, BinaryWriter::toValue(restoreVersion + 1, Unversioned())); wait(tr->commit()); } else { @@ -4116,7 +4626,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { keyRangesFilter.push_back_deep(keyRangesFilter.arena(), KeyRangeRef(r)); } state Optional restorable = - wait(bc->getRestoreSet(restoreVersion, keyRangesFilter, logsOnly, beginVersion)); + wait(bc->getRestoreSet(restoreVersion, cx, keyRangesFilter, logsOnly, beginVersion)); if (!restorable.present()) throw restore_missing_data(); @@ -4128,8 +4638,8 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { if (!inconsistentSnapshotOnly) { for (const RangeFile& f : restorable.get().ranges) { files.push_back({ f.version, f.fileName, true, f.blockSize, f.fileSize }); - // In a restore with both snapshots and logs, the firstConsistentVersion is the highest version of - // any range file. + // In a restore with both snapshots and logs, the firstConsistentVersion is the highest version + // of any range file. 
firstConsistentVersion = std::max(firstConsistentVersion, f.version); } } else { @@ -4248,7 +4758,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { // If this is an incremental restore, we need to set the applyMutationsMapPrefix // to the earliest log version so no mutations are missed Value versionEncoded = BinaryWriter::toValue(Params.firstVersion().get(task), Unversioned()); - wait(krmSetRange(tr, restore.applyMutationsMapPrefix(), normalKeys, versionEncoded)); + wait(krmSetRange(tr, restore.applyMutationsMapPrefix(), allKeys, versionEncoded)); } return Void(); } @@ -4274,7 +4784,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { } wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -4292,7 +4802,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef StartFullRestoreTaskFunc::name = LiteralStringRef("restore_start"); +StringRef StartFullRestoreTaskFunc::name = "restore_start"_sr; REGISTER_TASKFUNC(StartFullRestoreTaskFunc); } // namespace fileBackup @@ -4378,7 +4888,7 @@ public: .detail("OverrideTargetVersion", targetVersion); } - Optional restoreSet = wait(bc->getRestoreSet(targetVersion)); + Optional restoreSet = wait(bc->getRestoreSet(targetVersion, cx)); if (!restoreSet.present()) { TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible") @@ -4515,6 +5025,7 @@ public: int snapshotIntervalSeconds, std::string tagName, Standalone> backupRanges, + bool encryptionEnabled, StopWhenDone stopWhenDone, UsePartitionedLog partitionedLog, IncrementalBackupOnly incrementalBackupOnly, @@ -4593,24 +5104,28 @@ public: config.clear(tr); state Key destUidValue(BinaryWriter::toValue(uid, Unversioned())); - if (normalizedRanges.size() == 1) { + if (normalizedRanges.size() == 1 || isDefaultBackup(normalizedRanges)) { RangeResult existingDestUidValues = wait( tr->getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY)); bool found = false; + KeyRangeRef targetRange = + normalizedRanges.size() == 1 ? 
normalizedRanges[0] : getDefaultBackupSharedRange(); for (auto it : existingDestUidValues) { - if (BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), IncludeVersion()) == - normalizedRanges[0]) { + KeyRange uidRange = + BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), IncludeVersion()); + if (uidRange == targetRange) { destUidValue = it.value; found = true; + CODE_PROBE(targetRange == getDefaultBackupSharedRange(), + "Backup mutation sharing with default backup"); break; } } if (!found) { destUidValue = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned()); - tr->set( - BinaryWriter::toValue(normalizedRanges[0], IncludeVersion(ProtocolVersion::withSharedMutations())) - .withPrefix(destUidLookupPrefix), - destUidValue); + tr->set(BinaryWriter::toValue(targetRange, IncludeVersion(ProtocolVersion::withSharedMutations())) + .withPrefix(destUidLookupPrefix), + destUidValue); } } @@ -4633,6 +5148,7 @@ public: config.snapshotIntervalSeconds().set(tr, snapshotIntervalSeconds); config.partitionedLogEnabled().set(tr, partitionedLog); config.incrementalBackupOnly().set(tr, incrementalBackupOnly); + config.enableSnapshotBackupEncryption().set(tr, encryptionEnabled); Key taskKey = wait(fileBackup::StartFullBackupTaskFunc::addTask( tr, backupAgent->taskBucket, uid, TaskCompletionKey::noSignal())); @@ -4885,7 +5401,7 @@ public: tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); try { - tr->set(backupPausedKey, pause ? LiteralStringRef("1") : LiteralStringRef("0")); + tr->set(backupPausedKey, pause ? "1"_sr : "0"_sr); wait(tr->commit()); break; } catch (Error& e) { @@ -5340,7 +5856,7 @@ public: } Optional restoreSet = - wait(bc->getRestoreSet(targetVersion, ranges, onlyApplyMutationLogs, beginVersion)); + wait(bc->getRestoreSet(targetVersion, cx, ranges, onlyApplyMutationLogs, beginVersion)); if (!restoreSet.present()) { TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible") @@ -5622,6 +6138,60 @@ Future FileBackupAgent::restore(Database cx, deterministicRandom()->randomUniqueID()); } +Future FileBackupAgent::restore(Database cx, + Optional cxOrig, + Key tagName, + Key url, + Optional proxy, + WaitForComplete waitForComplete, + Version targetVersion, + Verbose verbose, + KeyRange range, + Key addPrefix, + Key removePrefix, + LockDB lockDB, + OnlyApplyMutationLogs onlyApplyMutationLogs, + InconsistentSnapshotOnly inconsistentSnapshotOnly, + Version beginVersion, + Optional const& encryptionKeyFileName) { + Standalone> rangeRef; + if (range.begin.empty() && range.end.empty()) { + addDefaultBackupRanges(rangeRef); + } else { + rangeRef.push_back_deep(rangeRef.arena(), range); + } + return restore(cx, + cxOrig, + tagName, + url, + proxy, + rangeRef, + waitForComplete, + targetVersion, + verbose, + addPrefix, + removePrefix, + lockDB, + onlyApplyMutationLogs, + inconsistentSnapshotOnly, + beginVersion, + encryptionKeyFileName); +} + +Future FileBackupAgent::atomicRestore(Database cx, + Key tagName, + KeyRange range, + Key addPrefix, + Key removePrefix) { + Standalone> rangeRef; + if (range.begin.empty() && range.end.empty()) { + addDefaultBackupRanges(rangeRef); + } else { + rangeRef.push_back_deep(rangeRef.arena(), range); + } + return atomicRestore(cx, tagName, rangeRef, addPrefix, removePrefix); +} + Future FileBackupAgent::atomicRestore(Database cx, Key tagName, Standalone> ranges, @@ -5654,6 +6224,7 @@ Future FileBackupAgent::submitBackup(Reference int snapshotIntervalSeconds, std::string const& tagName, Standalone> 
backupRanges, + bool encryptionEnabled, StopWhenDone stopWhenDone, UsePartitionedLog partitionedLog, IncrementalBackupOnly incrementalBackupOnly, @@ -5666,6 +6237,7 @@ Future FileBackupAgent::submitBackup(Reference snapshotIntervalSeconds, tagName, backupRanges, + encryptionEnabled, stopWhenDone, partitionedLog, incrementalBackupOnly, @@ -5766,7 +6338,7 @@ ACTOR static Future writeKVs(Database cx, Standalone& affectedRanges) { @@ -35,32 +36,54 @@ void KeyRangeActorMap::getRangesAffectedByInsertion(const KeyRangeRef& keys, std affectedRanges.push_back(KeyRangeRef(keys.end, e.end())); } -RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv) { +RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv, bool align) { ASSERT(!kv.more || kv.size() > 1); KeyRange withPrefix = KeyRangeRef(mapPrefix.toString() + keys.begin.toString(), mapPrefix.toString() + keys.end.toString()); - ValueRef beginValue, endValue; - if (kv.size() && kv[0].key.startsWith(mapPrefix)) - beginValue = kv[0].value; - if (kv.size() && kv.end()[-1].key.startsWith(mapPrefix)) - endValue = kv.end()[-1].value; - RangeResult result; result.arena().dependsOn(kv.arena()); result.arena().dependsOn(keys.arena()); - result.push_back(result.arena(), KeyValueRef(keys.begin, beginValue)); + // Always push a kv pair <= keys.begin. + KeyRef beginKey = keys.begin; + if (!align && !kv.empty() && kv.front().key.startsWith(mapPrefix) && kv.front().key < withPrefix.begin) { + beginKey = kv[0].key.removePrefix(mapPrefix); + } + ValueRef beginValue; + if (!kv.empty() && kv.front().key.startsWith(mapPrefix) && kv.front().key <= withPrefix.begin) { + beginValue = kv.front().value; + } + result.push_back(result.arena(), KeyValueRef(beginKey, beginValue)); + for (int i = 0; i < kv.size(); i++) { if (kv[i].key > withPrefix.begin && kv[i].key < withPrefix.end) { KeyRef k = kv[i].key.removePrefix(mapPrefix); result.push_back(result.arena(), KeyValueRef(k, kv[i].value)); - } else if (kv[i].key >= withPrefix.end) + } else if (kv[i].key >= withPrefix.end) { kv.more = false; + // There should be at most 1 value past mapPrefix + keys.end. + ASSERT(i == kv.size() - 1); + break; + } } - if (!kv.more) - result.push_back(result.arena(), KeyValueRef(keys.end, endValue)); + if (!kv.more) { + KeyRef endKey = keys.end; + if (!align && !kv.empty() && kv.back().key.startsWith(mapPrefix) && kv.back().key >= withPrefix.end) { + endKey = kv.back().key.removePrefix(mapPrefix); + } + ValueRef endValue; + if (!kv.empty()) { + // In the aligned case, carry the last value to be the end value. 
+ if (align && kv.back().key.startsWith(mapPrefix) && kv.back().key > withPrefix.end) { + endValue = result.back().value; + } else { + endValue = kv.back().value; + } + } + result.push_back(result.arena(), KeyValueRef(endKey, endValue)); + } result.more = kv.more; return result; @@ -93,6 +116,43 @@ ACTOR Future krmGetRanges(Reference tr, return krmDecodeRanges(mapPrefix, keys, kv); } +// Returns keys.begin, all transitional points in keys, and keys.end, and their values +ACTOR Future krmGetRangesUnaligned(Transaction* tr, + Key mapPrefix, + KeyRange keys, + int limit, + int limitBytes) { + KeyRange withPrefix = + KeyRangeRef(mapPrefix.toString() + keys.begin.toString(), mapPrefix.toString() + keys.end.toString()); + + state GetRangeLimits limits(limit, limitBytes); + limits.minRows = 2; + // wait to include the next highest row >= keys.end in the result, so since end is exclusive, we need +2 and + // !orEqual + RangeResult kv = + wait(tr->getRange(lastLessOrEqual(withPrefix.begin), KeySelectorRef(withPrefix.end, false, +2), limits)); + + return krmDecodeRanges(mapPrefix, keys, kv, false); +} + +ACTOR Future krmGetRangesUnaligned(Reference tr, + Key mapPrefix, + KeyRange keys, + int limit, + int limitBytes) { + KeyRange withPrefix = + KeyRangeRef(mapPrefix.toString() + keys.begin.toString(), mapPrefix.toString() + keys.end.toString()); + + state GetRangeLimits limits(limit, limitBytes); + limits.minRows = 2; + // wait to include the next highest row >= keys.end in the result, so since end is exclusive, we need +2 and + // !orEqual + RangeResult kv = + wait(tr->getRange(lastLessOrEqual(withPrefix.begin), KeySelectorRef(withPrefix.end, false, +2), limits)); + + return krmDecodeRanges(mapPrefix, keys, kv, false); +} + void krmSetPreviouslyEmptyRange(Transaction* tr, const KeyRef& mapPrefix, const KeyRangeRef& keys, @@ -186,7 +246,7 @@ static Future krmSetRangeCoalescing_(Transaction* tr, // Determine how far to extend this range at the beginning auto beginRange = keys[0].get(); bool hasBegin = beginRange.size() > 0 && beginRange[0].key.startsWith(mapPrefix); - Value beginValue = hasBegin ? beginRange[0].value : LiteralStringRef(""); + Value beginValue = hasBegin ? beginRange[0].value : ""_sr; state Key beginKey = withPrefix.begin; if (beginValue == value) { @@ -199,7 +259,7 @@ static Future krmSetRangeCoalescing_(Transaction* tr, bool hasEnd = endRange.size() >= 1 && endRange[0].key.startsWith(mapPrefix) && endRange[0].key <= withPrefix.end; bool hasNext = (endRange.size() == 2 && endRange[1].key.startsWith(mapPrefix)) || (endRange.size() == 1 && withPrefix.end < endRange[0].key && endRange[0].key.startsWith(mapPrefix)); - Value existingValue = hasEnd ? endRange[0].value : LiteralStringRef(""); + Value existingValue = hasEnd ? endRange[0].value : ""_sr; bool valueMatches = value == existingValue; KeyRange conflictRange = KeyRangeRef(hasBegin ? 
beginRange[0].key : mapPrefix, withPrefix.begin); @@ -254,3 +314,107 @@ Future krmSetRangeCoalescing(Reference const& t Value const& value) { return holdWhile(tr, krmSetRangeCoalescing_(tr.getPtr(), mapPrefix, range, maxRange, value)); } + +TEST_CASE("/keyrangemap/decoderange/aligned") { + Arena arena; + Key prefix = "/prefix/"_sr; + StringRef fullKeyA = StringRef(arena, "/prefix/a"_sr); + StringRef fullKeyB = StringRef(arena, "/prefix/b"_sr); + StringRef fullKeyC = StringRef(arena, "/prefix/c"_sr); + StringRef fullKeyD = StringRef(arena, "/prefix/d"_sr); + + StringRef keyA = StringRef(arena, "a"_sr); + StringRef keyB = StringRef(arena, "b"_sr); + StringRef keyC = StringRef(arena, "c"_sr); + StringRef keyD = StringRef(arena, "d"_sr); + StringRef keyE = StringRef(arena, "e"_sr); + StringRef keyAB = StringRef(arena, "ab"_sr); + StringRef keyAC = StringRef(arena, "ac"_sr); + StringRef keyCD = StringRef(arena, "cd"_sr); + + // Fake getRange() call. + RangeResult kv; + kv.push_back(arena, KeyValueRef(fullKeyA, keyA)); + kv.push_back(arena, KeyValueRef(fullKeyB, keyB)); + + // [A, AB(start), AC(start), B] + RangeResult decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(keyAB, keyAC), kv); + ASSERT(decodedRanges.size() == 2); + ASSERT(decodedRanges.front().key == keyAB); + ASSERT(decodedRanges.front().value == keyA); + ASSERT(decodedRanges.back().key == keyAC); + ASSERT(decodedRanges.back().value == keyA); + + kv.push_back(arena, KeyValueRef(fullKeyC, keyC)); + kv.push_back(arena, KeyValueRef(fullKeyD, keyD)); + + // [A, AB(start), B, C, CD(end), D] + decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(keyAB, keyCD), kv); + ASSERT(decodedRanges.size() == 4); + ASSERT(decodedRanges.front().key == keyAB); + ASSERT(decodedRanges.front().value == keyA); + ASSERT(decodedRanges.back().key == keyCD); + ASSERT(decodedRanges.back().value == keyC); + + // [""(start), A, B, C, D, E(end)] + decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(StringRef(), keyE), kv); + ASSERT(decodedRanges.size() == 6); + ASSERT(decodedRanges.front().key == StringRef()); + ASSERT(decodedRanges.front().value == StringRef()); + ASSERT(decodedRanges.back().key == keyE); + ASSERT(decodedRanges.back().value == keyD); + + return Void(); +} + +TEST_CASE("/keyrangemap/decoderange/unaligned") { + Arena arena; + Key prefix = "/prefix/"_sr; + StringRef fullKeyA = StringRef(arena, "/prefix/a"_sr); + StringRef fullKeyB = StringRef(arena, "/prefix/b"_sr); + StringRef fullKeyC = StringRef(arena, "/prefix/c"_sr); + StringRef fullKeyD = StringRef(arena, "/prefix/d"_sr); + + StringRef keyA = StringRef(arena, "a"_sr); + StringRef keyB = StringRef(arena, "b"_sr); + StringRef keyC = StringRef(arena, "c"_sr); + StringRef keyD = StringRef(arena, "d"_sr); + StringRef keyE = StringRef(arena, "e"_sr); + StringRef keyAB = StringRef(arena, "ab"_sr); + StringRef keyAC = StringRef(arena, "ac"_sr); + StringRef keyCD = StringRef(arena, "cd"_sr); + + // Fake getRange() call. 
+ RangeResult kv; + kv.push_back(arena, KeyValueRef(fullKeyA, keyA)); + kv.push_back(arena, KeyValueRef(fullKeyB, keyB)); + + // [A, AB(start), AC(start), B] + RangeResult decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(keyAB, keyAC), kv, false); + ASSERT(decodedRanges.size() == 2); + ASSERT(decodedRanges.front().key == keyA); + ASSERT(decodedRanges.front().value == keyA); + ASSERT(decodedRanges.back().key == keyB); + ASSERT(decodedRanges.back().value == keyB); + + kv.push_back(arena, KeyValueRef(fullKeyC, keyC)); + kv.push_back(arena, KeyValueRef(fullKeyD, keyD)); + + // [A, AB(start), B, C, CD(end), D] + decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(keyAB, keyCD), kv, false); + ASSERT(decodedRanges.size() == 4); + ASSERT(decodedRanges.front().key == keyA); + ASSERT(decodedRanges.front().value == keyA); + ASSERT(decodedRanges.back().key == keyD); + ASSERT(decodedRanges.back().value == keyD); + + // [""(start), A, B, C, D, E(end)] + decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(StringRef(), keyE), kv, false); + ASSERT(decodedRanges.size() == 6); + ASSERT(decodedRanges.front().key == StringRef()); + ASSERT(decodedRanges.front().value == StringRef()); + ASSERT(decodedRanges.back().key == keyE); + ASSERT(decodedRanges.back().value == keyD); + + return Void(); +} \ No newline at end of file diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 6270cc0b88..7c201e9d97 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -201,6 +201,20 @@ std::map configForToken(std::string const& mode) { } out[p + key] = format("%d", tenantMode); } + + if (key == "encryption_at_rest_mode") { + EncryptionAtRestMode mode; + if (value == "disabled") { + mode = EncryptionAtRestMode::DISABLED; + } else if (value == "aes_256_ctr") { + mode = EncryptionAtRestMode::AES_256_CTR; + } else { + printf("Error: Only disabled|aes_256_ctr are valid for encryption_at_rest_mode.\n"); + return out; + } + out[p + key] = format("%d", mode); + } + return out; } @@ -830,7 +844,7 @@ ACTOR Future> getClusterConnectionStringFromSt // equal to one of the previously issued requests, there is a bug // and we are breaking the promises we make with // commit_unknown_result (the transaction must no longer be in - // progress when receiving this error). + // progress when receiving commit_unknown_result). int n = connectionStrings.size() > 0 ? connectionStrings.size() - 1 : 0; // avoid underflow for (int i = 0; i < n; ++i) { ASSERT(currentKey.get() != connectionStrings.at(i)); @@ -858,12 +872,59 @@ ACTOR Future> getClusterConnectionStringFromSt } } +ACTOR Future verifyConfigurationDatabaseAlive(Database cx) { + state Backoff backoff; + state Reference configTr; + loop { + try { + // Attempt to read a random value from the configuration + // database to make sure it is online. 
+ configTr = ISingleThreadTransaction::create(ISingleThreadTransaction::Type::PAXOS_CONFIG, cx); + Tuple tuple; + tuple.appendNull(); // config class + tuple << "test"_sr; + Optional serializedValue = wait(configTr->get(tuple.pack())); + TraceEvent("ChangeQuorumCheckerNewCoordinatorsOnline").log(); + return Void(); + } catch (Error& e) { + TraceEvent("ChangeQuorumCheckerNewCoordinatorsError").error(e); + if (e.code() == error_code_coordinators_changed) { + wait(backoff.onError()); + configTr->reset(); + } else { + wait(configTr->onError(e)); + } + } + } +} + +ACTOR Future resetPreviousCoordinatorsKey(Database cx) { + loop { + // When the change coordinators transaction succeeds, it uses the + // special key space error message to return a message to the client. + // This causes the underlying transaction to not be committed. In order + // to make sure we clear the previous coordinators key, we have to use + // a new transaction here. + state Reference clearTr = + ISingleThreadTransaction::create(ISingleThreadTransaction::Type::RYW, cx); + try { + clearTr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + clearTr->clear(previousCoordinatorsKey); + wait(clearTr->commit()); + return Void(); + } catch (Error& e2) { + wait(clearTr->onError(e2)); + } + } +} + } // namespace ACTOR Future> changeQuorumChecker(Transaction* tr, ClusterConnectionString* conn, - std::string newName) { - + std::string newName, + bool disableConfigDB) { + TraceEvent("ChangeQuorumCheckerStart").detail("NewConnectionString", conn->toString()); state Optional clusterConnectionStringOptional = wait(getClusterConnectionStringFromStorageServer(tr)); @@ -878,7 +939,7 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, conn->hostnames = old.hostnames; conn->coords = old.coords; } - std::vector desiredCoordinators = wait(conn->tryResolveHostnames()); + state std::vector desiredCoordinators = wait(conn->tryResolveHostnames()); if (desiredCoordinators.size() != conn->hostnames.size() + conn->coords.size()) { TraceEvent("ChangeQuorumCheckerEarlyTermination") .detail("Reason", "One or more hostnames are unresolvable") @@ -895,6 +956,13 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, std::sort(old.coords.begin(), old.coords.end()); if (conn->hostnames == old.hostnames && conn->coords == old.coords && old.clusterKeyName() == newName) { connectionStrings.clear(); + if (g_network->isSimulated() && g_simulator->configDBType == ConfigDBType::DISABLED) { + disableConfigDB = true; + } + if (!disableConfigDB) { + wait(verifyConfigurationDatabaseAlive(tr->getDatabase())); + } + wait(resetPreviousCoordinatorsKey(tr->getDatabase())); return CoordinatorsResult::SAME_NETWORK_ADDRESSES; } @@ -905,7 +973,7 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, int i = 0; int protectedCount = 0; while ((protectedCount < ((desiredCoordinators.size() / 2) + 1)) && (i < desiredCoordinators.size())) { - auto process = g_simulator.getProcessByAddress(desiredCoordinators[i]); + auto process = g_simulator->getProcessByAddress(desiredCoordinators[i]); auto addresses = process->addresses; if (!process->isReliable()) { @@ -913,9 +981,9 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, continue; } - g_simulator.protectedAddresses.insert(process->addresses.address); + g_simulator->protectedAddresses.insert(process->addresses.address); if (addresses.secondaryAddress.present()) { - g_simulator.protectedAddresses.insert(process->addresses.secondaryAddress.get()); + g_simulator->protectedAddresses.insert(process->addresses.secondaryAddress.get()); 
} TraceEvent("ProtectCoordinator").detail("Address", desiredCoordinators[i]).backtrace(); protectedCount++; @@ -944,6 +1012,9 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, when(wait(waitForAll(leaderServers))) {} when(wait(delay(5.0))) { return CoordinatorsResult::COORDINATOR_UNREACHABLE; } } + TraceEvent("ChangeQuorumCheckerSetCoordinatorsKey") + .detail("CurrentCoordinators", old.toString()) + .detail("NewCoordinators", conn->toString()); tr->set(coordinatorsKey, conn->toString()); return Optional(); } @@ -1006,12 +1077,12 @@ ACTOR Future changeQuorum(Database cx, ReferenceisSimulated()) { for (int i = 0; i < (desiredCoordinators.size() / 2) + 1; i++) { - auto process = g_simulator.getProcessByAddress(desiredCoordinators[i]); + auto process = g_simulator->getProcessByAddress(desiredCoordinators[i]); ASSERT(process->isReliable() || process->rebooting); - g_simulator.protectedAddresses.insert(process->addresses.address); + g_simulator->protectedAddresses.insert(process->addresses.address); if (process->addresses.secondaryAddress.present()) { - g_simulator.protectedAddresses.insert(process->addresses.secondaryAddress.get()); + g_simulator->protectedAddresses.insert(process->addresses.secondaryAddress.get()); } TraceEvent("ProtectCoordinator").detail("Address", desiredCoordinators[i]).backtrace(); } @@ -1085,10 +1156,8 @@ struct AutoQuorumChange final : IQuorumChange { } ACTOR static Future getRedundancy(AutoQuorumChange* self, Transaction* tr) { - state Future> fStorageReplicas = - tr->get(LiteralStringRef("storage_replicas").withPrefix(configKeysPrefix)); - state Future> fLogReplicas = - tr->get(LiteralStringRef("log_replicas").withPrefix(configKeysPrefix)); + state Future> fStorageReplicas = tr->get("storage_replicas"_sr.withPrefix(configKeysPrefix)); + state Future> fLogReplicas = tr->get("log_replicas"_sr.withPrefix(configKeysPrefix)); wait(success(fStorageReplicas) && success(fLogReplicas)); int redundancy = std::min(atoi(fStorageReplicas.get().get().toString().c_str()), atoi(fLogReplicas.get().get().toString().c_str())); @@ -1250,10 +1319,7 @@ struct AutoQuorumChange final : IQuorumChange { std::map> currentCounts; std::map hardLimits; - std::vector fields({ LiteralStringRef("dcid"), - LiteralStringRef("data_hall"), - LiteralStringRef("zoneid"), - LiteralStringRef("machineid") }); + std::vector fields({ "dcid"_sr, "data_hall"_sr, "zoneid"_sr, "machineid"_sr }); for (auto field = fields.begin(); field != fields.end(); field++) { if (field->toString() == "zoneid") { @@ -1270,7 +1336,7 @@ struct AutoQuorumChange final : IQuorumChange { continue; } // Exclude faulty node due to machine assassination - if (g_network->isSimulated() && !g_simulator.getProcessByAddress(worker->address)->isReliable()) { + if (g_network->isSimulated() && !g_simulator->getProcessByAddress(worker->address)->isReliable()) { TraceEvent("AutoSelectCoordinators").detail("SkipUnreliableWorker", worker->address.toString()); continue; } @@ -1279,7 +1345,7 @@ struct AutoQuorumChange final : IQuorumChange { if (maxCounts[*field] == 0) { maxCounts[*field] = 1; } - auto value = worker->locality.get(*field).orDefault(LiteralStringRef("")); + auto value = worker->locality.get(*field).orDefault(""_sr); auto currentCount = currentCounts[*field][value]; if (currentCount >= maxCounts[*field]) { valid = false; @@ -1288,7 +1354,7 @@ struct AutoQuorumChange final : IQuorumChange { } if (valid) { for (auto field = fields.begin(); field != fields.end(); field++) { - auto value = 
worker->locality.get(*field).orDefault(LiteralStringRef("")); + auto value = worker->locality.get(*field).orDefault(""_sr); currentCounts[*field][value] += 1; } chosen.push_back(worker->address); @@ -1341,6 +1407,7 @@ ACTOR Future excludeServers(Database cx, std::vector ser state ReadYourWritesTransaction ryw(cx); loop { try { + ryw.setOption(FDBTransactionOptions::RAW_ACCESS); ryw.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); ryw.set( SpecialKeySpace::getManagementApiCommandOptionSpecialKey(failed ? "failed" : "excluded", "force"), @@ -1403,6 +1470,7 @@ ACTOR Future excludeLocalities(Database cx, std::unordered_set includeServers(Database cx, std::vector ser state ReadYourWritesTransaction ryw(cx); loop { try { + ryw.setOption(FDBTransactionOptions::RAW_ACCESS); ryw.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); for (auto& s : servers) { if (!s.isValid()) { @@ -1467,8 +1536,7 @@ ACTOR Future includeServers(Database cx, std::vector ser // This is why we now make two clears: first only of the ip // address, the second will delete all ports. if (s.isWholeMachine()) - ryw.clear(KeyRangeRef(addr.withSuffix(LiteralStringRef(":")), - addr.withSuffix(LiteralStringRef(";")))); + ryw.clear(KeyRangeRef(addr.withSuffix(":"_sr), addr.withSuffix(";"_sr))); } } TraceEvent("IncludeServersCommit").detail("Servers", describe(servers)).detail("Failed", failed); @@ -1548,6 +1616,7 @@ ACTOR Future includeLocalities(Database cx, std::vector local state ReadYourWritesTransaction ryw(cx); loop { try { + ryw.setOption(FDBTransactionOptions::RAW_ACCESS); ryw.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); if (includeAll) { if (failed) { @@ -2047,9 +2116,7 @@ ACTOR Future lockDatabase(Transaction* tr, UID id) { } tr->atomicOp(databaseLockedKey, - BinaryWriter::toValue(id, Unversioned()) - .withPrefix(LiteralStringRef("0123456789")) - .withSuffix(LiteralStringRef("\x00\x00\x00\x00")), + BinaryWriter::toValue(id, Unversioned()).withPrefix("0123456789"_sr).withSuffix("\x00\x00\x00\x00"_sr), MutationRef::SetVersionstampedValue); tr->addWriteConflictRange(normalKeys); return Void(); @@ -2070,9 +2137,7 @@ ACTOR Future lockDatabase(Reference tr, UID id) } tr->atomicOp(databaseLockedKey, - BinaryWriter::toValue(id, Unversioned()) - .withPrefix(LiteralStringRef("0123456789")) - .withSuffix(LiteralStringRef("\x00\x00\x00\x00")), + BinaryWriter::toValue(id, Unversioned()).withPrefix("0123456789"_sr).withSuffix("\x00\x00\x00\x00"_sr), MutationRef::SetVersionstampedValue); tr->addWriteConflictRange(normalKeys); return Void(); @@ -2191,7 +2256,7 @@ ACTOR Future updateChangeFeed(Transaction* tr, Key rangeID, ChangeFeedStat } else if (status == ChangeFeedStatus::CHANGE_FEED_DESTROY) { if (val.present()) { if (g_network->isSimulated()) { - g_simulator.validationData.allDestroyedChangeFeedIDs.insert(rangeID.toString()); + g_simulator->validationData.allDestroyedChangeFeedIDs.insert(rangeID.toString()); } tr->set(rangeIDKey, changeFeedValue(std::get<0>(decodeChangeFeedValue(val.get())), @@ -2229,7 +2294,7 @@ ACTOR Future updateChangeFeed(Reference tr, } else if (status == ChangeFeedStatus::CHANGE_FEED_DESTROY) { if (val.present()) { if (g_network->isSimulated()) { - g_simulator.validationData.allDestroyedChangeFeedIDs.insert(rangeID.toString()); + g_simulator->validationData.allDestroyedChangeFeedIDs.insert(rangeID.toString()); } tr->set(rangeIDKey, changeFeedValue(std::get<0>(decodeChangeFeedValue(val.get())), @@ -2542,24 +2607,24 @@ 
TEST_CASE("/ManagementAPI/AutoQuorumChange/checkLocality") { auto dataHall = dataCenter + std::to_string(i / 2 % 2); auto rack = dataHall + std::to_string(i % 2); auto machineId = rack + std::to_string(i); - data.locality.set(LiteralStringRef("dcid"), StringRef(dataCenter)); - data.locality.set(LiteralStringRef("data_hall"), StringRef(dataHall)); - data.locality.set(LiteralStringRef("rack"), StringRef(rack)); - data.locality.set(LiteralStringRef("zoneid"), StringRef(rack)); - data.locality.set(LiteralStringRef("machineid"), StringRef(machineId)); + data.locality.set("dcid"_sr, StringRef(dataCenter)); + data.locality.set("data_hall"_sr, StringRef(dataHall)); + data.locality.set("rack"_sr, StringRef(rack)); + data.locality.set("zoneid"_sr, StringRef(rack)); + data.locality.set("machineid"_sr, StringRef(machineId)); data.address.ip = IPAddress(i); if (g_network->isSimulated()) { - g_simulator.newProcess("TestCoordinator", - data.address.ip, - data.address.port, - false, - 1, - data.locality, - ProcessClass(ProcessClass::CoordinatorClass, ProcessClass::CommandLineSource), - "", - "", - currentProtocolVersion()); + g_simulator->newProcess("TestCoordinator", + data.address.ip, + data.address.port, + false, + 1, + data.locality, + ProcessClass(ProcessClass::CoordinatorClass, ProcessClass::CommandLineSource), + "", + "", + currentProtocolVersion()); } workers.push_back(data); @@ -2572,10 +2637,7 @@ TEST_CASE("/ManagementAPI/AutoQuorumChange/checkLocality") { std::map> chosenValues; ASSERT(chosen.size() == 5); - std::vector fields({ LiteralStringRef("dcid"), - LiteralStringRef("data_hall"), - LiteralStringRef("zoneid"), - LiteralStringRef("machineid") }); + std::vector fields({ "dcid"_sr, "data_hall"_sr, "zoneid"_sr, "machineid"_sr }); for (auto worker = chosen.begin(); worker != chosen.end(); worker++) { ASSERT(worker->ip.toV4() < workers.size()); LocalityData data = workers[worker->ip.toV4()].locality; @@ -2584,10 +2646,10 @@ TEST_CASE("/ManagementAPI/AutoQuorumChange/checkLocality") { } } - ASSERT(chosenValues[LiteralStringRef("dcid")].size() == 2); - ASSERT(chosenValues[LiteralStringRef("data_hall")].size() == 4); - ASSERT(chosenValues[LiteralStringRef("zoneid")].size() == 5); - ASSERT(chosenValues[LiteralStringRef("machineid")].size() == 5); + ASSERT(chosenValues["dcid"_sr].size() == 2); + ASSERT(chosenValues["data_hall"_sr].size() == 4); + ASSERT(chosenValues["zoneid"_sr].size() == 5); + ASSERT(chosenValues["machineid"_sr].size() == 5); ASSERT(std::find(chosen.begin(), chosen.end(), workers[noAssignIndex].address) != chosen.end()); return Void(); diff --git a/fdbclient/Metacluster.cpp b/fdbclient/Metacluster.cpp index 6463033db8..993b70fa70 100644 --- a/fdbclient/Metacluster.cpp +++ b/fdbclient/Metacluster.cpp @@ -24,6 +24,19 @@ FDB_DEFINE_BOOLEAN_PARAM(AddNewTenants); FDB_DEFINE_BOOLEAN_PARAM(RemoveMissingTenants); +std::string clusterTypeToString(const ClusterType& clusterType) { + switch (clusterType) { + case ClusterType::STANDALONE: + return "standalone"; + case ClusterType::METACLUSTER_MANAGEMENT: + return "metacluster_management"; + case ClusterType::METACLUSTER_DATA: + return "metacluster_data"; + default: + return "unknown"; + } +} + std::string DataClusterEntry::clusterStateToString(DataClusterState clusterState) { switch (clusterState) { case DataClusterState::READY: diff --git a/fdbclient/MetaclusterManagement.actor.cpp b/fdbclient/MetaclusterManagement.actor.cpp index 33403300bd..d354ac77d1 100644 --- a/fdbclient/MetaclusterManagement.actor.cpp +++ 
b/fdbclient/MetaclusterManagement.actor.cpp @@ -27,6 +27,17 @@ namespace MetaclusterAPI { +std::pair metaclusterCapacity(std::map const& clusters) { + ClusterUsage tenantGroupCapacity; + ClusterUsage tenantGroupsAllocated; + for (auto cluster : clusters) { + tenantGroupCapacity.numTenantGroups += + std::max(cluster.second.entry.capacity.numTenantGroups, cluster.second.entry.allocated.numTenantGroups); + tenantGroupsAllocated.numTenantGroups += cluster.second.entry.allocated.numTenantGroups; + } + return { tenantGroupCapacity, tenantGroupsAllocated }; +} + ACTOR Future> openDatabase(ClusterConnectionString connectionString) { if (g_network->isSimulated()) { Reference clusterFile = diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 977d465908..6c34369e6a 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -248,7 +248,7 @@ TEST_CASE("/fdbclient/MonitorLeader/ConnectionString/hostname") { hostnames.push_back(Hostname::parse(hn1 + ":" + port1)); hostnames.push_back(Hostname::parse(hn2 + ":" + port2)); - ClusterConnectionString cs(hostnames, LiteralStringRef("TestCluster:0")); + ClusterConnectionString cs(hostnames, "TestCluster:0"_sr); ASSERT(cs.hostnames.size() == 2); ASSERT(cs.coords.size() == 0); ASSERT(cs.toString() == connectionString); @@ -259,7 +259,7 @@ TEST_CASE("/fdbclient/MonitorLeader/ConnectionString/hostname") { hostnames.push_back(Hostname::parse(hn1 + ":" + port1)); hostnames.push_back(Hostname::parse(hn1 + ":" + port1)); try { - ClusterConnectionString cs(hostnames, LiteralStringRef("TestCluster:0")); + ClusterConnectionString cs(hostnames, "TestCluster:0"_sr); } catch (Error& e) { ASSERT(e.code() == error_code_connection_string_invalid); } @@ -367,7 +367,7 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/fuzz") { auto c = connectionString.begin(); while (c != connectionString.end()) { if (deterministicRandom()->random01() < 0.1) // Add whitespace character - output += deterministicRandom()->randomChoice(LiteralStringRef(" \t\n\r")); + output += deterministicRandom()->randomChoice(" \t\n\r"_sr); if (deterministicRandom()->random01() < 0.5) { // Add one of the input characters output += *c; ++c; @@ -376,9 +376,9 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/fuzz") { output += "#"; int charCount = deterministicRandom()->randomInt(0, 20); for (int i = 0; i < charCount; i++) { - output += deterministicRandom()->randomChoice(LiteralStringRef("asdfzxcv123345:!@#$#$&()<\"\' \t")); + output += deterministicRandom()->randomChoice("asdfzxcv123345:!@#$#$&()<\"\' \t"_sr); } - output += deterministicRandom()->randomChoice(LiteralStringRef("\n\r")); + output += deterministicRandom()->randomChoice("\n\r"_sr); } } @@ -501,6 +501,7 @@ ACTOR Future monitorNominee(Key key, Optional* info) { loop { state Optional li; + wait(Future(Void())); // Make sure we weren't cancelled if (coord.hostname.present()) { wait(store(li, retryGetReplyFromHostname(GetLeaderRequest(key, info->present() ? 
info->get().changeID : UID()), @@ -861,6 +862,7 @@ ACTOR Future monitorProxiesOneGeneration( for (const auto& c : cs.coords) { clientLeaderServers.push_back(ClientLeaderRegInterface(c)); } + ASSERT(clientLeaderServers.size() > 0); deterministicRandom()->randomShuffle(clientLeaderServers); @@ -880,7 +882,7 @@ ACTOR Future monitorProxiesOneGeneration( bool upToDate = wait(connRecord->upToDate(storedConnectionString)); if (upToDate) { incorrectTime = Optional(); - } else if (allConnectionsFailed) { + } else if (allConnectionsFailed && storedConnectionString.getNumberOfCoordinators() > 0) { // Failed to connect to all coordinators from the current connection string, // so it is not possible to get any new updates from the cluster. It can be that // all the coordinators have changed, but the client missed that, because it had @@ -894,7 +896,7 @@ ACTOR Future monitorProxiesOneGeneration( info.intermediateConnRecord = connRecord; return info; } else { - req.issues.push_back_deep(req.issues.arena(), LiteralStringRef("incorrect_cluster_file_contents")); + req.issues.push_back_deep(req.issues.arena(), "incorrect_cluster_file_contents"_sr); std::string connectionString = connRecord->getConnectionString().toString(); if (!incorrectTime.present()) { incorrectTime = now(); @@ -938,6 +940,7 @@ ACTOR Future monitorProxiesOneGeneration( .detail("OldConnStr", info.intermediateConnRecord->getConnectionString().toString()); info.intermediateConnRecord = connRecord->makeIntermediateRecord( ClusterConnectionString(rep.get().read().forward.get().toString())); + ASSERT(info.intermediateConnRecord->getConnectionString().getNumberOfCoordinators() > 0); return info; } if (connRecord != info.intermediateConnRecord) { @@ -963,7 +966,6 @@ ACTOR Future monitorProxiesOneGeneration( } else { CODE_PROBE(rep.getError().code() == error_code_failed_to_progress, "Coordinator cant talk to cluster controller"); - CODE_PROBE(rep.getError().code() == error_code_lookup_failed, "Coordinator hostname resolving failure"); TraceEvent("MonitorProxiesConnectFailed") .detail("Error", rep.getError().name()) .detail("Coordinator", clientLeaderServer.getAddressString()); @@ -984,6 +986,7 @@ ACTOR Future monitorProxies( Key traceLogGroup) { state MonitorLeaderInfo info(connRecord->get()); loop { + ASSERT(connRecord->get().isValid()); choose { when(MonitorLeaderInfo _info = wait(monitorProxiesOneGeneration( connRecord->get(), clientInfo, coordinator, info, supportedVersions, traceLogGroup))) { diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index 1d88e37b1a..401246439d 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -280,10 +280,46 @@ ThreadResult DLTransaction::readBlobGranules(const KeyRangeRef& key Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) { - if (!api->transactionReadBlobGranules) { + return unsupported_operation(); +} + +ThreadFuture>> DLTransaction::readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) { + if (!api->transactionReadBlobGranulesStart) { return unsupported_operation(); } + int64_t rv = readVersion.present() ? 
readVersion.get() : latestVersion; + + FdbCApi::FDBFuture* f = api->transactionReadBlobGranulesStart(tr, + keyRange.begin.begin(), + keyRange.begin.size(), + keyRange.end.begin(), + keyRange.end.size(), + beginVersion, + rv, + readVersionOut); + + return ThreadFuture>>( + (ThreadSingleAssignmentVar>>*)(f)); +}; + +ThreadResult DLTransaction::readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) { + if (!api->transactionReadBlobGranulesFinish) { + return unsupported_operation(); + } + + // convert back to fdb future for API + FdbCApi::FDBFuture* f = (FdbCApi::FDBFuture*)(startFuture.extractPtr()); + // FIXME: better way to convert here? FdbCApi::FDBReadBlobGranuleContext context; context.userContext = granuleContext.userContext; @@ -293,17 +329,40 @@ ThreadResult DLTransaction::readBlobGranules(const KeyRangeRef& key context.debugNoMaterialize = granuleContext.debugNoMaterialize; context.granuleParallelism = granuleContext.granuleParallelism; - int64_t rv = readVersion.present() ? readVersion.get() : latestVersion; + FdbCApi::FDBResult* r = api->transactionReadBlobGranulesFinish(tr, + f, + keyRange.begin.begin(), + keyRange.begin.size(), + keyRange.end.begin(), + keyRange.end.size(), + beginVersion, + readVersion, + &context); - FdbCApi::FDBResult* r = api->transactionReadBlobGranules(tr, - keyRange.begin.begin(), - keyRange.begin.size(), - keyRange.end.begin(), - keyRange.end.size(), - beginVersion, - rv, - context); return ThreadResult((ThreadSingleAssignmentVar*)(r)); +}; + +ThreadFuture>> +DLTransaction::summarizeBlobGranules(const KeyRangeRef& keyRange, Optional summaryVersion, int rangeLimit) { + if (!api->transactionSummarizeBlobGranules) { + return unsupported_operation(); + } + + int64_t sv = summaryVersion.present() ? summaryVersion.get() : latestVersion; + + FdbCApi::FDBFuture* f = api->transactionSummarizeBlobGranules( + tr, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), sv, rangeLimit); + + return toThreadFuture>>( + api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + const FdbCApi::FDBGranuleSummary* summaries; + int summariesLength; + FdbCApi::fdb_error_t error = api->futureGetGranuleSummaryArray(f, &summaries, &summariesLength); + ASSERT(!error); + // The memory for this is stored in the FDBFuture and is released when the future gets destroyed + return Standalone>( + VectorRef((BlobGranuleSummaryRef*)summaries, summariesLength), Arena()); + }); } void DLTransaction::addReadConflictRange(const KeyRangeRef& keys) { @@ -593,7 +652,7 @@ ThreadFuture DLDatabase::blobbifyRange(const KeyRangeRef& keyRange) { db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size()); return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { - bool ret = false; + FdbCApi::fdb_bool_t ret = false; ASSERT(!api->futureGetBool(f, &ret)); return ret; }); @@ -608,7 +667,7 @@ ThreadFuture DLDatabase::unblobbifyRange(const KeyRangeRef& keyRange) { db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size()); return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { - bool ret = false; + FdbCApi::fdb_bool_t ret = false; ASSERT(!api->futureGetBool(f, &ret)); return ret; }); @@ -639,8 +698,10 @@ ThreadFuture DLDatabase::verifyBlobRange(const KeyRangeRef& keyRange, O return unsupported_operation(); } + Version readVersion = version.present() ? 
version.get() : latestVersion; + FdbCApi::FDBFuture* f = api->databaseVerifyBlobRange( - db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), version); + db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), readVersion); return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { Version version = invalidVersion; @@ -692,8 +753,11 @@ void DLApi::init() { loadClientFunction(&api->selectApiVersion, lib, fdbCPath, "fdb_select_api_version_impl", headerVersion >= 0); loadClientFunction(&api->getClientVersion, lib, fdbCPath, "fdb_get_client_version", headerVersion >= 410); - loadClientFunction( - &api->useFutureProtocolVersion, lib, fdbCPath, "fdb_use_future_protocol_version", headerVersion >= 720); + loadClientFunction(&api->useFutureProtocolVersion, + lib, + fdbCPath, + "fdb_use_future_protocol_version", + headerVersion >= ApiVersion::withFutureProtocolVersionApi().version()); loadClientFunction(&api->setNetworkOption, lib, fdbCPath, "fdb_network_set_option", headerVersion >= 0); loadClientFunction(&api->setupNetwork, lib, fdbCPath, "fdb_setup_network", headerVersion >= 0); loadClientFunction(&api->runNetwork, lib, fdbCPath, "fdb_run_network", headerVersion >= 0); @@ -703,7 +767,7 @@ void DLApi::init() { lib, fdbCPath, "fdb_create_database_from_connection_string", - headerVersion >= 720); + headerVersion >= ApiVersion::withCreateDBFromConnString().version()); loadClientFunction(&api->databaseOpenTenant, lib, fdbCPath, "fdb_database_open_tenant", headerVersion >= 710); loadClientFunction( @@ -736,23 +800,39 @@ void DLApi::init() { fdbCPath, "fdb_database_wait_purge_granules_complete", headerVersion >= 710); - loadClientFunction(&api->databaseBlobbifyRange, lib, fdbCPath, "fdb_database_blobbify_range", headerVersion >= 720); - loadClientFunction( - &api->databaseUnblobbifyRange, lib, fdbCPath, "fdb_database_unblobbify_range", headerVersion >= 720); - loadClientFunction( - &api->databaseListBlobbifiedRanges, lib, fdbCPath, "fdb_database_list_blobbified_ranges", headerVersion >= 720); - loadClientFunction( - &api->databaseVerifyBlobRange, lib, fdbCPath, "fdb_database_verify_blob_range", headerVersion >= 720); + loadClientFunction(&api->databaseBlobbifyRange, + lib, + fdbCPath, + "fdb_database_blobbify_range", + headerVersion >= ApiVersion::withBlobRangeApi().version()); + loadClientFunction(&api->databaseUnblobbifyRange, + lib, + fdbCPath, + "fdb_database_unblobbify_range", + headerVersion >= ApiVersion::withBlobRangeApi().version()); + loadClientFunction(&api->databaseListBlobbifiedRanges, + lib, + fdbCPath, + "fdb_database_list_blobbified_ranges", + headerVersion >= ApiVersion::withBlobRangeApi().version()); + loadClientFunction(&api->databaseVerifyBlobRange, + lib, + fdbCPath, + "fdb_database_verify_blob_range", + headerVersion >= ApiVersion::withBlobRangeApi().version()); loadClientFunction( &api->tenantCreateTransaction, lib, fdbCPath, "fdb_tenant_create_transaction", headerVersion >= 710); - loadClientFunction( - &api->tenantPurgeBlobGranules, lib, fdbCPath, "fdb_tenant_purge_blob_granules", headerVersion >= 720); + loadClientFunction(&api->tenantPurgeBlobGranules, + lib, + fdbCPath, + "fdb_tenant_purge_blob_granules", + headerVersion >= ApiVersion::withBlobRangeApi().version()); loadClientFunction(&api->tenantWaitPurgeGranulesComplete, lib, fdbCPath, "fdb_tenant_wait_purge_granules_complete", - headerVersion >= 720); + headerVersion >= ApiVersion::withBlobRangeApi().version()); 
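// Entry points that only exist in newer client libraries are loaded conditionally:
// when the loaded header predates the feature gate (e.g. ApiVersion::withBlobRangeApi()),
// the function pointer is left unset and the corresponding DL* wrapper returns
// unsupported_operation() instead of calling through.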
loadClientFunction(&api->tenantDestroy, lib, fdbCPath, "fdb_tenant_destroy", headerVersion >= 710); loadClientFunction(&api->transactionSetOption, lib, fdbCPath, "fdb_transaction_set_option", headerVersion >= 0); @@ -812,12 +892,31 @@ void DLApi::init() { headerVersion >= 710); loadClientFunction( &api->transactionReadBlobGranules, lib, fdbCPath, "fdb_transaction_read_blob_granules", headerVersion >= 710); + loadClientFunction(&api->transactionReadBlobGranulesStart, + lib, + fdbCPath, + "fdb_transaction_read_blob_granules_start", + headerVersion >= ApiVersion::withBlobRangeApi().version()); + loadClientFunction(&api->transactionReadBlobGranulesFinish, + lib, + fdbCPath, + "fdb_transaction_read_blob_granules_finish", + headerVersion >= ApiVersion::withBlobRangeApi().version()); + loadClientFunction(&api->transactionSummarizeBlobGranules, + lib, + fdbCPath, + "fdb_transaction_summarize_blob_granules", + headerVersion >= ApiVersion::withBlobRangeApi().version()); loadClientFunction(&api->futureGetInt64, lib, fdbCPath, headerVersion >= 620 ? "fdb_future_get_int64" : "fdb_future_get_version", headerVersion >= 0); - loadClientFunction(&api->futureGetBool, lib, fdbCPath, "fdb_future_get_bool", headerVersion >= 720); + loadClientFunction(&api->futureGetBool, + lib, + fdbCPath, + "fdb_future_get_bool", + headerVersion >= ApiVersion::withFutureGetBool().version()); loadClientFunction(&api->futureGetUInt64, lib, fdbCPath, "fdb_future_get_uint64", headerVersion >= 700); loadClientFunction(&api->futureGetError, lib, fdbCPath, "fdb_future_get_error", headerVersion >= 0); loadClientFunction(&api->futureGetKey, lib, fdbCPath, "fdb_future_get_key", headerVersion >= 0); @@ -830,6 +929,11 @@ void DLApi::init() { &api->futureGetKeyValueArray, lib, fdbCPath, "fdb_future_get_keyvalue_array", headerVersion >= 0); loadClientFunction( &api->futureGetMappedKeyValueArray, lib, fdbCPath, "fdb_future_get_mappedkeyvalue_array", headerVersion >= 710); + loadClientFunction(&api->futureGetGranuleSummaryArray, + lib, + fdbCPath, + "fdb_future_get_granule_summary_array", + headerVersion >= ApiVersion::withBlobRangeApi().version()); loadClientFunction(&api->futureGetSharedState, lib, fdbCPath, "fdb_future_get_shared_state", headerVersion >= 710); loadClientFunction(&api->futureSetCallback, lib, fdbCPath, "fdb_future_set_callback", headerVersion >= 0); loadClientFunction(&api->futureCancel, lib, fdbCPath, "fdb_future_cancel", headerVersion >= 0); @@ -1165,14 +1269,55 @@ ThreadResult MultiVersionTransaction::readBlobGranules(const KeyRan Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) { + // FIXME: prevent from calling this from another main thread? 
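// The read now happens in two phases: readBlobGranulesStart() returns a thread future
// with the granule descriptions, which is blocked on below (propagating errors and the
// debugNoMaterialize path), and readBlobGranulesFinish() then materializes the results
// using the caller-supplied granuleContext.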
auto tr = getTransaction(); if (tr.transaction) { - return tr.transaction->readBlobGranules(keyRange, beginVersion, readVersion, granuleContext); + Version readVersionOut; + auto f = tr.transaction->readBlobGranulesStart(keyRange, beginVersion, readVersion, &readVersionOut); + auto abortableF = abortableFuture(f, tr.onChange); + abortableF.blockUntilReadyCheckOnMainThread(); + if (abortableF.isError()) { + return ThreadResult(abortableF.getError()); + } + if (granuleContext.debugNoMaterialize) { + return ThreadResult(blob_granule_not_materialized()); + } + return tr.transaction->readBlobGranulesFinish( + abortableF, keyRange, beginVersion, readVersionOut, granuleContext); } else { return abortableTimeoutResult(tr.onChange); } } +ThreadFuture>> MultiVersionTransaction::readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) { + // can't call this directly + return ThreadFuture>>(unsupported_operation()); +} + +ThreadResult MultiVersionTransaction::readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) { + // can't call this directly + return ThreadResult(unsupported_operation()); +} + +ThreadFuture>> MultiVersionTransaction::summarizeBlobGranules( + const KeyRangeRef& keyRange, + Optional summaryVersion, + int rangeLimit) { + auto tr = getTransaction(); + auto f = tr.transaction ? tr.transaction->summarizeBlobGranules(keyRange, summaryVersion, rangeLimit) + : makeTimeout>>(); + return abortableFuture(f, tr.onChange); +} + void MultiVersionTransaction::atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) { auto tr = getTransaction(); if (tr.transaction) { @@ -1267,7 +1412,7 @@ void MultiVersionTransaction::setOption(FDBTransactionOptions::Option option, Op throw invalid_option(); } - if (MultiVersionApi::apiVersionAtLeast(610) && itr->second.persistent) { + if (MultiVersionApi::api->getApiVersion().hasPersistentOptions() && itr->second.persistent) { persistentOptions.emplace_back(option, value.castTo>()); } @@ -1316,7 +1461,10 @@ Optional MultiVersionTransaction::getTenant() { // Waits for the specified duration and signals the assignment variable with a timed out error // This will be canceled if a new timeout is set, in which case the tsav will not be signaled. 
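// The wait is now performed in slices of at most CLIENT_KNOBS->TRANSACTION_TIMEOUT_DELAY_INTERVAL
// rather than as one delay of the full duration.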
ACTOR Future timeoutImpl(Reference> tsav, double duration) { - wait(delay(duration)); + state double endTime = now() + duration; + while (now() < endTime) { + wait(delayUntil(std::min(endTime + 0.0001, now() + CLIENT_KNOBS->TRANSACTION_TIMEOUT_DELAY_INTERVAL))); + } tsav->trySendError(transaction_timed_out()); return Void(); @@ -1356,14 +1504,17 @@ void MultiVersionTransaction::setTimeout(Optional value) { { // lock scope ThreadSpinLockHolder holder(timeoutLock); - - Reference> tsav = timeoutTsav; - ThreadFuture newTimeout = onMainThread([transactionStartTime, tsav, timeoutDuration]() { - return timeoutImpl(tsav, timeoutDuration - std::max(0.0, now() - transactionStartTime)); - }); - prevTimeout = currentTimeout; - currentTimeout = newTimeout; + + if (timeoutDuration > 0) { + Reference> tsav = timeoutTsav; + ThreadFuture newTimeout = onMainThread([transactionStartTime, tsav, timeoutDuration]() { + return timeoutImpl(tsav, timeoutDuration - std::max(0.0, now() - transactionStartTime)); + }); + currentTimeout = newTimeout; + } else { + currentTimeout = ThreadFuture(); + } } // Cancel the previous timeout now that we have a new one. This means that changing the timeout @@ -1432,6 +1583,9 @@ void MultiVersionTransaction::reset() { MultiVersionTransaction::~MultiVersionTransaction() { timeoutTsav->trySendError(transaction_cancelled()); + if (currentTimeout.isValid()) { + currentTimeout.cancel(); + } } bool MultiVersionTransaction::isValid() { @@ -1785,7 +1939,7 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion .detail("OldProtocolVersion", dbProtocolVersion); // When the protocol version changes, clear the corresponding entry in the shared state map // so it can be re-initialized. Only do so if there was a valid previous protocol version. 
- if (dbProtocolVersion.present() && MultiVersionApi::apiVersionAtLeast(710)) { + if (dbProtocolVersion.present() && MultiVersionApi::api->getApiVersion().hasClusterSharedStateMap()) { MultiVersionApi::api->clearClusterSharedStateMapEntry(clusterId, dbProtocolVersion.get()); } @@ -1814,7 +1968,7 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion return; } - if (client->external && !MultiVersionApi::apiVersionAtLeast(610)) { + if (client->external && !MultiVersionApi::api->getApiVersion().hasInlineUpdateDatabase()) { // Old API versions return a future when creating the database, so we need to wait for it Reference self = Reference::addRef(this); dbReady = mapThreadFuture( @@ -1898,7 +2052,8 @@ void MultiVersionDatabase::DatabaseState::updateDatabase(Reference ne .detail("ConnectionRecord", connectionRecord); } } - if (db.isValid() && dbProtocolVersion.present() && MultiVersionApi::apiVersionAtLeast(710)) { + if (db.isValid() && dbProtocolVersion.present() && + MultiVersionApi::api->getApiVersion().hasClusterSharedStateMap()) { Future updateResult = MultiVersionApi::api->updateClusterSharedStateMap(connectionRecord, dbProtocolVersion.get(), db); sharedStateUpdater = map(errorOr(updateResult), [this](ErrorOr result) { @@ -2018,11 +2173,6 @@ void MultiVersionDatabase::LegacyVersionMonitor::close() { } // MultiVersionApi -bool MultiVersionApi::apiVersionAtLeast(int minVersion) { - ASSERT_NE(MultiVersionApi::api->apiVersion, 0); - return MultiVersionApi::api->apiVersion >= minVersion || MultiVersionApi::api->apiVersion < 0; -} - void MultiVersionApi::runOnExternalClientsAllThreads(std::function)> func, bool runOnFailedClients) { for (int i = 0; i < threadCount; i++) { @@ -2068,17 +2218,18 @@ Reference MultiVersionApi::getLocalClient() { } void MultiVersionApi::selectApiVersion(int apiVersion) { + ApiVersion newApiVersion(apiVersion); if (!localClient) { localClient = makeReference(getLocalClientAPI()); ASSERT(localClient); } - if (this->apiVersion != 0 && this->apiVersion != apiVersion) { + if (this->apiVersion.isValid() && this->apiVersion != newApiVersion) { throw api_version_already_set(); } localClient->api->selectApiVersion(apiVersion); - this->apiVersion = apiVersion; + this->apiVersion = newApiVersion; } const char* MultiVersionApi::getClientVersion() { @@ -2106,7 +2257,7 @@ void validateOption(Optional value, bool canBePresent, bool canBeAbse void MultiVersionApi::disableMultiVersionClientApi() { MutexHolder holder(lock); - if (networkStartSetup || localClientDisabled) { + if (networkStartSetup || localClientDisabled || disableBypass) { throw invalid_option(); } @@ -2313,6 +2464,13 @@ void MultiVersionApi::setNetworkOptionInternal(FDBNetworkOptions::Option option, externalClient = true; bypassMultiClientApi = true; forwardOption = true; + } else if (option == FDBNetworkOptions::DISABLE_CLIENT_BYPASS) { + MutexHolder holder(lock); + ASSERT(!networkStartSetup); + if (bypassMultiClientApi) { + throw invalid_option(); + } + disableBypass = true; } else if (option == FDBNetworkOptions::CLIENT_THREADS_PER_VERSION) { MutexHolder holder(lock); validateOption(value, true, false, false); @@ -2331,6 +2489,18 @@ void MultiVersionApi::setNetworkOptionInternal(FDBNetworkOptions::Option option, } else if (option == FDBNetworkOptions::FUTURE_VERSION_CLIENT_LIBRARY) { validateOption(value, true, false, false); addExternalLibrary(abspath(value.get().toString()), true); + } else if (option == FDBNetworkOptions::TRACE_FILE_IDENTIFIER) { + validateOption(value, true, false, 
true); + traceFileIdentifier = value.get().toString(); + { + MutexHolder holder(lock); + // Forward the option unmodified only to the the local client and let it validate it. + // While for external clients the trace file identifiers are determined in setupNetwork + localClient->api->setNetworkOption(option, value); + } + } else if (option == FDBNetworkOptions::TRACE_SHARE_AMONG_CLIENT_THREADS) { + validateOption(value, false, true); + traceShareBaseNameAmongThreads = true; } else { forwardOption = true; } @@ -2374,9 +2544,13 @@ void MultiVersionApi::setupNetwork() { // Copy external lib for each thread if (externalClients.count(filename) == 0) { externalClients[filename] = {}; - for (const auto& tmp : copyExternalLibraryPerThread(path)) { + auto libCopies = copyExternalLibraryPerThread(path); + for (int idx = 0; idx < libCopies.size(); ++idx) { externalClients[filename].push_back(Reference( - new ClientInfo(new DLApi(tmp.first, tmp.second /*unlink on load*/), path, useFutureVersion))); + new ClientInfo(new DLApi(libCopies[idx].first, libCopies[idx].second /*unlink on load*/), + path, + useFutureVersion, + idx))); } } } @@ -2395,7 +2569,7 @@ void MultiVersionApi::setupNetwork() { networkStartSetup = true; - if (externalClients.empty()) { + if (externalClients.empty() && !disableBypass) { bypassMultiClientApi = true; // SOMEDAY: we won't be able to set this option once it becomes possible to add // clients after setupNetwork is called } @@ -2415,20 +2589,30 @@ void MultiVersionApi::setupNetwork() { if (!bypassMultiClientApi) { runOnExternalClientsAllThreads([this](Reference client) { TraceEvent("InitializingExternalClient").detail("LibraryPath", client->libPath); - client->api->selectApiVersion(apiVersion); + client->api->selectApiVersion(apiVersion.version()); if (client->useFutureVersion) { client->api->useFutureProtocolVersion(); } client->loadVersion(); }); + std::string baseTraceFileId; + if (apiVersion.hasTraceFileIdentifier()) { + // TRACE_FILE_IDENTIFIER option is supported since 6.3 + baseTraceFileId = traceFileIdentifier.empty() ? format("%d", getpid()) : traceFileIdentifier; + } + MutexHolder holder(lock); - runOnExternalClientsAllThreads([this, transportId](Reference client) { + runOnExternalClientsAllThreads([this, transportId, baseTraceFileId](Reference client) { for (auto option : options) { client->api->setNetworkOption(option.first, option.second.castTo()); } client->api->setNetworkOption(FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID, std::to_string(transportId)); - + if (!baseTraceFileId.empty()) { + client->api->setNetworkOption( + FDBNetworkOptions::TRACE_FILE_IDENTIFIER, + traceShareBaseNameAmongThreads ? 
baseTraceFileId : client->getTraceFileIdentifier(baseTraceFileId)); + } client->api->setupNetwork(); }); @@ -2467,21 +2651,17 @@ void MultiVersionApi::runNetwork() { std::vector handles; if (!bypassMultiClientApi) { - for (int threadNum = 0; threadNum < threadCount; threadNum++) { - runOnExternalClients(threadNum, [&handles, threadNum](Reference client) { - if (client->external) { - std::string threadName = format("fdb-%s-%d", client->releaseVersion.c_str(), threadNum); - if (threadName.size() > 15) { - threadName = format("fdb-%s", client->releaseVersion.c_str()); - if (threadName.size() > 15) { - threadName = "fdb-external"; - } - } - handles.push_back( - g_network->startThread(&runNetworkThread, client.getPtr(), 0, threadName.c_str())); + runOnExternalClientsAllThreads([&handles](Reference client) { + ASSERT(client->external); + std::string threadName = format("fdb-%s-%d", client->releaseVersion.c_str(), client->threadIndex); + if (threadName.size() > 15) { + threadName = format("fdb-%s", client->releaseVersion.c_str()); + if (threadName.size() > 15) { + threadName = "fdb-external"; } - }); - } + } + handles.push_back(g_network->startThread(&runNetworkThread, client.getPtr(), 0, threadName.c_str())); + }); } localClient->api->runNetwork(); @@ -2594,9 +2774,9 @@ ACTOR Future updateClusterSharedStateMapImpl(MultiVersionApi* self, ProtocolVersion dbProtocolVersion, Reference db) { // The cluster ID will be the connection record string (either a filename or the connection string itself) - // in API versions before we could read the cluster ID. + // in versions before we could read the cluster ID. state std::string clusterId = connectionRecord.toString(); - if (MultiVersionApi::apiVersionAtLeast(720)) { + if (dbProtocolVersion.hasClusterIdSpecialKey()) { state Reference tr = db->createTransaction(); loop { try { @@ -2770,8 +2950,8 @@ void MultiVersionApi::loadEnvironmentVariableNetworkOptions() { MultiVersionApi::MultiVersionApi() : callbackOnMainThread(true), localClientDisabled(false), networkStartSetup(false), networkSetup(false), - bypassMultiClientApi(false), externalClient(false), apiVersion(0), threadCount(0), tmpDir("/tmp"), - envOptionsLoaded(false) {} + disableBypass(false), bypassMultiClientApi(false), externalClient(false), apiVersion(0), threadCount(0), + tmpDir("/tmp"), traceShareBaseNameAmongThreads(false), envOptionsLoaded(false) {} MultiVersionApi* MultiVersionApi::api = new MultiVersionApi(); @@ -2808,6 +2988,12 @@ bool ClientInfo::canReplace(Reference other) const { return !protocolVersion.isCompatible(other->protocolVersion); } +std::string ClientInfo::getTraceFileIdentifier(const std::string& baseIdentifier) { + std::string versionStr = releaseVersion; + std::replace(versionStr.begin(), versionStr.end(), '.', '_'); + return format("%s_v%st%d", baseIdentifier.c_str(), versionStr.c_str(), threadIndex); +} + // UNIT TESTS TEST_CASE("/fdbclient/multiversionclient/EnvironmentVariableParsing") { auto vals = parseOptionValues("a"); @@ -3093,7 +3279,7 @@ struct AbortableTest { } }; -TEST_CASE("/fdbclient/multiversionclient/AbortableSingleAssignmentVar") { +TEST_CASE("fdbclient/multiversionclient/AbortableSingleAssignmentVar") { state volatile bool done = false; state THREAD_HANDLE thread = g_network->startThread(runSingleAssignmentVarTest, (void*)&done); @@ -3170,7 +3356,7 @@ struct DLTest { } }; -TEST_CASE("/fdbclient/multiversionclient/DLSingleAssignmentVar") { +TEST_CASE("fdbclient/multiversionclient/DLSingleAssignmentVar") { state volatile bool done = false; 
MultiVersionApi::api->callbackOnMainThread = true; @@ -3214,7 +3400,7 @@ struct MapTest { } }; -TEST_CASE("/fdbclient/multiversionclient/MapSingleAssignmentVar") { +TEST_CASE("fdbclient/multiversionclient/MapSingleAssignmentVar") { state volatile bool done = false; state THREAD_HANDLE thread = g_network->startThread(runSingleAssignmentVarTest, (void*)&done); @@ -3253,7 +3439,7 @@ struct FlatMapTest { } }; -TEST_CASE("/fdbclient/multiversionclient/FlatMapSingleAssignmentVar") { +TEST_CASE("fdbclient/multiversionclient/FlatMapSingleAssignmentVar") { state volatile bool done = false; state THREAD_HANDLE thread = g_network->startThread(runSingleAssignmentVarTest, (void*)&done); diff --git a/fdbclient/MutationLogReader.actor.cpp b/fdbclient/MutationLogReader.actor.cpp index 5919fdc66b..d6e3adc8dd 100644 --- a/fdbclient/MutationLogReader.actor.cpp +++ b/fdbclient/MutationLogReader.actor.cpp @@ -67,7 +67,7 @@ ACTOR Future PipelinedReader::getNext_impl(PipelinedReader* self, Database state Transaction tr(cx); state GetRangeLimits limits(GetRangeLimits::ROW_LIMIT_UNLIMITED, - (g_network->isSimulated() && !g_simulator.speedUpSimulation) + (g_network->isSimulated() && !g_simulator->speedUpSimulation) ? CLIENT_KNOBS->BACKUP_SIMULATED_LIMIT_BYTES : CLIENT_KNOBS->BACKUP_GET_RANGE_LIMIT_BYTES); @@ -179,7 +179,7 @@ ACTOR Future> MutationLogReader::getNext_impl(Mutatio namespace { // UNIT TESTS TEST_CASE("/fdbclient/mutationlogreader/VersionKeyRefConversion") { - Key prefix = LiteralStringRef("foos"); + Key prefix = "foos"_sr; ASSERT(keyRefToVersion(versionToKey(0, prefix), prefix.size()) == 0); ASSERT(keyRefToVersion(versionToKey(1, prefix), prefix.size()) == 1); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 089640cc77..85de37081a 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -32,14 +32,14 @@ #include #include "boost/algorithm/string.hpp" -#include "fdbclient/Tenant.h" -#include "fdbrpc/TenantInfo.h" +#include "flow/CodeProbe.h" #include "fmt/format.h" #include "fdbclient/FDBOptions.g.h" #include "fdbclient/FDBTypes.h" #include "fdbrpc/FailureMonitor.h" #include "fdbrpc/MultiInterface.h" +#include "fdbrpc/TenantInfo.h" #include "fdbclient/ActorLineageProfiler.h" #include "fdbclient/AnnotateActor.h" @@ -47,6 +47,7 @@ #include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/ClusterInterface.h" #include "fdbclient/ClusterConnectionFile.h" +#include "fdbclient/ClusterConnectionMemoryRecord.h" #include "fdbclient/CoordinationInterface.h" #include "fdbclient/DatabaseContext.h" #include "fdbclient/GlobalConfig.actor.h" @@ -64,6 +65,7 @@ #include "fdbclient/SpecialKeySpace.actor.h" #include "fdbclient/StorageServerInterface.h" #include "fdbclient/SystemData.h" +#include "fdbclient/Tenant.h" #include "fdbclient/TenantSpecialKeys.actor.h" #include "fdbclient/TransactionLineage.h" #include "fdbclient/versions.h" @@ -105,6 +107,8 @@ #endif #include "flow/actorcompiler.h" // This must be the last #include. 
+FDB_DEFINE_BOOLEAN_PARAM(CacheResult); + extern const char* getSourceVersion(); namespace { @@ -155,8 +159,8 @@ NetworkOptions::NetworkOptions() supportedVersions(new ReferencedObject>>()), runLoopProfilingEnabled(false), primaryClient(true) {} -static const Key CLIENT_LATENCY_INFO_PREFIX = LiteralStringRef("client_latency/"); -static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = LiteralStringRef("client_latency_counter/"); +static const Key CLIENT_LATENCY_INFO_PREFIX = "client_latency/"_sr; +static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = "client_latency_counter/"_sr; void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi) { auto result = tssMapping.find(ssi.id()); @@ -170,14 +174,8 @@ void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageSe tssMetrics[tssi.id()] = metrics; tssMapping[ssi.id()] = tssi; } else { - if (result->second.id() == tssi.id()) { - metrics = tssMetrics[tssi.id()]; - } else { - CODE_PROBE(true, "SS now maps to new TSS! This will probably never happen in practice"); - tssMetrics.erase(result->second.id()); - metrics = makeReference(); - tssMetrics[tssi.id()] = metrics; - } + ASSERT(result->second.id() == tssi.id()); + metrics = tssMetrics[tssi.id()]; result->second = tssi; } @@ -227,14 +225,61 @@ void DatabaseContext::addSSIdTagMapping(const UID& uid, const Tag& tag) { ssidTagMapping[uid] = tag; } +void DatabaseContext::getLatestCommitVersionForSSID(const UID& ssid, Tag& tag, Version& commitVersion) { + // initialization + tag = invalidTag; + commitVersion = invalidVersion; + + auto iter = ssidTagMapping.find(ssid); + if (iter != ssidTagMapping.end()) { + tag = iter->second; + + if (ssVersionVectorCache.hasVersion(tag)) { + commitVersion = ssVersionVectorCache.getVersion(tag); + } + } +} + +void DatabaseContext::getLatestCommitVersion(const StorageServerInterface& ssi, + Version readVersion, + VersionVector& latestCommitVersion) { + latestCommitVersion.clear(); + + if (ssVersionVectorCache.getMaxVersion() == invalidVersion) { + return; + } + + // Error checking (based on the assumption that the read version was not obtained + // from the client's grv cache). + if (readVersion > ssVersionVectorCache.getMaxVersion()) { + TraceEvent(SevError, "ReadVersionExceedsVersionVectorMax") + .detail("ReadVersion", readVersion) + .detail("VersionVector", ssVersionVectorCache.toString()); + if (g_network->isSimulated()) { + ASSERT(false); + } else { + return; // Do not return a stale commit version in production. 
+ } + } + + Tag tag = invalidTag; + Version commitVersion = invalidVersion; + getLatestCommitVersionForSSID(ssi.id(), tag, commitVersion); + + if (tag != invalidTag && commitVersion != invalidVersion && commitVersion < readVersion) { + latestCommitVersion.setVersion(tag, commitVersion); + } +} + void DatabaseContext::getLatestCommitVersions(const Reference& locationInfo, Version readVersion, Reference info, VersionVector& latestCommitVersions) { latestCommitVersions.clear(); - if (info->debugID.present()) { - g_traceBatch.addEvent("TransactionDebug", info->debugID.get().first(), "NativeAPI.getLatestCommitVersions"); + if (info->readOptions.present() && info->readOptions.get().debugID.present()) { + g_traceBatch.addEvent( + "TransactionDebug", info->readOptions.get().debugID.get().first(), "NativeAPI.getLatestCommitVersions"); } if (!info->readVersionObtainedFromGrvProxy) { @@ -258,24 +303,20 @@ void DatabaseContext::getLatestCommitVersions(const Reference& loc std::map> versionMap; // order the versions to be returned for (int i = 0; i < locationInfo->locations()->size(); i++) { - bool updatedVersionMap = false; - Version commitVersion = invalidVersion; Tag tag = invalidTag; - auto iter = ssidTagMapping.find(locationInfo->locations()->getId(i)); - if (iter != ssidTagMapping.end()) { - tag = iter->second; - if (ssVersionVectorCache.hasVersion(tag)) { - commitVersion = ssVersionVectorCache.getVersion(tag); // latest commit version - if (commitVersion < readVersion) { - updatedVersionMap = true; - versionMap[commitVersion].insert(tag); - } - } + Version commitVersion = invalidVersion; // latest commit version + getLatestCommitVersionForSSID(locationInfo->locations()->getId(i), tag, commitVersion); + + bool updatedVersionMap = false; + if (tag != invalidTag && commitVersion != invalidVersion && commitVersion < readVersion) { + updatedVersionMap = true; + versionMap[commitVersion].insert(tag); } - // commitVersion == readVersion is common, do not log. - if (!updatedVersionMap && commitVersion != readVersion) { + + // Do not log if commitVersion >= readVersion. + if (!updatedVersionMap && commitVersion == invalidVersion) { TraceEvent(SevDebug, "CommitVersionNotFoundForSS") - .detail("InSSIDMap", iter != ssidTagMapping.end() ? 1 : 0) + .detail("InSSIDMap", tag != invalidTag ? 1 : 0) .detail("Tag", tag) .detail("CommitVersion", commitVersion) .detail("ReadVersion", readVersion) @@ -603,7 +644,8 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { loop { wait(delay(CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace)); - if (!g_network->isSimulated()) { + bool logTraces = !g_network->isSimulated() || BUGGIFY_WITH_PROB(0.01); + if (logTraces) { TraceEvent ev("TransactionMetrics", cx->dbId); ev.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged) @@ -656,6 +698,19 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { cx->bgLatencies.clear(); cx->bgGranulesPerRequest.clear(); + if (cx->usedAnyChangeFeeds && logTraces) { + TraceEvent feedEv("ChangeFeedClientMetrics", cx->dbId); + + feedEv.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged) + .detail("Cluster", + cx->getConnectionRecord() + ? 
cx->getConnectionRecord()->getConnectionString().clusterKeyName().toString() + : "") + .detail("Internal", cx->internal); + + cx->ccFeed.logToTraceEvent(feedEv); + } + lastLogged = now(); } } @@ -1144,7 +1199,7 @@ ACTOR static Future handleTssMismatches(DatabaseContext* cx) { tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); if (quarantine) { - tr->set(tssQuarantineKeyFor(data.first), LiteralStringRef("")); + tr->set(tssQuarantineKeyFor(data.first), ""_sr); } else { tr->clear(serverTagKeyFor(data.first)); } @@ -1272,7 +1327,7 @@ void DatabaseContext::registerSpecialKeysImpl(SpecialKeySpace::MODULE module, std::unique_ptr&& impl, int deprecatedVersion) { // if deprecated, add the implementation when the api version is less than the deprecated version - if (deprecatedVersion == -1 || apiVersion < deprecatedVersion) { + if (deprecatedVersion == -1 || apiVersion.version() < deprecatedVersion) { specialKeySpace->registerKeyRange(module, type, impl->getKeyRange(), impl.get()); specialKeySpaceModules.push_back(std::move(impl)); } @@ -1295,12 +1350,17 @@ struct SingleSpecialKeyImpl : SpecialKeyRangeReadImpl { }); } - SingleSpecialKeyImpl(KeyRef k, const std::function>(ReadYourWritesTransaction*)>& f) - : SpecialKeyRangeReadImpl(singleKeyRange(k)), k(k), f(f) {} + SingleSpecialKeyImpl(KeyRef k, + const std::function>(ReadYourWritesTransaction*)>& f, + bool supportsTenants = false) + : SpecialKeyRangeReadImpl(singleKeyRange(k)), k(k), f(f), tenantSupport(supportsTenants) {} + + bool supportsTenants() const override { return tenantSupport; }; private: Key k; std::function>(ReadYourWritesTransaction*)> f; + bool tenantSupport; }; class HealthMetricsRangeImpl : public SpecialKeyRangeAsyncImpl { @@ -1315,7 +1375,7 @@ static RangeResult healthMetricsToKVPairs(const HealthMetrics& metrics, KeyRange RangeResult result; if (CLIENT_BUGGIFY) return result; - if (kr.contains(LiteralStringRef("\xff\xff/metrics/health/aggregate")) && metrics.worstStorageDurabilityLag != 0) { + if (kr.contains("\xff\xff/metrics/health/aggregate"_sr) && metrics.worstStorageDurabilityLag != 0) { json_spirit::mObject statsObj; statsObj["batch_limited"] = metrics.batchLimited; statsObj["tps_limit"] = metrics.tpsLimit; @@ -1327,15 +1387,13 @@ static RangeResult healthMetricsToKVPairs(const HealthMetrics& metrics, KeyRange std::string statsString = json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8); ValueRef bytes(result.arena(), statsString); - result.push_back(result.arena(), KeyValueRef(LiteralStringRef("\xff\xff/metrics/health/aggregate"), bytes)); + result.push_back(result.arena(), KeyValueRef("\xff\xff/metrics/health/aggregate"_sr, bytes)); } // tlog stats { int phase = 0; // Avoid comparing twice per loop iteration for (const auto& [uid, logStats] : metrics.tLogQueue) { - StringRef k{ - StringRef(uid.toString()).withPrefix(LiteralStringRef("\xff\xff/metrics/health/log/"), result.arena()) - }; + StringRef k{ StringRef(uid.toString()).withPrefix("\xff\xff/metrics/health/log/"_sr, result.arena()) }; if (phase == 0 && k >= kr.begin) { phase = 1; } @@ -1357,8 +1415,7 @@ static RangeResult healthMetricsToKVPairs(const HealthMetrics& metrics, KeyRange { int phase = 0; // Avoid comparing twice per loop iteration for (const auto& [uid, storageStats] : metrics.storageStats) { - StringRef k{ StringRef(uid.toString()) - .withPrefix(LiteralStringRef("\xff\xff/metrics/health/storage/"), result.arena()) }; + StringRef k{ 
StringRef(uid.toString()).withPrefix("\xff\xff/metrics/health/storage/"_sr, result.arena()) }; if (phase == 0 && k >= kr.begin) { phase = 1; } @@ -1384,10 +1441,9 @@ static RangeResult healthMetricsToKVPairs(const HealthMetrics& metrics, KeyRange ACTOR static Future healthMetricsGetRangeActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) { HealthMetrics metrics = wait(ryw->getDatabase()->getHealthMetrics( - /*detailed ("per process")*/ kr.intersects(KeyRangeRef(LiteralStringRef("\xff\xff/metrics/health/storage/"), - LiteralStringRef("\xff\xff/metrics/health/storage0"))) || - kr.intersects(KeyRangeRef(LiteralStringRef("\xff\xff/metrics/health/log/"), - LiteralStringRef("\xff\xff/metrics/health/log0"))))); + /*detailed ("per process")*/ kr.intersects( + KeyRangeRef("\xff\xff/metrics/health/storage/"_sr, "\xff\xff/metrics/health/storage0"_sr)) || + kr.intersects(KeyRangeRef("\xff\xff/metrics/health/log/"_sr, "\xff\xff/metrics/health/log0"_sr)))); return healthMetricsToKVPairs(metrics, kr); } @@ -1425,7 +1481,7 @@ DatabaseContext::DatabaseContext(Reference defaultTenant) : lockAware(lockAware), switchable(switchable), connectionRecord(connectionRecord), proxyProvisional(false), @@ -1460,12 +1516,16 @@ DatabaseContext::DatabaseContext(ReferenceSHARD_STAT_SMOOTH_AMOUNT), specialKeySpace(std::make_unique(specialKeys.begin, specialKeys.end, /* test */ false)), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) { @@ -1481,7 +1541,7 @@ DatabaseContext::DatabaseContext(ReferenceMETADATA_VERSION_CACHE_SIZE); maxOutstandingWatches = CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES; - snapshotRywEnabled = apiVersionAtLeast(300) ? 1 : 0; + snapshotRywEnabled = apiVersion.hasSnapshotRYW() ? 1 : 0; logger = databaseLogger(this) && tssLogger(this); locationCacheSize = g_network->isSimulated() ? CLIENT_KNOBS->LOCATION_CACHE_EVICTION_SIZE_SIM @@ -1489,8 +1549,8 @@ DatabaseContext::DatabaseContext(ReferenceisSimulated() ? 
CLIENT_KNOBS->TENANT_CACHE_EVICTION_SIZE_SIM : CLIENT_KNOBS->TENANT_CACHE_EVICTION_SIZE; - getValueSubmitted.init(LiteralStringRef("NativeAPI.GetValueSubmitted")); - getValueCompleted.init(LiteralStringRef("NativeAPI.GetValueCompleted")); + getValueSubmitted.init("NativeAPI.GetValueSubmitted"_sr); + getValueCompleted.init("NativeAPI.GetValueCompleted"_sr); clientDBInfoMonitor = monitorClientDBInfoChange(this, clientInfo, &proxiesChangeTrigger); tssMismatchHandler = handleTssMismatches(this); @@ -1500,34 +1560,7 @@ DatabaseContext::DatabaseContext(ReferenceINIT_MID_SHARD_BYTES); globalConfig = std::make_unique(this); - if (apiVersionAtLeast(720)) { - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::CLUSTERID, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique( - LiteralStringRef("\xff\xff/cluster_id"), [](ReadYourWritesTransaction* ryw) -> Future> { - try { - if (ryw->getDatabase().getPtr()) { - return map(getClusterId(ryw->getDatabase()), - [](UID id) { return Optional(StringRef(id.toString())); }); - } - } catch (Error& e) { - return e; - } - return Optional(); - })); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique>(SpecialKeySpace::getManagementApiCommandRange("tenant"))); - } - if (apiVersionAtLeast(710) && !apiVersionAtLeast(720)) { - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique>(SpecialKeySpace::getManagementApiCommandRange("tenantmap"))); - } - if (apiVersionAtLeast(700)) { + if (apiVersion.version() >= 700) { registerSpecialKeysImpl(SpecialKeySpace::MODULE::ERRORMSG, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( @@ -1537,12 +1570,13 @@ DatabaseContext::DatabaseContext(Reference(ryw->getSpecialKeySpaceErrorMsg().get()); else return Optional(); - })); + }, + true)); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - KeyRangeRef(LiteralStringRef("options/"), LiteralStringRef("options0")) + KeyRangeRef("options/"_sr, "options0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, @@ -1564,31 +1598,31 @@ DatabaseContext::DatabaseContext(Reference( - KeyRangeRef(LiteralStringRef("in_progress_exclusion/"), LiteralStringRef("in_progress_exclusion0")) + KeyRangeRef("in_progress_exclusion/"_sr, "in_progress_exclusion0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::CONFIGURATION, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - KeyRangeRef(LiteralStringRef("process/class_type/"), LiteralStringRef("process/class_type0")) + KeyRangeRef("process/class_type/"_sr, "process/class_type0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::CONFIGURATION, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( - KeyRangeRef(LiteralStringRef("process/class_source/"), LiteralStringRef("process/class_source0")) + KeyRangeRef("process/class_source/"_sr, "process/class_source0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - singleKeyRange(LiteralStringRef("db_locked")) + 
singleKeyRange("db_locked"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - singleKeyRange(LiteralStringRef("consistency_check_suspended")) + singleKeyRange("consistency_check_suspended"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::GLOBALCONFIG, @@ -1602,44 +1636,44 @@ DatabaseContext::DatabaseContext(Reference( - KeyRangeRef(LiteralStringRef("coordinators/"), LiteralStringRef("coordinators0")) + KeyRangeRef("coordinators/"_sr, "coordinators0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( - singleKeyRange(LiteralStringRef("auto_coordinators")) + singleKeyRange("auto_coordinators"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - singleKeyRange(LiteralStringRef("min_required_commit_version")) + singleKeyRange("min_required_commit_version"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - singleKeyRange(LiteralStringRef("version_epoch")) + singleKeyRange("version_epoch"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0")) + KeyRangeRef("profiling/"_sr, "profiling0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)), /* deprecated */ 720); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - KeyRangeRef(LiteralStringRef("maintenance/"), LiteralStringRef("maintenance0")) + KeyRangeRef("maintenance/"_sr, "maintenance0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0")) + KeyRangeRef("data_distribution/"_sr, "data_distribution0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::ACTORLINEAGE, @@ -1650,7 +1684,7 @@ DatabaseContext::DatabaseContext(Reference( SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF))); } - if (apiVersionAtLeast(630)) { + if (apiVersion.version() >= 630) { registerSpecialKeysImpl(SpecialKeySpace::MODULE::TRANSACTION, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique(conflictingKeysRange)); @@ -1663,20 +1697,18 @@ DatabaseContext::DatabaseContext(Reference(ddStatsRange)); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::METRICS, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique(KeyRangeRef(LiteralStringRef("\xff\xff/metrics/health/"), - LiteralStringRef("\xff\xff/metrics/health0")))); - 
registerSpecialKeysImpl( - SpecialKeySpace::MODULE::WORKERINTERFACE, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique(KeyRangeRef( - LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0")))); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::METRICS, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + KeyRangeRef("\xff\xff/metrics/health/"_sr, "\xff\xff/metrics/health0"_sr))); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::WORKERINTERFACE, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + KeyRangeRef("\xff\xff/worker_interfaces/"_sr, "\xff\xff/worker_interfaces0"_sr))); registerSpecialKeysImpl(SpecialKeySpace::MODULE::STATUSJSON, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( - LiteralStringRef("\xff\xff/status/json"), + "\xff\xff/status/json"_sr, [](ReadYourWritesTransaction* ryw) -> Future> { if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { ++ryw->getDatabase()->transactionStatusRequests; @@ -1684,11 +1716,12 @@ DatabaseContext::DatabaseContext(Reference(); } - })); + }, + true)); registerSpecialKeysImpl(SpecialKeySpace::MODULE::CLUSTERFILEPATH, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( - LiteralStringRef("\xff\xff/cluster_file_path"), + "\xff\xff/cluster_file_path"_sr, [](ReadYourWritesTransaction* ryw) -> Future> { try { if (ryw->getDatabase().getPtr() && @@ -1701,13 +1734,14 @@ DatabaseContext::DatabaseContext(Reference(); - })); + }, + true)); registerSpecialKeysImpl( SpecialKeySpace::MODULE::CONNECTIONSTRING, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( - LiteralStringRef("\xff\xff/connection_string"), + "\xff\xff/connection_string"_sr, [](ReadYourWritesTransaction* ryw) -> Future> { try { if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { @@ -1719,7 +1753,30 @@ DatabaseContext::DatabaseContext(Reference(); - })); + }, + true)); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::CLUSTERID, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + "\xff\xff/cluster_id"_sr, + [](ReadYourWritesTransaction* ryw) -> Future> { + try { + if (ryw->getDatabase().getPtr()) { + return map(getClusterId(ryw->getDatabase()), [](UID id) { + return Optional(StringRef(id.toString())); + }); + } + } catch (Error& e) { + return e; + } + return Optional(); + }, + true)); + + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique(SpecialKeySpace::getManagementApiCommandRange("tenant"))); } throttleExpirer = recurring([this]() { expireThrottles(); }, CLIENT_KNOBS->TAG_THROTTLE_EXPIRATION_INTERVAL); @@ -1759,9 +1816,13 @@ DatabaseContext::DatabaseContext(const Error& err) transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc), transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc), transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc), - transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), latencies(1000), readLatencies(1000), + transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), bgReadInputBytes("BGReadInputBytes", cc), + bgReadOutputBytes("BGReadOutputBytes", cc), usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"), + feedStreamStarts("FeedStreamStarts", ccFeed), feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed), + feedErrors("FeedErrors", ccFeed), feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed), + 
feedPops("FeedPops", ccFeed), feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000), - bgGranulesPerRequest(1000), transactionTracingSample(false), + bgGranulesPerRequest(1000), sharedStatePtr(nullptr), transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {} @@ -1806,6 +1867,9 @@ DatabaseContext::~DatabaseContext() { for (auto& it : notAtLatestChangeFeeds) { it.second->context = nullptr; } + for (auto& it : changeFeedUpdaters) { + it.second->context = nullptr; + } TraceEvent("DatabaseContextDestructed", dbId).backtrace(); } @@ -2288,6 +2352,13 @@ Database Database::createDatabase(std::string connFileName, return Database::createDatabase(rccr, apiVersion, internal, clientLocality); } +Database Database::createSimulatedExtraDatabase(std::string connectionString, Optional defaultTenant) { + auto extraFile = makeReference(ClusterConnectionString(connectionString)); + Database db = Database::createDatabase(extraFile, ApiVersion::LATEST_VERSION); + db->defaultTenant = defaultTenant; + return db; +} + Reference DatabaseContext::getWatchMetadata(int64_t tenantId, KeyRef key) const { const auto it = watchMap.find(std::make_pair(tenantId, key)); if (it == watchMap.end()) @@ -2461,7 +2532,7 @@ void setNetworkOption(FDBNetworkOptions::Option option, Optional valu ASSERT(value.present()); Standalone> supportedVersions; - std::vector supportedVersionsStrings = value.get().splitAny(LiteralStringRef(";")); + std::vector supportedVersionsStrings = value.get().splitAny(";"_sr); for (StringRef versionString : supportedVersionsStrings) { #ifdef ADDRESS_SANITIZER __lsan_disable(); @@ -2641,7 +2712,7 @@ bool DatabaseContext::isCurrentGrvProxy(UID proxyId) const { if (proxy.id() == proxyId) return true; } - CODE_PROBE(true, "stale GRV proxy detected"); + CODE_PROBE(true, "stale GRV proxy detected", probe::decoration::rare); return false; } @@ -2967,7 +3038,7 @@ Future getKeyLocation(Reference trState, key, member, trState->spanContext, - trState->debugID, + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies, isBackward, version); @@ -3108,7 +3179,7 @@ Future> getKeyRangeLocations(ReferencespanContext, - trState->debugID, + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies, version); @@ -3130,16 +3201,16 @@ ACTOR Future warmRange_impl(Reference trState, KeyRange state Version version = wait(fVersion); loop { - std::vector locations = - wait(getKeyRangeLocations_internal(trState->cx, - trState->getTenantInfo(), - keys, - CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT, - Reverse::False, - trState->spanContext, - trState->debugID, - trState->useProvisionalProxies, - version)); + std::vector locations = wait(getKeyRangeLocations_internal( + trState->cx, + trState->getTenantInfo(), + keys, + CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT, + Reverse::False, + trState->spanContext, + trState->readOptions.present() ? 
trState->readOptions.get().debugID : Optional(), + trState->useProvisionalProxies, + version)); totalRanges += CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT; totalRequests++; if (locations.size() == 0 || totalRanges >= trState->cx->locationCacheSize || @@ -3297,12 +3368,16 @@ ACTOR Future> getValue(Reference trState, state uint64_t startTime; state double startTimeD; state VersionVector ssLatestCommitVersions; + state Optional readOptions = trState->readOptions; + trState->cx->getLatestCommitVersions(locationInfo.locations, ver, trState, ssLatestCommitVersions); try { - if (trState->debugID.present()) { + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { getValueID = nondeterministicRandom()->randomUniqueID(); + readOptions.get().debugID = getValueID; - g_traceBatch.addAttach("GetValueAttachID", trState->debugID.get().first(), getValueID.get().first()); + g_traceBatch.addAttach( + "GetValueAttachID", trState->readOptions.get().debugID.get().first(), getValueID.get().first()); g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.Before"); //.detail("TaskID", g_network->getCurrentTask()); @@ -3335,7 +3410,7 @@ ACTOR Future> getValue(Reference trState, ver, trState->cx->sampleReadTags() ? trState->options.readTags : Optional(), - getValueID, + readOptions, ssLatestCommitVersions), TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, @@ -3410,12 +3485,16 @@ ACTOR Future getKey(Reference trState, UseTenant useTenant = UseTenant::True) { wait(success(version)); - state Optional getKeyID = Optional(); - state Span span("NAPI:getKey"_loc, trState->spanContext); - if (trState->debugID.present()) { - getKeyID = nondeterministicRandom()->randomUniqueID(); + state Optional getKeyID; + state Optional readOptions = trState->readOptions; - g_traceBatch.addAttach("GetKeyAttachID", trState->debugID.get().first(), getKeyID.get().first()); + state Span span("NAPI:getKey"_loc, trState->spanContext); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + getKeyID = nondeterministicRandom()->randomUniqueID(); + readOptions.get().debugID = getKeyID; + + g_traceBatch.addAttach( + "GetKeyAttachID", trState->readOptions.get().debugID.get().first(), getKeyID.get().first()); g_traceBatch.addEvent( "GetKeyDebug", getKeyID.get().first(), @@ -3458,7 +3537,7 @@ ACTOR Future getKey(Reference trState, k, version.get(), trState->cx->sampleReadTags() ? 
trState->options.readTags : Optional(), - getKeyID, + readOptions, ssLatestCommitVersions); req.arena.dependsOn(k.arena()); @@ -3598,6 +3677,9 @@ ACTOR Future watchValue(Database cx, Reference p parameters->useProvisionalProxies, Reverse::False, parameters->version)); + if (parameters->tenant.tenantId != locationInfo.tenantEntry.id) { + throw tenant_not_found(); + } try { state Optional watchValueID = Optional(); @@ -3654,7 +3736,7 @@ ACTOR Future watchValue(Database cx, Reference p } else if (e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind) { // clang-format off CODE_PROBE(e.code() == error_code_watch_cancelled, "Too many watches on the storage server, poll for changes instead"); - CODE_PROBE(e.code() == error_code_process_behind, "The storage servers are all behind"); + CODE_PROBE(e.code() == error_code_process_behind, "The storage servers are all behind", probe::decoration::rare); // clang-format on wait(delay(CLIENT_KNOBS->WATCH_POLLING_TIME, parameters->taskID)); } else if (e.code() == error_code_timed_out) { // The storage server occasionally times out watches in case @@ -3923,13 +4005,15 @@ Future getExactRange(Reference trState, // FIXME: buggify byte limits on internal functions that use them, instead of globally req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); - req.debugID = trState->debugID; + + req.options = trState->readOptions; try { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.Before"); - /*TraceEvent("TransactionDebugGetExactRangeInfo", trState->debugID.get()) + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getExactRange.Before"); + /*TraceEvent("TransactionDebugGetExactRangeInfo", trState->readOptions.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("ReqLimit", req.limit) @@ -3959,9 +4043,10 @@ Future getExactRange(Reference trState, ++trState->cx->transactionPhysicalReadsCompleted; throw; } - if (trState->debugID.present()) - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.After"); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getExactRange.After"); output.arena().dependsOn(rep.arena); output.append(output.arena(), rep.data.begin(), rep.data.size()); @@ -4289,7 +4374,7 @@ Future getRange(Reference trState, req.arena.dependsOn(mapper.arena()); setMatchIndex(req, matchIndex); req.tenantInfo = useTenant ? trState->getTenantInfo() : TenantInfo(); - req.isFetchKeys = (trState->taskID == TaskPriority::FetchKeys); + req.options = trState->readOptions; req.version = readVersion; trState->cx->getLatestCommitVersions( @@ -4327,13 +4412,13 @@ Future getRange(Reference trState, ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); req.tags = trState->cx->sampleReadTags() ? 
trState->options.readTags : Optional(); - req.debugID = trState->debugID; req.spanContext = span.context; try { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getRange.Before"); - /*TraceEvent("TransactionDebugGetRangeInfo", trState->debugID.get()) + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getRange.Before"); + /*TraceEvent("TransactionDebugGetRangeInfo", trState->readOptions.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("OriginalBegin", originalBegin.toString()) @@ -4372,11 +4457,11 @@ Future getRange(Reference trState, throw; } - if (trState->debugID.present()) { + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { g_traceBatch.addEvent("TransactionDebug", - trState->debugID.get().first(), + trState->readOptions.get().debugID.get().first(), "NativeAPI.getRange.After"); //.detail("SizeOf", rep.data.size()); - /*TraceEvent("TransactionDebugGetRangeDone", trState->debugID.get()) + /*TraceEvent("TransactionDebugGetRangeDone", trState->readOptions.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("RepIsMore", rep.more) @@ -4488,10 +4573,11 @@ Future getRange(Reference trState, } } catch (Error& e) { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getRange.Error"); - TraceEvent("TransactionDebugError", trState->debugID.get()).error(e); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getRange.Error"); + TraceEvent("TransactionDebugError", trState->readOptions.get().debugID.get()).error(e); } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || (e.code() == error_code_transaction_too_old && readVersion == latestVersion)) { @@ -4638,13 +4724,12 @@ static Future tssStreamComparison(Request request, // FIXME: this code is pretty much identical to LoadBalance.h // TODO could add team check logic in if we added synchronous way to turn this into a fixed getRange request // and send it to the whole team and compare? I think it's fine to skip that for streaming though - CODE_PROBE(ssEndOfStream != tssEndOfStream, "SS or TSS stream finished early!"); // skip tss comparison if both are end of stream if ((!ssEndOfStream || !tssEndOfStream) && !TSS_doCompare(ssReply.get(), tssReply.get())) { CODE_PROBE(true, "TSS mismatch in stream comparison"); TraceEvent mismatchEvent( - (g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations) + (g_network->isSimulated() && g_simulator->tssMode == ISimulator::TSSMode::EnabledDropMutations) ? SevWarnAlways : SevError, TSS_mismatchTraceName(request)); @@ -4666,7 +4751,7 @@ static Future tssStreamComparison(Request request, // record a summarized trace event instead TraceEvent summaryEvent((g_network->isSimulated() && - g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations) + g_simulator->tssMode == ISimulator::TSSMode::EnabledDropMutations) ? 
SevWarnAlways : SevError, TSS_mismatchTraceName(request)); @@ -4743,9 +4828,8 @@ ACTOR Future getRangeStreamFragment(Reference trState, req.spanContext = spanContext; req.limit = reverse ? -CLIENT_KNOBS->REPLY_BYTE_LIMIT : CLIENT_KNOBS->REPLY_BYTE_LIMIT; req.limitBytes = std::numeric_limits::max(); - // leaving the flag off for now to prevent data fetches stall under heavy load - // it is used to inform the storage that the rangeRead is for Fetch - // req.isFetchKeys = (trState->taskID == TaskPriority::FetchKeys); + req.options = trState->readOptions; + trState->cx->getLatestCommitVersions( locations[shard].locations, req.version, trState, req.ssLatestCommitVersions); @@ -4756,12 +4840,12 @@ ACTOR Future getRangeStreamFragment(Reference trState, // FIXME: buggify byte limits on internal functions that use them, instead of globally req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); - req.debugID = trState->debugID; try { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.RangeStream.Before"); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.RangeStream.Before"); } ++trState->cx->transactionPhysicalReads; state GetKeyValuesStreamReply rep; @@ -4855,9 +4939,10 @@ ACTOR Future getRangeStreamFragment(Reference trState, } rep = GetKeyValuesStreamReply(); } - if (trState->debugID.present()) - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.After"); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getExactRange.After"); RangeResult output(RangeResultRef(rep.data, rep.more), rep.arena); if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { @@ -4965,7 +5050,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, throw; } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || - e.code() == error_code_connection_failed) { + e.code() == error_code_connection_failed || e.code() == error_code_request_maybe_delivered) { const KeyRangeRef& range = locations[shard].range; if (reverse) @@ -5268,6 +5353,44 @@ Future populateAndGetTenant(Reference trState, Key } } +// Restarts a watch after a database switch +ACTOR Future restartWatch(Database cx, + TenantInfo tenantInfo, + Key key, + Optional value, + TagSet tags, + SpanContext spanContext, + TaskPriority taskID, + Optional debugID, + UseProvisionalProxies useProvisionalProxies) { + // The ID of the tenant may be different on the cluster that we switched to, so obtain the new ID + if (tenantInfo.name.present()) { + state KeyRangeLocationInfo locationInfo = wait(getKeyLocation(cx, + tenantInfo, + key, + &StorageServerInterface::watchValue, + spanContext, + debugID, + useProvisionalProxies, + Reverse::False, + latestVersion)); + tenantInfo.tenantId = locationInfo.tenantEntry.id; + } + + wait(watchValueMap(cx->minAcceptableReadVersion, + tenantInfo, + key, + value, + cx, + tags, + spanContext, + taskID, + debugID, + useProvisionalProxies)); + + return Void(); +} + // FIXME: This seems pretty horrible. Now a Database can't die until all of its watches do... 
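// --- Illustrative sketch, not part of this patch ------------------------------------------------
// A typical client-level watch, shown to frame the change right below: when the cluster connection
// file changes while such a watch is outstanding, the watch() actor now re-arms it through the new
// restartWatch() helper above, which re-resolves the tenant ID on the new cluster before calling
// watchValueMap(), instead of reusing the possibly stale tenant ID. Flow ACTOR context, a Database
// db, and a hypothetical key are assumed.
state ReadYourWritesTransaction tr(db);
loop {
    try {
        state Future<Void> watchFuture = tr.watch("some/watched/key"_sr); // hypothetical key
        wait(tr.commit());
        wait(watchFuture); // survives a database switch via restartWatch()
        break;
    } catch (Error& e) {
        wait(tr.onError(e));
    }
}
// -------------------------------------------------------------------------------------------------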
ACTOR Future watch(Reference watch, Database cx, @@ -5295,16 +5418,15 @@ ACTOR Future watch(Reference watch, when(wait(cx->connectionFileChanged())) { CODE_PROBE(true, "Recreated a watch after switch"); cx->clearWatchMetadata(); - watch->watchFuture = watchValueMap(cx->minAcceptableReadVersion, - tenantInfo, - watch->key, - watch->value, - cx, - tags, - spanContext, - taskID, - debugID, - useProvisionalProxies); + watch->watchFuture = restartWatch(cx, + tenantInfo, + watch->key, + watch->value, + tags, + spanContext, + taskID, + debugID, + useProvisionalProxies); } } } @@ -5336,7 +5458,7 @@ Future Transaction::watch(Reference watch) { trState->options.readTags, trState->spanContext, trState->taskID, - trState->debugID, + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies); } @@ -5526,7 +5648,7 @@ Future Transaction::getRange(const KeySelector& begin, // A method for streaming data from the storage server that is more efficient than getRange when reading large amounts // of data -Future Transaction::getRangeStream(const PromiseStream& results, +Future Transaction::getRangeStream(PromiseStream& results, const KeySelector& begin, const KeySelector& end, GetRangeLimits limits, @@ -5540,18 +5662,18 @@ Future Transaction::getRangeStream(const PromiseStream& resul KeySelector b = begin; if (b.orEqual) { - CODE_PROBE(true, "Native stream begin orEqual==true"); + CODE_PROBE(true, "Native stream begin orEqual==true", probe::decoration::rare); b.removeOrEqual(b.arena()); } KeySelector e = end; if (e.orEqual) { - CODE_PROBE(true, "Native stream end orEqual==true"); + CODE_PROBE(true, "Native stream end orEqual==true", probe::decoration::rare); e.removeOrEqual(e.arena()); } if (b.offset >= e.offset && b.getKey() >= e.getKey()) { - CODE_PROBE(true, "Native stream range inverted"); + CODE_PROBE(true, "Native stream range inverted", probe::decoration::rare); results.sendError(end_of_stream()); return Void(); } @@ -5565,7 +5687,7 @@ Future Transaction::getRangeStream(const PromiseStream& resul ::getRangeStream(trState, results, getReadVersion(), b, e, limits, conflictRange, snapshot, reverse), results); } -Future Transaction::getRangeStream(const PromiseStream& results, +Future Transaction::getRangeStream(PromiseStream& results, const KeySelector& begin, const KeySelector& end, int limit, @@ -5602,7 +5724,7 @@ void Transaction::addReadConflictRange(KeyRangeRef const& keys) { void Transaction::makeSelfConflicting() { BinaryWriter wr(Unversioned()); - wr.serializeBytes(LiteralStringRef("\xFF/SC/")); + wr.serializeBytes("\xFF/SC/"_sr); wr << deterministicRandom()->randomUniqueID(); auto r = singleKeyRange(wr.toValue(), tr.arena); tr.transaction.read_conflict_ranges.push_back(tr.arena, r); @@ -6004,16 +6126,17 @@ void Transaction::setupWatches() { Future watchVersion = getCommittedVersion() > 0 ? getCommittedVersion() : getReadVersion(); for (int i = 0; i < watches.size(); ++i) - watches[i]->setWatch(watchValueMap(watchVersion, - trState->getTenantInfo(), - watches[i]->key, - watches[i]->value, - trState->cx, - trState->options.readTags, - trState->spanContext, - trState->taskID, - trState->debugID, - trState->useProvisionalProxies)); + watches[i]->setWatch( + watchValueMap(watchVersion, + trState->getTenantInfo(), + watches[i]->key, + watches[i]->value, + trState->cx, + trState->options.readTags, + trState->spanContext, + trState->taskID, + trState->readOptions.present() ? 
trState->readOptions.get().debugID : Optional(), + trState->useProvisionalProxies)); watches.clear(); } catch (Error&) { @@ -6102,30 +6225,46 @@ ACTOR Future> estimateCommitCosts(Referen // TODO: send the prefix as part of the commit request and ship it all the way // through to the storage servers void applyTenantPrefix(CommitTransactionRequest& req, Key tenantPrefix) { + VectorRef updatedMutations; + updatedMutations.reserve(req.arena, req.transaction.mutations.size()); for (auto& m : req.transaction.mutations) { + StringRef param1 = m.param1; + StringRef param2 = m.param2; if (m.param1 != metadataVersionKey) { - m.param1 = m.param1.withPrefix(tenantPrefix, req.arena); + param1 = m.param1.withPrefix(tenantPrefix, req.arena); if (m.type == MutationRef::ClearRange) { - m.param2 = m.param2.withPrefix(tenantPrefix, req.arena); + param2 = m.param2.withPrefix(tenantPrefix, req.arena); } else if (m.type == MutationRef::SetVersionstampedKey) { - uint8_t* key = mutateString(m.param1); - int* offset = reinterpret_cast(&key[m.param1.size() - 4]); + uint8_t* key = mutateString(param1); + int* offset = reinterpret_cast(&key[param1.size() - 4]); *offset += tenantPrefix.size(); } } + updatedMutations.push_back(req.arena, MutationRef(MutationRef::Type(m.type), param1, param2)); } + req.transaction.mutations = updatedMutations; - for (auto& rc : req.transaction.read_conflict_ranges) { + VectorRef updatedReadConflictRanges; + updatedReadConflictRanges.reserve(req.arena, req.transaction.read_conflict_ranges.size()); + for (auto const& rc : req.transaction.read_conflict_ranges) { if (rc.begin != metadataVersionKey) { - rc = rc.withPrefix(tenantPrefix, req.arena); + updatedReadConflictRanges.push_back(req.arena, rc.withPrefix(tenantPrefix, req.arena)); + } else { + updatedReadConflictRanges.push_back(req.arena, rc); } } + req.transaction.read_conflict_ranges = updatedReadConflictRanges; + VectorRef updatedWriteConflictRanges; + updatedWriteConflictRanges.reserve(req.arena, req.transaction.write_conflict_ranges.size()); for (auto& wc : req.transaction.write_conflict_ranges) { if (wc.begin != metadataVersionKey) { - wc = wc.withPrefix(tenantPrefix, req.arena); + updatedWriteConflictRanges.push_back(req.arena, wc.withPrefix(tenantPrefix, req.arena)); + } else { + updatedWriteConflictRanges.push_back(req.arena, wc); } } + req.transaction.write_conflict_ranges = updatedWriteConflictRanges; } ACTOR static Future tryCommit(Reference trState, @@ -6134,7 +6273,7 @@ ACTOR static Future tryCommit(Reference trState, state TraceInterval interval("TransactionCommit"); state double startTime = now(); state Span span("NAPI:tryCommit"_loc, trState->spanContext); - state Optional debugID = trState->debugID; + state Optional debugID = trState->readOptions.present() ? 
trState->readOptions.get().debugID : Optional(); state TenantPrefixPrepended tenantPrefixPrepended = TenantPrefixPrepended::False; if (debugID.present()) { TraceEvent(interval.begin()).detail("Parent", debugID.get()); @@ -6538,10 +6677,10 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional(value.get().printable(), TransactionLogInfo::DONT_LOG); trState->trLogInfo->maxFieldLength = trState->options.maxTransactionLoggingFieldLength; } - if (trState->debugID.present()) { + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { TraceEvent(SevInfo, "TransactionBeingTraced") .detail("DebugTransactionID", trState->trLogInfo->identifier) - .detail("ServerTraceID", trState->debugID.get()); + .detail("ServerTraceID", trState->readOptions.get().debugID.get()); } break; @@ -6573,10 +6712,11 @@ void Transaction::setOption(FDBTransactionOptions::Option option, OptionalrandomUniqueID()); - if (trState->trLogInfo && !trState->trLogInfo->identifier.empty()) { + if (trState->trLogInfo && !trState->trLogInfo->identifier.empty() && trState->readOptions.present() && + trState->readOptions.get().debugID.present()) { TraceEvent(SevInfo, "TransactionBeingTraced") .detail("DebugTransactionID", trState->trLogInfo->identifier) - .detail("ServerTraceID", trState->debugID.get()); + .detail("ServerTraceID", trState->readOptions.get().debugID.get()); } break; @@ -6657,6 +6797,9 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optionalcx->sharedStatePtr) { + throw invalid_option(); + } if (trState->numErrors == 0) { trState->options.useGrvCache = true; } @@ -6759,10 +6902,12 @@ ACTOR Future getConsistentReadVersion(SpanContext parentSpa if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled && e.code() != error_code_grv_proxy_memory_limit_exceeded) TraceEvent(SevError, "GetConsistentReadVersionError").error(e); - if ((e.code() == error_code_batch_transaction_throttled || - e.code() == error_code_grv_proxy_memory_limit_exceeded) && - !cx->apiVersionAtLeast(630)) { + if (e.code() == error_code_batch_transaction_throttled && !cx->apiVersionAtLeast(630)) { wait(delayJittered(5.0)); + } else if (e.code() == error_code_grv_proxy_memory_limit_exceeded) { + // FIXME(xwang): the better way is to let this error broadcast to transaction.onError(e), otherwise the + // txn->cx counter doesn't make sense + wait(delayJittered(CLIENT_KNOBS->GRV_ERROR_RETRY_DELAY)); } else { throw; } @@ -6780,26 +6925,22 @@ ACTOR Future readVersionBatcher(DatabaseContext* cx, state Future timeout; state Optional debugID; state bool send_batch; - state Reference batchSizeDist = Histogram::getHistogram(LiteralStringRef("GrvBatcher"), - LiteralStringRef("ClientGrvBatchSize"), - Histogram::Unit::countLinear, - 0, - CLIENT_KNOBS->MAX_BATCH_SIZE * 2); + state Reference batchSizeDist = Histogram::getHistogram( + "GrvBatcher"_sr, "ClientGrvBatchSize"_sr, Histogram::Unit::countLinear, 0, CLIENT_KNOBS->MAX_BATCH_SIZE * 2); state Reference batchIntervalDist = - Histogram::getHistogram(LiteralStringRef("GrvBatcher"), - LiteralStringRef("ClientGrvBatchInterval"), + Histogram::getHistogram("GrvBatcher"_sr, + "ClientGrvBatchInterval"_sr, Histogram::Unit::microseconds, 0, CLIENT_KNOBS->GRV_BATCH_TIMEOUT * 1000000 * 2); - state Reference grvReplyLatencyDist = Histogram::getHistogram( - LiteralStringRef("GrvBatcher"), LiteralStringRef("ClientGrvReplyLatency"), Histogram::Unit::microseconds); + state Reference grvReplyLatencyDist = + 
Histogram::getHistogram("GrvBatcher"_sr, "ClientGrvReplyLatency"_sr, Histogram::Unit::microseconds); state double lastRequestTime = now(); state TransactionTagMap tags; // dynamic batching state PromiseStream replyTimes; - state PromiseStream _errorStream; state double batchTime = 0; state Span span("NAPI:readVersionBatcher"_loc); loop { @@ -7047,7 +7188,9 @@ Future Transaction::getReadVersion(uint32_t flags) { Location location = "NAPI:getReadVersion"_loc; SpanContext spanContext = generateSpanID(trState->cx->transactionTracingSample, trState->spanContext); - auto const req = DatabaseContext::VersionRequest(spanContext, trState->options.tags, trState->debugID); + Optional versionDebugID = + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(); + auto const req = DatabaseContext::VersionRequest(spanContext, trState->options.tags, versionDebugID); batcher.stream.send(req); trState->startTime = now(); readVersion = extractReadVersion(trState, location, spanContext, req.reply.getFuture(), metadataVersion); @@ -7207,7 +7350,8 @@ Future Transaction::onError(Error const& e) { if (e.code() == error_code_not_committed || e.code() == error_code_commit_unknown_result || e.code() == error_code_database_locked || e.code() == error_code_commit_proxy_memory_limit_exceeded || e.code() == error_code_grv_proxy_memory_limit_exceeded || e.code() == error_code_process_behind || - e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled) { + e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled || + e.code() == error_code_blob_granule_request_failed) { if (e.code() == error_code_not_committed) ++trState->cx->transactionsNotCommitted; else if (e.code() == error_code_commit_unknown_result) @@ -7676,14 +7820,15 @@ ACTOR Future blobGranuleGetTenantEntry(Transaction* self, Key ra Optional cachedLocationInfo = self->trState->cx->getCachedLocation(self->getTenant().get(), rangeStartKey, Reverse::False); if (!cachedLocationInfo.present()) { - KeyRangeLocationInfo l = wait(getKeyLocation_internal(self->trState->cx, - self->trState->getTenantInfo(AllowInvalidTenantID::True), - rangeStartKey, - self->trState->spanContext, - self->trState->debugID, - self->trState->useProvisionalProxies, - Reverse::False, - latestVersion)); + KeyRangeLocationInfo l = wait(getKeyLocation_internal( + self->trState->cx, + self->trState->getTenantInfo(AllowInvalidTenantID::True), + rangeStartKey, + self->trState->spanContext, + self->trState->readOptions.present() ? 
self->trState->readOptions.get().debugID : Optional(), + self->trState->useProvisionalProxies, + Reverse::False, + latestVersion)); self->trState->trySetTenantId(l.tenantEntry.id); return l.tenantEntry; } else { @@ -7724,23 +7869,24 @@ ACTOR Future>> getBlobGranuleRangesActor(Trans if (tenantPrefix.present()) { state Standalone mappingPrefix = tenantPrefix.get().withPrefix(blobGranuleMappingKeys.begin); - // basically krmGetRange, but enable it to not use tenant without RAW_ACCESS by doing manual getRange with - // UseTenant::False + // basically krmGetRangeUnaligned, but enable it to not use tenant without RAW_ACCESS by doing manual + // getRange with UseTenant::False GetRangeLimits limits(2 * rangeLimit + 2); limits.minRows = 2; + RangeResult rawMapping = wait(getRange(self->trState, self->getReadVersion(), lastLessOrEqual(keyRange.begin.withPrefix(mappingPrefix)), - firstGreaterThan(keyRange.end.withPrefix(mappingPrefix)), + KeySelectorRef(keyRange.end.withPrefix(mappingPrefix), false, +2), limits, Reverse::False, UseTenant::False)); // strip off mapping prefix - blobGranuleMapping = krmDecodeRanges(mappingPrefix, currentRange, rawMapping); + blobGranuleMapping = krmDecodeRanges(mappingPrefix, currentRange, rawMapping, false); } else { wait(store( blobGranuleMapping, - krmGetRanges( + krmGetRangesUnaligned( self, blobGranuleMappingKeys.begin, currentRange, 1000, GetRangeLimits::BYTE_LIMIT_UNLIMITED))); } @@ -7777,7 +7923,11 @@ ACTOR Future>> readBlobGranulesActor( KeyRange range, Version begin, Optional read, - Version* readVersionOut) { // read not present is "use transaction version" + Version* readVersionOut, + int chunkLimit, + bool summarize) { // read not present is "use transaction version" + + ASSERT(chunkLimit > 0); state RangeResult blobGranuleMapping; state Key granuleStartKey; @@ -7869,7 +8019,6 @@ ACTOR Future>> readBlobGranulesActor( fmt::print("Key range [{0} - {1}) missing worker assignment!\n", granuleStartKey.printable(), granuleEndKey.printable()); - // TODO probably new exception type instead } throw blob_granule_transaction_too_old(); } @@ -7896,11 +8045,9 @@ ACTOR Future>> readBlobGranulesActor( getValue(self->trState, blobWorkerListKeyFor(workerId), self->getReadVersion(), UseTenant::False))); // from the time the mapping was read from the db, the associated blob worker // could have died and so its interface wouldn't be present as part of the blobWorkerList - // we persist in the db. So throw wrong_shard_server to get the new mapping + // we persist in the db. So throw blob_granule_request_failed to get the new mapping if (!workerInterface.present()) { - // need to re-read mapping, throw transaction_too_old so client retries. TODO better error? - // throw wrong_shard_server(); - throw transaction_too_old(); + throw blob_granule_request_failed(); } // FIXME: maybe just want to insert here if there are racing queries for the same worker or something? 
self->trState->cx->blobWorker_interf[workerId] = decodeBlobWorkerListValue(workerInterface.get()); @@ -7927,12 +8074,30 @@ ACTOR Future>> readBlobGranulesActor( granuleEndKey = keyRange.end; } + if (g_network->isSimulated() && !g_simulator->speedUpSimulation && BUGGIFY_WITH_PROB(0.01)) { + // simulate as if we read a stale mapping and a different worker owns the granule + ASSERT(!self->trState->cx->blobWorker_interf.empty()); + CODE_PROBE(true, "Randomizing blob worker id for request"); + TraceEvent ev("RandomizingBlobWorkerForReq"); + ev.detail("OriginalWorker", workerId); + int randomIdx = deterministicRandom()->randomInt(0, self->trState->cx->blobWorker_interf.size()); + for (auto& it : self->trState->cx->blobWorker_interf) { + if (randomIdx == 0) { + workerId = it.first; + break; + } + randomIdx--; + } + ev.detail("NewWorker", workerId); + } + state BlobGranuleFileRequest req; req.keyRange = KeyRangeRef(StringRef(req.arena, granuleStartKey), StringRef(req.arena, granuleEndKey)); req.beginVersion = begin; req.readVersion = rv; req.tenantInfo = self->getTenant().present() ? self->trState->getTenantInfo() : TenantInfo(); req.canCollapseBegin = true; // TODO make this a parameter once we support it + req.summarize = summarize; std::vector>> v; v.push_back( @@ -8003,6 +8168,12 @@ ACTOR Future>> readBlobGranulesActor( chunkEndKey = chunkEndKey.removePrefix(tenantPrefix.get()); } keyRange = KeyRangeRef(std::min(chunkEndKey, keyRange.end), keyRange.end); + if (summarize && results.size() == chunkLimit) { + break; + } + } + if (summarize && results.size() == chunkLimit) { + break; } } // if we detect that this blob worker fails, cancel the request, as otherwise load balance will @@ -8028,10 +8199,8 @@ ACTOR Future>> readBlobGranulesActor( e.name()); } // worker is up but didn't actually have granule, or connection failed - if (e.code() == error_code_wrong_shard_server || e.code() == error_code_connection_failed || - e.code() == error_code_unknown_tenant) { - // need to re-read mapping, throw transaction_too_old so client retries. TODO better error? 
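// --- Illustrative sketch, not part of this patch ------------------------------------------------
// Because blob_granule_request_failed is added to the retryable errors in Transaction::onError()
// (see the onError() change earlier in this diff), a caller of readBlobGranules() can rely on the
// standard retry loop when a cached blob-worker mapping turns out to be stale. Flow ACTOR context,
// a Transaction tr, and a KeyRange keyRange are assumed; an empty Optional read version means
// "use the transaction's read version".
state Version readVersionOut;
loop {
    try {
        Standalone<VectorRef<BlobGranuleChunkRef>> chunks =
            wait(tr.readBlobGranules(keyRange, 0, Optional<Version>(), &readVersionOut));
        // ... consume chunks ...
        break;
    } catch (Error& e) {
        // a stale worker mapping now surfaces as blob_granule_request_failed and is retried here
        wait(tr.onError(e));
    }
}
// -------------------------------------------------------------------------------------------------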
- throw transaction_too_old(); + if (e.code() == error_code_wrong_shard_server || e.code() == error_code_connection_failed) { + throw blob_granule_request_failed(); } throw e; } @@ -8051,7 +8220,36 @@ Future>> Transaction::readBlobGranules Version begin, Optional readVersion, Version* readVersionOut) { - return readBlobGranulesActor(this, range, begin, readVersion, readVersionOut); + return readBlobGranulesActor( + this, range, begin, readVersion, readVersionOut, std::numeric_limits::max(), false); +} + +ACTOR Future>> summarizeBlobGranulesActor(Transaction* self, + KeyRange range, + Optional summaryVersion, + int rangeLimit) { + state Version readVersionOut; + Standalone> chunks = + wait(readBlobGranulesActor(self, range, 0, summaryVersion, &readVersionOut, rangeLimit, true)); + ASSERT(chunks.size() <= rangeLimit); + ASSERT(!summaryVersion.present() || readVersionOut == summaryVersion.get()); + Standalone> summaries; + summaries.reserve(summaries.arena(), chunks.size()); + for (auto& it : chunks) { + summaries.push_back(summaries.arena(), summarizeGranuleChunk(summaries.arena(), it)); + } + + return summaries; +} + +Future>> +Transaction::summarizeBlobGranules(const KeyRange& range, Optional summaryVersion, int rangeLimit) { + return summarizeBlobGranulesActor(this, range, summaryVersion, rangeLimit); +} + +void Transaction::addGranuleMaterializeStats(const GranuleMaterializeStats& stats) { + trState->cx->bgReadInputBytes += stats.inputBytes; + trState->cx->bgReadOutputBytes += stats.outputBytes; } ACTOR Future setPerpetualStorageWiggle(Database cx, bool enable, LockAware lockAware) { @@ -8077,11 +8275,18 @@ ACTOR Future setPerpetualStorageWiggle(Database cx, bool enable, LockAw ACTOR Future checkBlobSubrange(Database db, KeyRange keyRange, Optional version) { state Transaction tr(db); - state Version readVersionOut = invalidVersion; loop { try { - wait(success(tr.readBlobGranules(keyRange, 0, version, &readVersionOut))); - return readVersionOut; + state Version summaryVersion; + if (version.present()) { + summaryVersion = version.get(); + } else { + wait(store(summaryVersion, tr.getReadVersion())); + } + // same properties as a read for validating granule is readable, just much less memory and network bandwidth + // used + wait(success(tr.summarizeBlobGranules(keyRange, summaryVersion, std::numeric_limits::max()))); + return summaryVersion; } catch (Error& e) { wait(tr.onError(e)); } @@ -8094,12 +8299,38 @@ ACTOR Future verifyBlobRangeActor(Reference cx, KeyRan state Standalone> allRanges; state KeyRange curRegion = KeyRangeRef(range.begin, range.begin); state Version readVersionOut = invalidVersion; - state int batchSize = CLIENT_KNOBS->BG_TOO_MANY_GRANULES / 2; + state int batchSize = BUGGIFY ? deterministicRandom()->randomInt(2, 10) : CLIENT_KNOBS->BG_TOO_MANY_GRANULES / 2; + state int loadSize = (BUGGIFY ? 
deterministicRandom()->randomInt(1, 20) : 20) * batchSize; + + if (version.present()) { + if (version.get() == latestVersion) { + loop { + try { + Version _version = wait(tr.getReadVersion()); + version = _version; + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + if (version.get() <= 0) { + TraceEvent("VerifyBlobInvalidVersion").detail("Range", range).detail("Version", version); + throw unsupported_operation(); + } + } + loop { - try { - wait(store(allRanges, tr.getBlobGranuleRanges(KeyRangeRef(curRegion.begin, range.end), 20 * batchSize))); - } catch (Error& e) { - wait(tr.onError(e)); + if (curRegion.begin >= range.end) { + return readVersionOut; + } + loop { + try { + wait(store(allRanges, tr.getBlobGranuleRanges(KeyRangeRef(curRegion.begin, range.end), loadSize))); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } } if (allRanges.empty()) { @@ -8113,7 +8344,7 @@ ACTOR Future verifyBlobRangeActor(Reference cx, KeyRan // Chunk up to smaller ranges than this limit. Must be smaller than BG_TOO_MANY_GRANULES to not hit the limit int batchCount = 0; for (auto& it : allRanges) { - if (it.begin != curRegion.end) { + if (it.begin > curRegion.end) { return invalidVersion; } @@ -8130,7 +8361,15 @@ ACTOR Future verifyBlobRangeActor(Reference cx, KeyRan checkParts.push_back(checkBlobSubrange(db, curRegion, version)); } - wait(waitForAll(checkParts)); + try { + wait(waitForAll(checkParts)); + } catch (Error& e) { + if (e.code() == error_code_blob_granule_transaction_too_old) { + return invalidVersion; + } + throw e; + } + ASSERT(!checkParts.empty()); readVersionOut = checkParts.back().get(); curRegion = KeyRangeRef(curRegion.end, curRegion.end); } @@ -8367,7 +8606,7 @@ Reference Transaction::createTrLogInfoProbabilistically(cons cx->globalConfig->get(fdbClientInfoTxnSampleRate, CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY); if (((networkOptions.logClientInfo.present() && networkOptions.logClientInfo.get()) || BUGGIFY) && deterministicRandom()->random01() < clientSamplingProbability && - (!g_network->isSimulated() || !g_simulator.speedUpSimulation)) { + (!g_network->isSimulated() || !g_simulator->speedUpSimulation)) { return makeReference(TransactionLogInfo::DATABASE); } } @@ -8688,16 +8927,13 @@ ACTOR static Future rebootWorkerActor(DatabaseContext* cx, ValueRef add for (const auto& it : kvs) { ClientWorkerInterface workerInterf = BinaryReader::fromStringRef(it.value, IncludeVersion()); - Key primaryAddress = - it.key.endsWith(LiteralStringRef(":tls")) ? it.key.removeSuffix(LiteralStringRef(":tls")) : it.key; + Key primaryAddress = it.key.endsWith(":tls"_sr) ? it.key.removeSuffix(":tls"_sr) : it.key; workerInterfaces[primaryAddress] = workerInterf; // Also add mapping from a worker's second address(if present) to its interface if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { Key secondAddress = StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); - secondAddress = secondAddress.endsWith(LiteralStringRef(":tls")) - ? secondAddress.removeSuffix(LiteralStringRef(":tls")) - : secondAddress; + secondAddress = secondAddress.endsWith(":tls"_sr) ? 
secondAddress.removeSuffix(":tls"_sr) : secondAddress; workerInterfaces[secondAddress] = workerInterf; } } @@ -8775,32 +9011,22 @@ void DatabaseContext::setSharedState(DatabaseSharedState* p) { } ACTOR Future storageFeedVersionUpdater(StorageServerInterface interf, ChangeFeedStorageData* self) { - state Promise destroyed = self->destroyed; loop { - if (destroyed.isSet()) { - return Void(); - } if (self->version.get() < self->desired.get()) { wait(delay(CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) || self->version.whenAtLeast(self->desired.get())); - if (destroyed.isSet()) { - return Void(); - } if (self->version.get() < self->desired.get()) { try { ChangeFeedVersionUpdateReply rep = wait(brokenPromiseToNever( interf.changeFeedVersionUpdate.getReply(ChangeFeedVersionUpdateRequest(self->desired.get())))); - if (rep.version > self->version.get()) { self->version.set(rep.version); } } catch (Error& e) { - if (e.code() == error_code_server_overloaded) { - if (FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY > CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) { - wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY - - CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME)); - } - } else { - throw e; + if (e.code() != error_code_server_overloaded) { + throw; + } + if (FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY > CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) { + wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY - CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME)); } } } @@ -8819,19 +9045,23 @@ Reference DatabaseContext::getStorageData(StorageServerIn newStorageUpdater->id = interf.id(); newStorageUpdater->interfToken = token; newStorageUpdater->updater = storageFeedVersionUpdater(interf, newStorageUpdater.getPtr()); - changeFeedUpdaters[token] = newStorageUpdater; + newStorageUpdater->context = this; + newStorageUpdater->created = now(); + changeFeedUpdaters[token] = newStorageUpdater.getPtr(); return newStorageUpdater; } - return it->second; + return Reference::addRef(it->second); } Version DatabaseContext::getMinimumChangeFeedVersion() { Version minVersion = std::numeric_limits::max(); for (auto& it : changeFeedUpdaters) { - minVersion = std::min(minVersion, it.second->version.get()); + if (now() - it.second->created > CLIENT_KNOBS->CHANGE_FEED_START_INTERVAL) { + minVersion = std::min(minVersion, it.second->version.get()); + } } for (auto& it : notAtLatestChangeFeeds) { - if (it.second->getVersion() > 0) { + if (now() - it.second->created > CLIENT_KNOBS->CHANGE_FEED_START_INTERVAL) { minVersion = std::min(minVersion, it.second->getVersion()); } } @@ -8846,8 +9076,14 @@ void DatabaseContext::setDesiredChangeFeedVersion(Version v) { } } +ChangeFeedStorageData::~ChangeFeedStorageData() { + if (context) { + context->changeFeedUpdaters.erase(interfToken); + } +} + ChangeFeedData::ChangeFeedData(DatabaseContext* context) - : dbgid(deterministicRandom()->randomUniqueID()), context(context), notAtLatest(1) { + : dbgid(deterministicRandom()->randomUniqueID()), context(context), notAtLatest(1), created(now()) { if (context) { context->notAtLatestChangeFeeds[dbgid] = this; } @@ -9242,11 +9478,6 @@ ACTOR Future mergeChangeFeedStream(Reference db, results->streams.push_back(it.first.changeFeedStream.getReplyStream(req)); } - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } results->maxSeenVersion = invalidVersion; results->storageData.clear(); Promise refresh = results->refresh; @@ -9257,6 +9488,7 @@ ACTOR Future mergeChangeFeedStream(Reference db, 
results->notAtLatest.set(interfs.size()); if (results->context) { results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr(); + results->created = now(); } refresh.send(Void()); @@ -9300,6 +9532,8 @@ ACTOR Future getChangeFeedRange(Reference db, Databas loop { try { tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); Version readVer = wait(tr.getReadVersion()); if (readVer < begin) { wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); @@ -9451,11 +9685,6 @@ ACTOR Future singleChangeFeedStream(Reference db, results->streams.clear(); - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } results->streams.push_back(interf.changeFeedStream.getReplyStream(req)); results->maxSeenVersion = invalidVersion; @@ -9466,6 +9695,7 @@ ACTOR Future singleChangeFeedStream(Reference db, results->notAtLatest.set(1); if (results->context) { results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr(); + results->created = now(); } refresh.send(Void()); @@ -9484,6 +9714,7 @@ ACTOR Future getChangeFeedStreamActor(Reference db, bool canReadPopped) { state Database cx(db); state Span span("NAPI:GetChangeFeedStream"_loc); + db->usedAnyChangeFeeds = true; results->endVersion = end; @@ -9535,6 +9766,10 @@ ACTOR Future getChangeFeedStreamActor(Reference db, if (useIdx >= 0) { chosenLocations[loc] = useIdx; loc++; + if (g_network->isSimulated() && !g_simulator->speedUpSimulation && BUGGIFY_WITH_PROB(0.01)) { + // simulate as if we had to wait for all alternatives delayed, before the next one + wait(delay(deterministicRandom()->random01())); + } continue; } @@ -9555,7 +9790,10 @@ ACTOR Future getChangeFeedStreamActor(Reference db, loc = 0; } + ++db->feedStreamStarts; + if (locations.size() > 1) { + ++db->feedMergeStreamStarts; std::vector> interfs; for (int i = 0; i < locations.size(); i++) { interfs.emplace_back(locations[i].locations->getInterface(chosenLocations[i]), @@ -9575,15 +9813,11 @@ ACTOR Future getChangeFeedStreamActor(Reference db, } } catch (Error& e) { if (e.code() == error_code_actor_cancelled || e.code() == error_code_change_feed_popped) { - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } results->streams.clear(); results->storageData.clear(); if (e.code() == error_code_change_feed_popped) { - CODE_PROBE(true, "getChangeFeedStreamActor got popped"); + ++db->feedNonRetriableErrors; + CODE_PROBE(true, "getChangeFeedStreamActor got popped", probe::decoration::rare); results->mutations.sendError(e); results->refresh.sendError(e); } else { @@ -9595,30 +9829,38 @@ ACTOR Future getChangeFeedStreamActor(Reference db, results->notAtLatest.set(1); if (results->context) { results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr(); + results->created = now(); } } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_connection_failed || e.code() == error_code_unknown_change_feed || - e.code() == error_code_broken_promise) { + e.code() == error_code_broken_promise || e.code() == error_code_future_version || + e.code() == error_code_request_maybe_delivered || + e.code() == error_code_storage_too_many_feed_streams) { + ++db->feedErrors; db->changeFeedCache.erase(rangeID); cx->invalidateCache(Key(), keys); - if 
(begin == lastBeginVersion) { + if (begin == lastBeginVersion || e.code() == error_code_storage_too_many_feed_streams) { // We didn't read anything since the last failure before failing again. - // Do exponential backoff, up to 1 second - sleepWithBackoff = std::min(1.0, sleepWithBackoff * 1.5); + // Back off quickly and exponentially, up to 1 second + sleepWithBackoff = std::min(2.0, sleepWithBackoff * 5); + sleepWithBackoff = std::max(0.1, sleepWithBackoff); } else { sleepWithBackoff = CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY; } + TraceEvent("ChangeFeedClientError") + .errorUnsuppressed(e) + .suppressFor(30.0) + .detail("AnyProgress", begin != lastBeginVersion); wait(delay(sleepWithBackoff)); } else { + if (e.code() != error_code_end_of_stream) { + ++db->feedNonRetriableErrors; + TraceEvent("ChangeFeedClientErrorNonRetryable").errorUnsuppressed(e).suppressFor(5.0); + } results->mutations.sendError(e); results->refresh.sendError(change_feed_cancelled()); - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } results->streams.clear(); results->storageData.clear(); return Void(); @@ -9727,7 +9969,8 @@ ACTOR Future getOverlappingChangeFeedsActor(Referenc } return result; } catch (Error& e) { - if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { + if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || + e.code() == error_code_future_version) { cx->invalidateCache(Key(), range); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY)); } else { @@ -9742,10 +9985,13 @@ Future DatabaseContext::getOverlappingChangeFeeds(Ke } ACTOR static Future popChangeFeedBackup(Database cx, Key rangeID, Version version) { + ++cx->feedPopsFallback; state Transaction tr(cx); loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix); Optional val = wait(tr.get(rangeIDKey)); if (val.present()) { @@ -9777,6 +10023,8 @@ ACTOR Future popChangeFeedMutationsActor(Reference db, Ke state Database cx(db); state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix); state Span span("NAPI:PopChangeFeedMutations"_loc); + db->usedAnyChangeFeeds = true; + ++db->feedPops; state KeyRange keys = wait(getChangeFeedRange(db, cx, rangeID)); @@ -9861,10 +10109,29 @@ ACTOR Future purgeBlobGranulesActor(Reference db, state KeyRange purgeRange = range; state bool loadedTenantPrefix = false; + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + if (purgeVersion == latestVersion) { + loop { + try { + Version _purgeVersion = wait(tr.getReadVersion()); + purgeVersion = _purgeVersion; + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + tr.reset(); + } + if (purgeVersion <= 0) { + TraceEvent("PurgeInvalidVersion").detail("Range", range).detail("Version", purgeVersion).detail("Force", force); + throw unsupported_operation(); + } + loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); if (tenant.present() && !loadedTenantPrefix) { TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(&tr, range.begin)); @@ -9872,6 +10139,18 @@ ACTOR Future purgeBlobGranulesActor(Reference db, purgeRange = purgeRange.withPrefix(tenantEntry.prefix); 
} + // must be aligned to blob range(s) + state Future> beginPresent = tr.get(purgeRange.begin.withPrefix(blobRangeKeys.begin)); + state Future> endPresent = tr.get(purgeRange.end.withPrefix(blobRangeKeys.begin)); + wait(success(beginPresent) && success(endPresent)); + if (!beginPresent.get().present() || !endPresent.get().present()) { + TraceEvent("UnalignedPurge") + .detail("Range", range) + .detail("Version", purgeVersion) + .detail("Force", force); + throw unsupported_operation(); + } + Value purgeValue = blobGranulePurgeValueFor(purgeVersion, range, force); tr.atomicOp( addVersionStampAtEnd(blobGranulePurgeKeys.begin), purgeValue, MutationRef::SetVersionstampedKey); @@ -9941,29 +10220,71 @@ Future DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) { return waitPurgeGranulesCompleteActor(Reference::addRef(this), purgeKey); } +ACTOR Future>> getBlobRanges(Reference tr, + KeyRange range, + int batchLimit) { + state Standalone> blobRanges; + state Key beginKey = range.begin; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state RangeResult results = wait( + krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2)); + + blobRanges.arena().dependsOn(results.arena()); + for (int i = 0; i < results.size() - 1; i++) { + if (results[i].value == blobRangeActive) { + blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key)); + } + if (blobRanges.size() == batchLimit) { + return blobRanges; + } + } + + if (!results.more) { + return blobRanges; + } + beginKey = results.back().key; + } catch (Error& e) { + wait(tr->onError(e)); + } + } +} + ACTOR Future setBlobRangeActor(Reference cx, KeyRange range, bool active) { state Database db(cx); state Reference tr = makeReference(db); state Value value = active ? blobRangeActive : blobRangeInactive; - loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + Standalone> startBlobRanges = wait(getBlobRanges(tr, range, 1)); + if (active) { - state RangeResult results = wait(krmGetRanges(tr, blobRangeKeys.begin, range)); - ASSERT(results.size() >= 2); - if (results[0].key == range.begin && results[1].key == range.end && - results[0].value == blobRangeActive) { + // Idempotent request. + if (!startBlobRanges.empty()) { + return startBlobRanges.front().begin == range.begin && startBlobRanges.front().end == range.end; + } + } else { + // An unblobbify request must be aligned to boundaries. + // It is okay to unblobbify multiple regions all at once. + if (startBlobRanges.empty()) { + // already unblobbified return true; - } else { - for (int i = 0; i < results.size(); i++) { - if (results[i].value == blobRangeActive) { - return false; - } - } + } else if (startBlobRanges.front().begin != range.begin) { + // If there is a blob at the beginning of the range and it isn't aligned + return false; + } + // If the blob range does start at the specified key, we need to make sure the end is also a boundary of a + // blob range + Optional endPresent = wait(tr->get(range.end.withPrefix(blobRangeKeys.begin))); + if (!endPresent.present()) { + return false; + } } @@ -9971,10 +10292,6 @@ ACTOR Future setBlobRangeActor(Reference cx, KeyRange ran // This is not coalescing because we want to keep each range logically separate.
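Both the purge path and the unblobbify path above enforce the same alignment rule: a request is accepted only if its begin and end keys are already boundaries of existing blob ranges (the purge path throws unsupported_operation otherwise, and setBlobRangeActor returns false). A standalone sketch of that check, using an in-memory map with hypothetical sample keys in place of the transactional reads under the blobRangeKeys prefix:

#include <cstdio>
#include <map>
#include <string>

// Boundary key -> marker value, loosely mirroring the krm-style encoding kept under
// blobRangeKeys ("1" where an active blob range starts, "0" where one ends).
using BoundaryMap = std::map<std::string, std::string>;

// Both endpoints of the requested range must already be blob-range boundaries.
bool isAlignedRequest(const BoundaryMap& boundaries, const std::string& begin, const std::string& end) {
    return boundaries.count(begin) > 0 && boundaries.count(end) > 0;
}

int main() {
    BoundaryMap boundaries{ { "a", "1" }, { "f", "0" }, { "m", "1" }, { "z", "0" } };
    std::printf("[a, f) aligned: %d\n", isAlignedRequest(boundaries, "a", "f")); // 1: both keys are boundaries
    std::printf("[a, g) aligned: %d\n", isAlignedRequest(boundaries, "a", "g")); // 0: "g" is not a boundary
    return 0;
}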
wait(krmSetRange(tr, blobRangeKeys.begin, range, value)); wait(tr->commit()); - printf("Successfully updated blob range [%s - %s) to %s\n", - range.begin.printable().c_str(), - range.end.printable().c_str(), - value.printable().c_str()); return true; } catch (Error& e) { wait(tr->onError(e)); @@ -9995,29 +10312,10 @@ ACTOR Future>> listBlobbifiedRangesActor(Refer int rangeLimit) { state Database db(cx); state Reference tr = makeReference(db); - state Standalone> blobRanges; - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + state Standalone> blobRanges = wait(getBlobRanges(tr, range, rangeLimit)); - state RangeResult results = wait(krmGetRanges(tr, blobRangeKeys.begin, range, 2 * rangeLimit + 2)); - - blobRanges.arena().dependsOn(results.arena()); - for (int i = 0; i < results.size() - 1; i++) { - if (results[i].value == LiteralStringRef("1")) { - blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].value, results[i + 1].value)); - } - if (blobRanges.size() == rangeLimit) { - return blobRanges; - } - } - - return blobRanges; - } catch (Error& e) { - wait(tr->onError(e)); - } - } + return blobRanges; } Future>> DatabaseContext::listBlobbifiedRanges(KeyRange range, int rowLimit) { diff --git a/fdbclient/PaxosConfigTransaction.actor.cpp b/fdbclient/PaxosConfigTransaction.actor.cpp index 3e69df7227..b940aced7f 100644 --- a/fdbclient/PaxosConfigTransaction.actor.cpp +++ b/fdbclient/PaxosConfigTransaction.actor.cpp @@ -19,6 +19,7 @@ */ #include "fdbclient/DatabaseContext.h" +#include "fdbclient/MonitorLeader.h" #include "fdbclient/PaxosConfigTransaction.h" #include "flow/actorcompiler.h" // must be last include @@ -34,8 +35,9 @@ class CommitQuorum { Standalone> mutations; ConfigCommitAnnotation annotation; - ConfigTransactionCommitRequest getCommitRequest(ConfigGeneration generation) const { - return ConfigTransactionCommitRequest(generation, mutations, annotation); + ConfigTransactionCommitRequest getCommitRequest(ConfigGeneration generation, + CoordinatorsHash coordinatorsHash) const { + return ConfigTransactionCommitRequest(coordinatorsHash, generation, mutations, annotation); } void updateResult() { @@ -62,14 +64,16 @@ class CommitQuorum { ACTOR static Future addRequestActor(CommitQuorum* self, ConfigGeneration generation, + CoordinatorsHash coordinatorsHash, ConfigTransactionInterface cti) { try { if (cti.hostname.present()) { - wait(timeoutError(retryGetReplyFromHostname( - self->getCommitRequest(generation), cti.hostname.get(), WLTOKEN_CONFIGTXN_COMMIT), + wait(timeoutError(retryGetReplyFromHostname(self->getCommitRequest(generation, coordinatorsHash), + cti.hostname.get(), + WLTOKEN_CONFIGTXN_COMMIT), CLIENT_KNOBS->COMMIT_QUORUM_TIMEOUT)); } else { - wait(timeoutError(cti.commit.getReply(self->getCommitRequest(generation)), + wait(timeoutError(cti.commit.getReply(self->getCommitRequest(generation, coordinatorsHash)), CLIENT_KNOBS->COMMIT_QUORUM_TIMEOUT)); } ++self->successful; @@ -109,11 +113,11 @@ public: } void setTimestamp() { annotation.timestamp = now(); } size_t expectedSize() const { return annotation.expectedSize() + mutations.expectedSize(); } - Future commit(ConfigGeneration generation) { + Future commit(ConfigGeneration generation, CoordinatorsHash coordinatorsHash) { // Send commit message to all replicas, even those that did not return the used replica. // This way, slow replicas are kept up to date.
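The CommitQuorum and GetGenerationQuorum classes in this file share the same majority arithmetic: a request is fanned out to every coordinator, it succeeds once a strict majority (n / 2 + 1) agrees, and it can be abandoned early once the best agreement so far plus the replies still outstanding can no longer reach that majority. A standalone sketch of that arithmetic, with made-up reply counts:

#include <cstdio>

enum class QuorumState { Pending, Reached, Unreachable };

QuorumState evaluate(int totalReplicas, int bestAgreement, int repliesReceived) {
    const int majority = totalReplicas / 2 + 1;
    if (bestAgreement >= majority)
        return QuorumState::Reached;
    // Even if every outstanding reply agreed with the current best answer, a majority
    // could not be formed, so fail fast instead of waiting for the remaining timeouts.
    if (bestAgreement + (totalReplicas - repliesReceived) < majority)
        return QuorumState::Unreachable;
    return QuorumState::Pending;
}

int main() {
    // 5 coordinators, 2 matching replies so far, 3 still outstanding: keep waiting.
    std::printf("%d\n", static_cast<int>(evaluate(5, 2, 2))); // 0 (Pending)
    // 5 coordinators, best agreement 1 with only 1 reply outstanding: quorum unreachable.
    std::printf("%d\n", static_cast<int>(evaluate(5, 1, 4))); // 2 (Unreachable)
    return 0;
}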
for (const auto& cti : ctis) { - actors.add(addRequestActor(this, generation, cti)); + actors.add(addRequestActor(this, generation, coordinatorsHash, cti)); } return result.getFuture(); } @@ -122,11 +126,13 @@ public: class GetGenerationQuorum { ActorCollection actors{ false }; + CoordinatorsHash coordinatorsHash{ 0 }; std::vector ctis; std::map> seenGenerations; Promise result; size_t totalRepliesReceived{ 0 }; size_t maxAgreement{ 0 }; + Future coordinatorsChangedFuture; Optional lastSeenLiveVersion; Future getGenerationFuture; @@ -137,14 +143,15 @@ class GetGenerationQuorum { if (cti.hostname.present()) { wait(timeoutError(store(reply, retryGetReplyFromHostname( - ConfigTransactionGetGenerationRequest{ self->lastSeenLiveVersion }, + ConfigTransactionGetGenerationRequest{ self->coordinatorsHash, + self->lastSeenLiveVersion }, cti.hostname.get(), WLTOKEN_CONFIGTXN_GETGENERATION)), CLIENT_KNOBS->GET_GENERATION_QUORUM_TIMEOUT)); } else { wait(timeoutError(store(reply, - cti.getGeneration.getReply( - ConfigTransactionGetGenerationRequest{ self->lastSeenLiveVersion })), + cti.getGeneration.getReply(ConfigTransactionGetGenerationRequest{ + self->coordinatorsHash, self->lastSeenLiveVersion })), CLIENT_KNOBS->GET_GENERATION_QUORUM_TIMEOUT)); } @@ -155,6 +162,14 @@ class GetGenerationQuorum { auto& replicas = self->seenGenerations[gen]; replicas.push_back(cti); self->maxAgreement = std::max(replicas.size(), self->maxAgreement); + // TraceEvent("ConfigTransactionGotGenerationReply") + // .detail("From", cti.getGeneration.getEndpoint().getPrimaryAddress()) + // .detail("TotalRepliesReceived", self->totalRepliesReceived) + // .detail("ReplyGeneration", gen.toString()) + // .detail("Replicas", replicas.size()) + // .detail("Coordinators", self->ctis.size()) + // .detail("MaxAgreement", self->maxAgreement) + // .detail("LastSeenLiveVersion", self->lastSeenLiveVersion); if (replicas.size() >= self->ctis.size() / 2 + 1 && !self->result.isSet()) { self->result.send(gen); } else if (self->maxAgreement + (self->ctis.size() - self->totalRepliesReceived) < @@ -200,8 +215,18 @@ class GetGenerationQuorum { } catch (Error& e) { if (e.code() == error_code_failed_to_reach_quorum) { CODE_PROBE(true, "Failed to reach quorum getting generation"); - wait(delayJittered( - std::clamp(0.005 * (1 << retries), 0.0, CLIENT_KNOBS->TIMEOUT_RETRY_UPPER_BOUND))); + if (self->coordinatorsChangedFuture.isReady()) { + throw coordinators_changed(); + } + wait(delayJittered(std::clamp( + 0.005 * (1 << std::min(retries, 30)), 0.0, CLIENT_KNOBS->TIMEOUT_RETRY_UPPER_BOUND))); + if (deterministicRandom()->random01() < 0.05) { + // Randomly inject a delay of at least the generation + // reply timeout, to try to prevent contention between + // clients. 
+ wait(delay(CLIENT_KNOBS->GET_GENERATION_QUORUM_TIMEOUT * + (deterministicRandom()->random01() + 1.0))); + } ++retries; self->actors.clear(false); self->seenGenerations.clear(); @@ -217,9 +242,12 @@ class GetGenerationQuorum { public: GetGenerationQuorum() = default; - explicit GetGenerationQuorum(std::vector const& ctis, + explicit GetGenerationQuorum(CoordinatorsHash coordinatorsHash, + std::vector const& ctis, + Future coordinatorsChangedFuture, Optional const& lastSeenLiveVersion = {}) - : ctis(ctis), lastSeenLiveVersion(lastSeenLiveVersion) {} + : coordinatorsHash(coordinatorsHash), ctis(ctis), coordinatorsChangedFuture(coordinatorsChangedFuture), + lastSeenLiveVersion(lastSeenLiveVersion) {} Future getGeneration() { if (!getGenerationFuture.isValid()) { getGenerationFuture = getGenerationActor(this); @@ -240,12 +268,14 @@ public: }; class PaxosConfigTransactionImpl { + CoordinatorsHash coordinatorsHash{ 0 }; std::vector ctis; GetGenerationQuorum getGenerationQuorum; CommitQuorum commitQuorum; int numRetries{ 0 }; Optional dID; Database cx; + Future watchClusterFileFuture; ACTOR static Future> get(PaxosConfigTransactionImpl* self, Key key) { state ConfigKey configKey = ConfigKey::decodeKey(key); @@ -263,18 +293,19 @@ class PaxosConfigTransactionImpl { } wait(waitForAll(fs)); state Reference configNodes(new ConfigTransactionInfo(readReplicas)); - ConfigTransactionGetReply reply = - wait(timeoutError(basicLoadBalance(configNodes, - &ConfigTransactionInterface::get, - ConfigTransactionGetRequest{ generation, configKey }), - CLIENT_KNOBS->GET_KNOB_TIMEOUT)); + ConfigTransactionGetReply reply = wait(timeoutError( + basicLoadBalance(configNodes, + &ConfigTransactionInterface::get, + ConfigTransactionGetRequest{ self->coordinatorsHash, generation, configKey }), + CLIENT_KNOBS->GET_KNOB_TIMEOUT)); if (reply.value.present()) { return reply.value.get().toValue(); } else { return Optional{}; } } catch (Error& e) { - if (e.code() != error_code_timed_out && e.code() != error_code_broken_promise) { + if (e.code() != error_code_timed_out && e.code() != error_code_broken_promise && + e.code() != error_code_coordinators_changed) { throw; } self->reset(); @@ -283,58 +314,87 @@ class PaxosConfigTransactionImpl { } ACTOR static Future getConfigClasses(PaxosConfigTransactionImpl* self) { - state ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); - state std::vector readReplicas = self->getGenerationQuorum.getReadReplicas(); - std::vector> fs; - for (ConfigTransactionInterface& readReplica : readReplicas) { - if (readReplica.hostname.present()) { - fs.push_back(tryInitializeRequestStream( - &readReplica.getClasses, readReplica.hostname.get(), WLTOKEN_CONFIGTXN_GETCLASSES)); + loop { + try { + state ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); + state std::vector readReplicas = + self->getGenerationQuorum.getReadReplicas(); + std::vector> fs; + for (ConfigTransactionInterface& readReplica : readReplicas) { + if (readReplica.hostname.present()) { + fs.push_back(tryInitializeRequestStream( + &readReplica.getClasses, readReplica.hostname.get(), WLTOKEN_CONFIGTXN_GETCLASSES)); + } + } + wait(waitForAll(fs)); + state Reference configNodes(new ConfigTransactionInfo(readReplicas)); + ConfigTransactionGetConfigClassesReply reply = wait( + basicLoadBalance(configNodes, + &ConfigTransactionInterface::getClasses, + ConfigTransactionGetConfigClassesRequest{ self->coordinatorsHash, generation })); + RangeResult result; + result.reserve(result.arena(), 
reply.configClasses.size()); + for (const auto& configClass : reply.configClasses) { + result.push_back_deep(result.arena(), KeyValueRef(configClass, ""_sr)); + } + return result; + } catch (Error& e) { + if (e.code() != error_code_coordinators_changed) { + throw; + } + self->reset(); } } - wait(waitForAll(fs)); - state Reference configNodes(new ConfigTransactionInfo(readReplicas)); - ConfigTransactionGetConfigClassesReply reply = - wait(basicLoadBalance(configNodes, - &ConfigTransactionInterface::getClasses, - ConfigTransactionGetConfigClassesRequest{ generation })); - RangeResult result; - result.reserve(result.arena(), reply.configClasses.size()); - for (const auto& configClass : reply.configClasses) { - result.push_back_deep(result.arena(), KeyValueRef(configClass, ""_sr)); - } - return result; } ACTOR static Future getKnobs(PaxosConfigTransactionImpl* self, Optional configClass) { - state ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); - state std::vector readReplicas = self->getGenerationQuorum.getReadReplicas(); - std::vector> fs; - for (ConfigTransactionInterface& readReplica : readReplicas) { - if (readReplica.hostname.present()) { - fs.push_back(tryInitializeRequestStream( - &readReplica.getKnobs, readReplica.hostname.get(), WLTOKEN_CONFIGTXN_GETKNOBS)); + loop { + try { + state ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); + state std::vector readReplicas = + self->getGenerationQuorum.getReadReplicas(); + std::vector> fs; + for (ConfigTransactionInterface& readReplica : readReplicas) { + if (readReplica.hostname.present()) { + fs.push_back(tryInitializeRequestStream( + &readReplica.getKnobs, readReplica.hostname.get(), WLTOKEN_CONFIGTXN_GETKNOBS)); + } + } + wait(waitForAll(fs)); + state Reference configNodes(new ConfigTransactionInfo(readReplicas)); + ConfigTransactionGetKnobsReply reply = wait(basicLoadBalance( + configNodes, + &ConfigTransactionInterface::getKnobs, + ConfigTransactionGetKnobsRequest{ self->coordinatorsHash, generation, configClass })); + RangeResult result; + result.reserve(result.arena(), reply.knobNames.size()); + for (const auto& knobName : reply.knobNames) { + result.push_back_deep(result.arena(), KeyValueRef(knobName, ""_sr)); + } + return result; + } catch (Error& e) { + if (e.code() != error_code_coordinators_changed) { + throw; + } + self->reset(); } } - wait(waitForAll(fs)); - state Reference configNodes(new ConfigTransactionInfo(readReplicas)); - ConfigTransactionGetKnobsReply reply = - wait(basicLoadBalance(configNodes, - &ConfigTransactionInterface::getKnobs, - ConfigTransactionGetKnobsRequest{ generation, configClass })); - RangeResult result; - result.reserve(result.arena(), reply.knobNames.size()); - for (const auto& knobName : reply.knobNames) { - result.push_back_deep(result.arena(), KeyValueRef(knobName, ""_sr)); - } - return result; } ACTOR static Future commit(PaxosConfigTransactionImpl* self) { - ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); - self->commitQuorum.setTimestamp(); - wait(self->commitQuorum.commit(generation)); - return Void(); + loop { + try { + ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); + self->commitQuorum.setTimestamp(); + wait(self->commitQuorum.commit(generation, self->coordinatorsHash)); + return Void(); + } catch (Error& e) { + if (e.code() != error_code_coordinators_changed) { + throw; + } + self->reset(); + } + } } ACTOR static Future onError(PaxosConfigTransactionImpl* self, Error e) 
{ @@ -350,6 +410,20 @@ class PaxosConfigTransactionImpl { throw e; } + // Returns when the cluster interface updates with a new connection string. + ACTOR static Future watchClusterFile(Database cx) { + state Future leaderMonitor = + monitorLeader(cx->getConnectionRecord(), cx->statusClusterInterface); + state std::string connectionString = cx->getConnectionRecord()->getConnectionString().toString(); + + loop { + wait(cx->statusClusterInterface->onChange()); + if (cx->getConnectionRecord()->getConnectionString().toString() != connectionString) { + return Void(); + } + } + } + public: Future getReadVersion() { return map(getGenerationQuorum.getGeneration(), [](auto const& gen) { return gen.committedVersion; }); @@ -395,7 +469,25 @@ public: void debugTransaction(UID dID) { this->dID = dID; } void reset() { - getGenerationQuorum = GetGenerationQuorum{ ctis }; + ctis.clear(); + // Re-read connection string. If the cluster file changed, this will + // return the updated value. + const ClusterConnectionString& cs = cx->getConnectionRecord()->getConnectionString(); + ctis.reserve(cs.hostnames.size() + cs.coords.size()); + for (const auto& h : cs.hostnames) { + ctis.emplace_back(h); + } + for (const auto& c : cs.coords) { + ctis.emplace_back(c); + } + coordinatorsHash = std::hash()(cx->getConnectionRecord()->getConnectionString().toString()); + if (!cx->statusLeaderMon.isValid() || cx->statusLeaderMon.isReady()) { + cx->statusClusterInterface = makeReference>>(); + cx->statusLeaderMon = watchClusterFile(cx); + } + getGenerationQuorum = GetGenerationQuorum{ + coordinatorsHash, ctis, cx->statusLeaderMon, getGenerationQuorum.getLastSeenLiveVersion() + }; commitQuorum = CommitQuorum{ ctis }; } @@ -416,21 +508,10 @@ public: Future commit() { return commit(this); } - PaxosConfigTransactionImpl(Database const& cx) : cx(cx) { - const ClusterConnectionString& cs = cx->getConnectionRecord()->getConnectionString(); - ctis.reserve(cs.hostnames.size() + cs.coords.size()); - for (const auto& h : cs.hostnames) { - ctis.emplace_back(h); - } - for (const auto& c : cs.coords) { - ctis.emplace_back(c); - } - getGenerationQuorum = GetGenerationQuorum{ ctis }; - commitQuorum = CommitQuorum{ ctis }; - } + PaxosConfigTransactionImpl(Database const& cx) : cx(cx) { reset(); } PaxosConfigTransactionImpl(std::vector const& ctis) - : ctis(ctis), getGenerationQuorum(ctis), commitQuorum(ctis) {} + : ctis(ctis), getGenerationQuorum(0, ctis, Future()), commitQuorum(ctis) {} }; Future PaxosConfigTransaction::getReadVersion() { diff --git a/fdbclient/RYWIterator.cpp b/fdbclient/RYWIterator.cpp index 949f164485..3966df3748 100644 --- a/fdbclient/RYWIterator.cpp +++ b/fdbclient/RYWIterator.cpp @@ -231,28 +231,28 @@ void testSnapshotCache() { WriteMap writes(&arena); Standalone> keys; - keys.push_back_deep(keys.arena(), KeyValueRef(LiteralStringRef("d"), LiteralStringRef("doo"))); - keys.push_back_deep(keys.arena(), KeyValueRef(LiteralStringRef("e"), LiteralStringRef("eoo"))); - keys.push_back_deep(keys.arena(), KeyValueRef(LiteralStringRef("e\x00"), LiteralStringRef("zoo"))); - keys.push_back_deep(keys.arena(), KeyValueRef(LiteralStringRef("f"), LiteralStringRef("foo"))); - cache.insert(KeyRangeRef(LiteralStringRef("d"), LiteralStringRef("f\x00")), keys); + keys.push_back_deep(keys.arena(), KeyValueRef("d"_sr, "doo"_sr)); + keys.push_back_deep(keys.arena(), KeyValueRef("e"_sr, "eoo"_sr)); + keys.push_back_deep(keys.arena(), KeyValueRef("e\x00"_sr, "zoo"_sr)); + keys.push_back_deep(keys.arena(), KeyValueRef("f"_sr, "foo"_sr)); + 
cache.insert(KeyRangeRef("d"_sr, "f\x00"_sr), keys); - cache.insert(KeyRangeRef(LiteralStringRef("g"), LiteralStringRef("h")), Standalone>()); + cache.insert(KeyRangeRef("g"_sr, "h"_sr), Standalone>()); Standalone> keys2; - keys2.push_back_deep(keys2.arena(), KeyValueRef(LiteralStringRef("k"), LiteralStringRef("koo"))); - keys2.push_back_deep(keys2.arena(), KeyValueRef(LiteralStringRef("l"), LiteralStringRef("loo"))); - cache.insert(KeyRangeRef(LiteralStringRef("j"), LiteralStringRef("m")), keys2); + keys2.push_back_deep(keys2.arena(), KeyValueRef("k"_sr, "koo"_sr)); + keys2.push_back_deep(keys2.arena(), KeyValueRef("l"_sr, "loo"_sr)); + cache.insert(KeyRangeRef("j"_sr, "m"_sr), keys2); - writes.mutate(LiteralStringRef("c"), MutationRef::SetValue, LiteralStringRef("c--"), true); - writes.clear(KeyRangeRef(LiteralStringRef("c\x00"), LiteralStringRef("e")), true); - writes.mutate(LiteralStringRef("c\x00"), MutationRef::SetValue, LiteralStringRef("c00--"), true); + writes.mutate("c"_sr, MutationRef::SetValue, "c--"_sr, true); + writes.clear(KeyRangeRef("c\x00"_sr, "e"_sr), true); + writes.mutate("c\x00"_sr, MutationRef::SetValue, "c00--"_sr, true); WriteMap::iterator it3(&writes); - writes.mutate(LiteralStringRef("d"), MutationRef::SetValue, LiteralStringRef("d--"), true); - writes.mutate(LiteralStringRef("e"), MutationRef::SetValue, LiteralStringRef("e++"), true); - writes.mutate(LiteralStringRef("i"), MutationRef::SetValue, LiteralStringRef("i--"), true); + writes.mutate("d"_sr, MutationRef::SetValue, "d--"_sr, true); + writes.mutate("e"_sr, MutationRef::SetValue, "e++"_sr, true); + writes.mutate("i"_sr, MutationRef::SetValue, "i--"_sr, true); - KeyRange searchKeys = KeyRangeRef(LiteralStringRef("a"), LiteralStringRef("z")); + KeyRange searchKeys = KeyRangeRef("a"_sr, "z"_sr); RYWIterator it(&cache, &writes); it.skip(searchKeys.begin); @@ -425,7 +425,7 @@ TEST_CASE("/fdbclient/WriteMap/emptiness") { Arena arena = Arena(); WriteMap writes = WriteMap(&arena); ASSERT(writes.empty()); - writes.mutate(LiteralStringRef("apple"), MutationRef::SetValue, LiteralStringRef("red"), true); + writes.mutate("apple"_sr, MutationRef::SetValue, "red"_sr, true); ASSERT(!writes.empty()); return Void(); } @@ -457,11 +457,11 @@ TEST_CASE("/fdbclient/WriteMap/clear") { ASSERT(writes.empty()); ASSERT(getWriteMapCount(&writes) == 1); - writes.mutate(LiteralStringRef("apple"), MutationRef::SetValue, LiteralStringRef("red"), true); + writes.mutate("apple"_sr, MutationRef::SetValue, "red"_sr, true); ASSERT(!writes.empty()); ASSERT(getWriteMapCount(&writes) == 3); - KeyRangeRef range = KeyRangeRef(LiteralStringRef("a"), LiteralStringRef("j")); + KeyRangeRef range = KeyRangeRef("a"_sr, "j"_sr); writes.clear(range, true); ASSERT(getWriteMapCount(&writes) == 3); @@ -474,22 +474,19 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") { ASSERT(writes.empty()); ASSERT(getWriteMapCount(&writes) == 1); - writes.mutate(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00"), - MutationRef::SetVersionstampedKey, - LiteralStringRef("1"), - true); + writes.mutate("stamp:XXXXXXXX\x06\x00\x00\x00"_sr, MutationRef::SetVersionstampedKey, "1"_sr, true); ASSERT(!writes.empty()); ASSERT(getWriteMapCount(&writes) == 3); - writes.mutate(LiteralStringRef("stamp:ZZZZZZZZZZ"), MutationRef::AddValue, LiteralStringRef("2"), true); + writes.mutate("stamp:ZZZZZZZZZZ"_sr, MutationRef::AddValue, "2"_sr, true); ASSERT(getWriteMapCount(&writes) == 5); WriteMap::iterator it(&writes); it.skip(allKeys.begin); ASSERT(it.beginKey() < allKeys.end); - 
ASSERT(it.beginKey().compare(LiteralStringRef("")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00")) == 0); + ASSERT(it.beginKey().compare(""_sr) == 0); + ASSERT(it.endKey().compare("stamp:XXXXXXXX\x06\x00\x00\x00"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(!it.is_conflict_range()); ASSERT(!it.is_operation()); @@ -498,8 +495,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00\x00")) == 0); + ASSERT(it.beginKey().compare("stamp:XXXXXXXX\x06\x00\x00\x00"_sr) == 0); + ASSERT(it.endKey().compare("stamp:XXXXXXXX\x06\x00\x00\x00\x00"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(it.is_conflict_range()); ASSERT(it.is_operation()); @@ -509,8 +506,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00\x00")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ")) == 0); + ASSERT(it.beginKey().compare("stamp:XXXXXXXX\x06\x00\x00\x00\x00"_sr) == 0); + ASSERT(it.endKey().compare("stamp:ZZZZZZZZZZ"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(!it.is_conflict_range()); ASSERT(!it.is_operation()); @@ -519,8 +516,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ\x00")) == 0); + ASSERT(it.beginKey().compare("stamp:ZZZZZZZZZZ"_sr) == 0); + ASSERT(it.endKey().compare("stamp:ZZZZZZZZZZ\x00"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(it.is_conflict_range()); ASSERT(it.is_operation()); @@ -530,8 +527,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ\x00")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("\xff\xff")) == 0); + ASSERT(it.beginKey().compare("stamp:ZZZZZZZZZZ\x00"_sr) == 0); + ASSERT(it.endKey().compare("\xff\xff"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(!it.is_conflict_range()); ASSERT(!it.is_operation()); @@ -550,22 +547,19 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") { ASSERT(writes.empty()); ASSERT(getWriteMapCount(&writes) == 1); - writes.mutate(LiteralStringRef("stamp"), - MutationRef::SetVersionstampedValue, - LiteralStringRef("XXXXXXXX\x00\x00\x00\x00\x00\x00"), - true); + writes.mutate("stamp"_sr, MutationRef::SetVersionstampedValue, "XXXXXXXX\x00\x00\x00\x00\x00\x00"_sr, true); ASSERT(!writes.empty()); ASSERT(getWriteMapCount(&writes) == 3); - writes.mutate(LiteralStringRef("stamp123"), MutationRef::AddValue, LiteralStringRef("1"), true); + writes.mutate("stamp123"_sr, MutationRef::AddValue, "1"_sr, true); ASSERT(getWriteMapCount(&writes) == 5); WriteMap::iterator it(&writes); it.skip(allKeys.begin); ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp")) == 0); + ASSERT(it.beginKey().compare(""_sr) == 0); + ASSERT(it.endKey().compare("stamp"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(!it.is_conflict_range()); ASSERT(!it.is_operation()); @@ -574,8 +568,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") { ++it; 
ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp\x00")) == 0); + ASSERT(it.beginKey().compare("stamp"_sr) == 0); + ASSERT(it.endKey().compare("stamp\x00"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(it.is_conflict_range()); ASSERT(it.is_operation()); @@ -585,8 +579,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp\x00")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp123")) == 0); + ASSERT(it.beginKey().compare("stamp\x00"_sr) == 0); + ASSERT(it.endKey().compare("stamp123"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(!it.is_conflict_range()); ASSERT(!it.is_operation()); @@ -595,8 +589,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp123")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp123\x00")) == 0); + ASSERT(it.beginKey().compare("stamp123"_sr) == 0); + ASSERT(it.endKey().compare("stamp123\x00"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(it.is_conflict_range()); ASSERT(it.is_operation()); @@ -606,8 +600,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp123\x00")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("\xff\xff")) == 0); + ASSERT(it.beginKey().compare("stamp123\x00"_sr) == 0); + ASSERT(it.endKey().compare("\xff\xff"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(!it.is_conflict_range()); ASSERT(!it.is_operation()); @@ -626,10 +620,10 @@ TEST_CASE("/fdbclient/WriteMap/addValue") { ASSERT(writes.empty()); ASSERT(getWriteMapCount(&writes) == 1); - writes.mutate(LiteralStringRef("apple123"), MutationRef::SetValue, LiteralStringRef("17"), true); + writes.mutate("apple123"_sr, MutationRef::SetValue, "17"_sr, true); ASSERT(getWriteMapCount(&writes) == 3); - writes.mutate(LiteralStringRef("apple123"), MutationRef::AddValue, LiteralStringRef("1"), true); + writes.mutate("apple123"_sr, MutationRef::AddValue, "1"_sr, true); ASSERT(getWriteMapCount(&writes) == 3); return Void(); diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index e3772dfc39..965a9b59ca 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -459,7 +459,7 @@ public: if (!it.is_unreadable() && !it.is_unknown_range() && key.offset > 1) { *readThroughEnd = true; - key.setKey(maxKey); // maxKey is a KeyRef, but points to a LiteralStringRef. TODO: how can we ASSERT this? + key.setKey(maxKey); // maxKey is a KeyRef, but points to a literal. TODO: how can we ASSERT this? key.offset = 1; return; } @@ -681,7 +681,8 @@ public: break; if (it.is_unknown_range()) { - if (limits.hasByteLimit() && result.size() && itemsPastEnd >= 1 - end.offset) { + if (limits.hasByteLimit() && limits.hasSatisfiedMinRows() && result.size() && + itemsPastEnd >= 1 - end.offset) { result.more = true; break; } @@ -1213,7 +1214,7 @@ public: // isolation support. But it is not default and is rarely used. So we disallow it until we have thorough test // coverage for it.) 
if (snapshot) { - CODE_PROBE(true, "getMappedRange not supported for snapshot."); + CODE_PROBE(true, "getMappedRange not supported for snapshot.", probe::decoration::rare); throw unsupported_operation(); } // For now, getMappedRange requires read-your-writes being NOT disabled. But the support of RYW is limited @@ -1222,7 +1223,7 @@ public: // which returns the written value transparently. In another word, it makes sure not break RYW semantics without // actually implementing reading from the writes. if (ryw->options.readYourWritesDisabled) { - CODE_PROBE(true, "getMappedRange not supported for read-your-writes disabled."); + CODE_PROBE(true, "getMappedRange not supported for read-your-writes disabled.", probe::decoration::rare); throw unsupported_operation(); } @@ -1330,6 +1331,11 @@ public: ACTOR static void simulateTimeoutInFlightCommit(ReadYourWritesTransaction* ryw_) { state Reference ryw = Reference::addRef(ryw_); ASSERT(ryw->options.timeoutInSeconds > 0); + // An actual in-flight commit (i.e. one that's past the point where cancelling the transaction would stop it) + // would already have a read version. We need to get a read version too, otherwise committing a conflicting + // transaction may not ensure this transaction is no longer in-flight, since this transaction could get a read + // version _after_. + wait(success(ryw->getReadVersion())); if (!ryw->resetPromise.isSet()) ryw->resetPromise.sendError(transaction_timed_out()); wait(delay(deterministicRandom()->random01() * 5)); @@ -1543,7 +1549,7 @@ Future> ReadYourWritesTransaction::get(const Key& key, Snapshot return getDatabase()->specialKeySpace->get(this, key); } } else { - if (key == LiteralStringRef("\xff\xff/status/json")) { + if (key == "\xff\xff/status/json"_sr) { if (tr.getDatabase().getPtr() && tr.getDatabase()->getConnectionRecord()) { ++tr.getDatabase()->transactionStatusRequests; return getJSON(tr.getDatabase()); @@ -1552,7 +1558,7 @@ Future> ReadYourWritesTransaction::get(const Key& key, Snapshot } } - if (key == LiteralStringRef("\xff\xff/cluster_file_path")) { + if (key == "\xff\xff/cluster_file_path"_sr) { try { if (tr.getDatabase().getPtr() && tr.getDatabase()->getConnectionRecord()) { Optional output = StringRef(tr.getDatabase()->getConnectionRecord()->getLocation()); @@ -1564,7 +1570,7 @@ Future> ReadYourWritesTransaction::get(const Key& key, Snapshot return Optional(); } - if (key == LiteralStringRef("\xff\xff/connection_string")) { + if (key == "\xff\xff/connection_string"_sr) { try { if (tr.getDatabase().getPtr() && tr.getDatabase()->getConnectionRecord()) { Reference f = tr.getDatabase()->getConnectionRecord(); @@ -1626,7 +1632,7 @@ Future ReadYourWritesTransaction::getRange(KeySelector begin, return getDatabase()->specialKeySpace->getRange(this, begin, end, limits, reverse); } } else { - if (begin.getKey() == LiteralStringRef("\xff\xff/worker_interfaces")) { + if (begin.getKey() == "\xff\xff/worker_interfaces"_sr) { if (tr.getDatabase().getPtr() && tr.getDatabase()->getConnectionRecord()) { return getWorkerInterfaces(tr.getDatabase()->getConnectionRecord()); } else { @@ -1648,7 +1654,7 @@ Future ReadYourWritesTransaction::getRange(KeySelector begin, // This optimization prevents nullptr operations from being added to the conflict range if (limits.isReached()) { - CODE_PROBE(true, "RYW range read limit 0"); + CODE_PROBE(true, "RYW range read limit 0", probe::decoration::rare); return RangeResult(); } @@ -1662,7 +1668,7 @@ Future ReadYourWritesTransaction::getRange(KeySelector begin, 
end.removeOrEqual(end.arena()); if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) { - CODE_PROBE(true, "RYW range inverted"); + CODE_PROBE(true, "RYW range inverted", probe::decoration::rare); return RangeResult(); } @@ -1696,7 +1702,7 @@ Future ReadYourWritesTransaction::getMappedRange(KeySelector throw client_invalid_operation(); // Not support special keys. } } else { - if (begin.getKey() == LiteralStringRef("\xff\xff/worker_interfaces")) { + if (begin.getKey() == "\xff\xff/worker_interfaces"_sr) { throw client_invalid_operation(); // Not support special keys. } } @@ -1825,6 +1831,32 @@ Future>> ReadYourWritesTransaction::re return waitOrError(tr.readBlobGranules(range, begin, readVersion, readVersionOut), resetPromise.getFuture()); } +Future>> ReadYourWritesTransaction::summarizeBlobGranules( + const KeyRange& range, + Optional summaryVersion, + int rangeLimit) { + + if (checkUsedDuringCommit()) { + return used_during_commit(); + } + + if (resetPromise.isSet()) + return resetPromise.getFuture().getError(); + + KeyRef maxKey = getMaxReadKey(); + if (range.begin > maxKey || range.end > maxKey) + return key_outside_legal_range(); + + return waitOrError(tr.summarizeBlobGranules(range, summaryVersion, rangeLimit), resetPromise.getFuture()); +} + +void ReadYourWritesTransaction::addGranuleMaterializeStats(const GranuleMaterializeStats& stats) { + if (checkUsedDuringCommit()) { + throw used_during_commit(); + } + tr.addGranuleMaterializeStats(stats); +} + void ReadYourWritesTransaction::addReadConflictRange(KeyRangeRef const& keys) { if (checkUsedDuringCommit()) { throw used_during_commit(); @@ -2016,22 +2048,20 @@ RangeResult ReadYourWritesTransaction::getReadConflictRangeIntersecting(KeyRange if (kr.begin <= iter->begin() && iter->begin() < kr.end) { result.push_back(result.arena(), KeyValueRef(iter->begin().withPrefix(readConflictRangeKeysRange.begin, result.arena()), - iter->value() ? LiteralStringRef("1") : LiteralStringRef("0"))); + iter->value() ? 
"1"_sr : "0"_sr)); } } } else { - CoalescedKeyRefRangeMap readConflicts{ LiteralStringRef("0"), specialKeys.end }; + CoalescedKeyRefRangeMap readConflicts{ "0"_sr, specialKeys.end }; for (const auto& range : tr.readConflictRanges()) - readConflicts.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + readConflicts.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()), "1"_sr); for (const auto& range : nativeReadRanges) - readConflicts.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + readConflicts.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()), "1"_sr); for (const auto& f : tr.getExtraReadConflictRanges()) { if (f.isReady() && f.get().first < f.get().second) readConflicts.insert(KeyRangeRef(f.get().first, f.get().second) .withPrefix(readConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + "1"_sr); } auto beginIter = readConflicts.rangeContaining(kr.begin); if (beginIter->begin() != kr.begin) @@ -2049,7 +2079,7 @@ RangeResult ReadYourWritesTransaction::getWriteConflictRangeIntersecting(KeyRang RangeResult result; // Memory owned by result - CoalescedKeyRefRangeMap writeConflicts{ LiteralStringRef("0"), specialKeys.end }; + CoalescedKeyRefRangeMap writeConflicts{ "0"_sr, specialKeys.end }; if (!options.readYourWritesDisabled) { KeyRangeRef strippedWriteRangePrefix = kr.removePrefix(writeConflictRangeKeysRange.begin); @@ -2062,15 +2092,13 @@ RangeResult ReadYourWritesTransaction::getWriteConflictRangeIntersecting(KeyRang writeConflicts.insert( KeyRangeRef(it.beginKey().toArena(result.arena()), it.endKey().toArena(result.arena())) .withPrefix(writeConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + "1"_sr); } } else { for (const auto& range : tr.writeConflictRanges()) - writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()), "1"_sr); for (const auto& range : nativeWriteRanges) - writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()), "1"_sr); } for (const auto& k : versionStampKeys) { @@ -2090,8 +2118,7 @@ RangeResult ReadYourWritesTransaction::getWriteConflictRangeIntersecting(KeyRang } else { range = getVersionstampKeyRange(result.arena(), k, tr.getCachedReadVersion().orDefault(0), getMaxReadKey()); } - writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()), "1"_sr); } auto beginIter = writeConflicts.rangeContaining(kr.begin); @@ -2137,13 +2164,13 @@ void ReadYourWritesTransaction::atomicOp(const KeyRef& key, const ValueRef& oper KeyRef k; if (!tr.apiVersionAtLeast(520) && operationType == MutationRef::SetVersionstampedKey) { - k = key.withSuffix(LiteralStringRef("\x00\x00"), arena); + k = key.withSuffix("\x00\x00"_sr, arena); } else { k = KeyRef(arena, key); } ValueRef v; if (!tr.apiVersionAtLeast(520) && operationType == MutationRef::SetVersionstampedValue) { - v = operand.withSuffix(LiteralStringRef("\x00\x00\x00\x00"), arena); + v = operand.withSuffix("\x00\x00\x00\x00"_sr, arena); } else { v = ValueRef(arena, operand); } @@ -2195,17 +2222,17 
@@ void ReadYourWritesTransaction::set(const KeyRef& key, const ValueRef& value) { } else { // These three special keys are deprecated in 7.0 and an alternative C API is added // TODO : Rewrite related code using C api - if (key == LiteralStringRef("\xff\xff/reboot_worker")) { + if (key == "\xff\xff/reboot_worker"_sr) { BinaryReader::fromStringRef(value, IncludeVersion()) .reboot.send(RebootRequest()); return; } - if (key == LiteralStringRef("\xff\xff/suspend_worker")) { + if (key == "\xff\xff/suspend_worker"_sr) { BinaryReader::fromStringRef(value, IncludeVersion()) .reboot.send(RebootRequest(false, false, options.timeoutInSeconds)); return; } - if (key == LiteralStringRef("\xff\xff/reboot_and_check_worker")) { + if (key == "\xff\xff/reboot_and_check_worker"_sr) { BinaryReader::fromStringRef(value, IncludeVersion()) .reboot.send(RebootRequest(false, true)); return; diff --git a/fdbclient/S3BlobStore.actor.cpp b/fdbclient/S3BlobStore.actor.cpp index 8054b778c8..b4f5e9a9b7 100644 --- a/fdbclient/S3BlobStore.actor.cpp +++ b/fdbclient/S3BlobStore.actor.cpp @@ -98,7 +98,7 @@ S3BlobStoreEndpoint::BlobKnobs::BlobKnobs() { bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) { #define TRY_PARAM(n, sn) \ - if (name == LiteralStringRef(#n) || name == LiteralStringRef(#sn)) { \ + if (name == #n || name == #sn) { \ n = value; \ return true; \ } @@ -109,7 +109,7 @@ bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) { TRY_PARAM(request_tries, rt); TRY_PARAM(request_timeout_min, rtom); // TODO: For backward compatibility because request_timeout was renamed to request_timeout_min - if (name == LiteralStringRef("request_timeout") || name == LiteralStringRef("rto")) { + if (name == "request_timeout"_sr || name == "rto"_sr) { request_timeout_min = value; return true; } @@ -187,7 +187,7 @@ std::string guessRegionFromDomain(std::string domain) { StringRef h(domain.c_str() + p); - if (!h.startsWith(LiteralStringRef("oss-"))) { + if (!h.startsWith("oss-"_sr)) { h.eat(service); // ignore s3 service } @@ -208,7 +208,7 @@ Reference S3BlobStoreEndpoint::fromString(const std::string try { StringRef t(url); StringRef prefix = t.eat("://"); - if (prefix != LiteralStringRef("blobstore")) + if (prefix != "blobstore"_sr) throw format("Invalid blobstore URL prefix '%s'", prefix.toString().c_str()); Optional proxyHost, proxyPort; @@ -261,7 +261,7 @@ Reference S3BlobStoreEndpoint::fromString(const std::string StringRef value = t.eat("&"); // Special case for header - if (name == LiteralStringRef("header")) { + if (name == "header"_sr) { StringRef originalValue = value; StringRef headerFieldName = value.eat(":"); StringRef headerFieldValue = value; @@ -282,7 +282,7 @@ Reference S3BlobStoreEndpoint::fromString(const std::string } // overwrite s3 region from parameter - if (name == LiteralStringRef("region")) { + if (name == "region"_sr) { region = value.toString(); continue; } @@ -476,7 +476,7 @@ ACTOR Future deleteRecursively_impl(Reference b, state Future done = b->listObjectsStream(bucket, resultStream, prefix, '/', std::numeric_limits::max()); // Wrap done in an actor which will send end_of_stream since listObjectsStream() does not (so that many calls can // write to the same stream) - done = map(done, [=](Void) { + done = map(done, [=](Void) mutable { resultStream.sendError(end_of_stream()); return Void(); }); @@ -735,16 +735,21 @@ ACTOR Future connect_impl(Referenceknobs.secure_connection ? 
"https" : "http"; } bool isTLS = b->knobs.secure_connection == 1; + state Reference conn; if (b->useProxy) { - // TODO(renxuan): Support http proxy + TLS - if (isTLS || b->service == "443") { - fprintf(stderr, "ERROR: TLS is not supported yet when using HTTP proxy.\n"); - throw connection_failed(); + if (isTLS) { + Reference _conn = + wait(HTTP::proxyConnect(host, service, b->proxyHost.get(), b->proxyPort.get())); + conn = _conn; + } else { + host = b->proxyHost.get(); + service = b->proxyPort.get(); + Reference _conn = wait(INetworkConnections::net()->connect(host, service, false)); + conn = _conn; } - host = b->proxyHost.get(); - service = b->proxyPort.get(); + } else { + wait(store(conn, INetworkConnections::net()->connect(host, service, isTLS))); } - state Reference conn = wait(INetworkConnections::net()->connect(host, service, isTLS)); wait(conn->connectHandshake()); TraceEvent("S3BlobStoreEndpointNewConnection") @@ -892,7 +897,7 @@ ACTOR Future> doRequest_impl(ReferenceuseProxy) { + if (bstore->useProxy && bstore->knobs.secure_connection == 0) { // Has to be in absolute-form. canonicalURI = "http://" + bstore->host + ":" + bstore->service + canonicalURI; } @@ -1188,7 +1193,7 @@ ACTOR Future listObjects_impl(ReferencelistObjectsStream(bucket, resultStream, prefix, delimiter, maxDepth, recurseFilter); // Wrap done in an actor which sends end_of_stream because list does not so that many lists can write to the same // stream - done = map(done, [=](Void) { + done = map(done, [=](Void) mutable { resultStream.sendError(end_of_stream()); return Void(); }); @@ -1423,7 +1428,7 @@ void S3BlobStoreEndpoint::setV4AuthHeaders(std::string const& verb, if (headers.find("Content-MD5") != headers.end()) headersList.push_back({ "content-md5", trim_copy(headers["Content-MD5"]) + "\n" }); for (auto h : headers) { - if (StringRef(h.first).startsWith(LiteralStringRef("x-amz"))) + if (StringRef(h.first).startsWith("x-amz"_sr)) headersList.push_back({ to_lower_copy(h.first), trim_copy(h.second) + "\n" }); } std::sort(headersList.begin(), headersList.end()); @@ -1484,7 +1489,7 @@ void S3BlobStoreEndpoint::setAuthHeaders(std::string const& verb, std::string co msg.append("\n"); for (auto h : headers) { StringRef name = h.first; - if (name.startsWith(LiteralStringRef("x-amz")) || name.startsWith(LiteralStringRef("x-icloud"))) { + if (name.startsWith("x-amz"_sr) || name.startsWith("x-icloud"_sr)) { msg.append(h.first); msg.append(":"); msg.append(h.second); @@ -1754,7 +1759,7 @@ Future S3BlobStoreEndpoint::finishMultiPartUpload(std::string const& bucke } TEST_CASE("/backup/s3/v4headers") { - S3BlobStoreEndpoint::Credentials creds{ "AKIAIOSFODNN7EXAMPLE", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", "" } + S3BlobStoreEndpoint::Credentials creds{ "AKIAIOSFODNN7EXAMPLE", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", "" }; // GET without query parameters { S3BlobStoreEndpoint s3("s3.amazonaws.com", "443", "amazonaws", "proxy", "port", creds); diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 7f3a3c658b..5576d22783 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -21,7 +21,7 @@ #include "fdbclient/Schemas.h" // NOTE: also change mr-status-json-schemas.rst.inc -const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( +const KeyRef JSONSchemas::statusSchema = R"statusSchema( { "cluster":{ "storage_wiggler": { @@ -137,6 +137,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "blob_manager", "blob_worker", "encrypt_key_proxy", + "consistency_scan", 
"storage_cache", "router", "coordinator" @@ -561,6 +562,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "unreachable_ratekeeper_worker", "unreachable_blobManager_worker", "unreachable_encryptKeyProxy_worker", + "unreachable_consistencyScan_worker", "unreadable_configuration", "full_replication_timeout", "client_issues", @@ -600,7 +602,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( } ], )statusSchema" - R"statusSchema( + R"statusSchema( "recovery_state":{ "seconds_since_last_recovered":1, "required_resolvers":1, @@ -848,8 +850,26 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "disabled", "optional_experimental", "required_experimental" + ]}, + "encryption_at_rest_mode": { + "$enum":[ + "disabled", + "aes_256_ctr" ]} }, + "consistency_scan_info":{ + "consistency_scan_enabled":false, + "restart":false, + "max_rate":0, + "target_interval":0, + "bytes_read_prev_round":0, + "last_round_start_datetime":"2022-04-20 00:05:05.123 +0000", + "last_round_finish_datetime":"1970-01-01 00:00:00.000 +0000", + "last_round_start_timestamp":1648857905.123, + "last_round_finish_timestamp":0, + "smoothed_round_seconds":1, + "finished_rounds":1 + }, "data":{ "least_operating_space_bytes_log_server":0, "average_partition_size_bytes":0, @@ -944,6 +964,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( }, "tenants":{ "num_tenants":0 + }, + "metacluster" : { + "cluster_type" : "standalone" } }, "client":{ @@ -985,9 +1008,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "up_to_date":true } } -})statusSchema"); +})statusSchema"_sr; -const KeyRef JSONSchemas::clusterConfigurationSchema = LiteralStringRef(R"configSchema( +const KeyRef JSONSchemas::clusterConfigurationSchema = R"configSchema( { "create":{ "$enum":[ @@ -1057,9 +1080,9 @@ const KeyRef JSONSchemas::clusterConfigurationSchema = LiteralStringRef(R"config "auto_logs":3, "commit_proxies":5, "grv_proxies":1 -})configSchema"); +})configSchema"_sr; -const KeyRef JSONSchemas::latencyBandConfigurationSchema = LiteralStringRef(R"configSchema( +const KeyRef JSONSchemas::latencyBandConfigurationSchema = R"configSchema( { "get_read_version":{ "bands":[ @@ -1079,30 +1102,30 @@ const KeyRef JSONSchemas::latencyBandConfigurationSchema = LiteralStringRef(R"co ], "max_commit_bytes":0 } -})configSchema"); +})configSchema"_sr; -const KeyRef JSONSchemas::dataDistributionStatsSchema = LiteralStringRef(R"""( +const KeyRef JSONSchemas::dataDistributionStatsSchema = R"""( { "shard_bytes": 1947000 } -)"""); +)"""_sr; -const KeyRef JSONSchemas::logHealthSchema = LiteralStringRef(R"""( +const KeyRef JSONSchemas::logHealthSchema = R"""( { "log_queue": 156 } -)"""); +)"""_sr; -const KeyRef JSONSchemas::storageHealthSchema = LiteralStringRef(R"""( +const KeyRef JSONSchemas::storageHealthSchema = R"""( { "cpu_usage": 3.28629447047675, "disk_usage": 0.19997897369207954, "storage_durability_lag": 5050809, "storage_queue": 2030 } -)"""); +)"""_sr; -const KeyRef JSONSchemas::aggregateHealthSchema = LiteralStringRef(R"""( +const KeyRef JSONSchemas::aggregateHealthSchema = R"""( { "batch_limited": false, "limiting_storage_durability_lag": 5050809, @@ -1112,12 +1135,12 @@ const KeyRef JSONSchemas::aggregateHealthSchema = LiteralStringRef(R"""( "worst_storage_queue": 2030, "worst_log_queue": 156 } -)"""); +)"""_sr; -const KeyRef JSONSchemas::managementApiErrorSchema = LiteralStringRef(R"""( +const KeyRef JSONSchemas::managementApiErrorSchema = 
R"""( { "retriable": false, "command": "exclude", "message": "The reason of the error" } -)"""); +)"""_sr; diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 58cfc1c866..6a59fb99dc 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -19,6 +19,7 @@ */ #include "fdbclient/ServerKnobs.h" +#include "flow/CompressionUtils.h" #include "flow/IRandom.h" #include "flow/flow.h" @@ -89,7 +90,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MAX_CACHE_VERSIONS, 10e6 ); init( TLOG_IGNORE_POP_AUTO_ENABLE_DELAY, 300.0 ); init( TXS_POPPED_MAX_DELAY, 1.0 ); if ( randomize && BUGGIFY ) TXS_POPPED_MAX_DELAY = deterministicRandom()->random01(); - init( TLOG_MAX_CREATE_DURATION, 10.0 ); + // In some rare simulation tests, particularly with log_spill:=1 configured, the 10 second limit is exceeded, causing SevError trace events + // and simulation test failure. Increasing the knob value to 15.0 in simulation is a workaround to avoid these failures. + init( TLOG_MAX_CREATE_DURATION, 10.0 ); if (isSimulated) TLOG_MAX_CREATE_DURATION = 15.0; init( PEEK_LOGGING_AMOUNT, 5 ); init( PEEK_LOGGING_DELAY, 5.0 ); init( PEEK_RESET_INTERVAL, 300.0 ); if ( randomize && BUGGIFY ) PEEK_RESET_INTERVAL = 20.0; @@ -159,9 +162,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( PRIORITY_TEAM_FAILED, 805 ); init( PRIORITY_TEAM_0_LEFT, 809 ); init( PRIORITY_SPLIT_SHARD, 950 ); if( randomize && BUGGIFY ) PRIORITY_SPLIT_SHARD = 350; + init( PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD, 960 ); if( randomize && BUGGIFY ) PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD = 360; // Set as the lowest priority // Data distribution init( SHARD_ENCODE_LOCATION_METADATA, false ); if( randomize && BUGGIFY ) SHARD_ENCODE_LOCATION_METADATA = true; + init( ENABLE_DD_PHYSICAL_SHARD, false ); // EXPERIMENTAL; If true, SHARD_ENCODE_LOCATION_METADATA must be true; When true, optimization of data move between DCs is disabled + init( MAX_PHYSICAL_SHARD_BYTES, 500000000 ); // 500 MB; for ENABLE_DD_PHYSICAL_SHARD; smaller leads to larger number of physicalShard per storage server + init( PHYSICAL_SHARD_METRICS_DELAY, 300.0 ); // 300 seconds; for ENABLE_DD_PHYSICAL_SHARD + init( ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME, 600.0 ); if( randomize && BUGGIFY ) ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME = 0.0; // 600 seconds; for ENABLE_DD_PHYSICAL_SHARD init( READ_REBALANCE_CPU_THRESHOLD, 15.0 ); init( READ_REBALANCE_SRC_PARALLELISM, 20 ); init( READ_REBALANCE_SHARD_TOPK, READ_REBALANCE_SRC_PARALLELISM * 2 ); @@ -269,7 +277,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) DD_FAILURE_TIME = 10.0; init( DD_ZERO_HEALTHY_TEAM_DELAY, 1.0 ); init( REMOTE_KV_STORE, false ); - init( REMOTE_KV_STORE_INIT_DELAY, 0.1 ); + init( REBOOT_KV_STORE_DELAY, 0.1 ); init( REMOTE_KV_STORE_MAX_INIT_DURATION, 10.0 ); init( REBALANCE_MAX_RETRIES, 100 ); init( DD_OVERLAP_PENALTY, 10000 ); @@ -287,7 +295,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 20 ); init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 
0: 120; init( DD_TENANT_AWARENESS_ENABLED, false ); - init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2.0 ); + init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); + // TeamRemover init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true @@ -365,19 +374,27 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REPLACE_CONTENTS_BYTES, 1e5 ); // KeyValueStoreRocksDB + init( ROCKSDB_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES, true ); if( randomize && BUGGIFY ) ROCKSDB_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES = false; + init( ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE, true ); if( randomize && BUGGIFY ) ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE = false; + init( ROCKSDB_READ_RANGE_ROW_LIMIT, 65535 ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_ROW_LIMIT = deterministicRandom()->randomInt(2, 10); + init( ROCKSDB_READER_THREAD_PRIORITY, 0 ); + init( ROCKSDB_WRITER_THREAD_PRIORITY, 0 ); init( ROCKSDB_BACKGROUND_PARALLELISM, 4 ); init( ROCKSDB_READ_PARALLELISM, 4 ); // Use a smaller memtable in simulation to avoid OOMs. int64_t memtableBytes = isSimulated ? 32 * 1024 : 512 * 1024 * 1024; init( ROCKSDB_MEMTABLE_BYTES, memtableBytes ); + init( ROCKSDB_LEVEL_STYLE_COMPACTION, true ); init( ROCKSDB_UNSAFE_AUTO_FSYNC, false ); init( ROCKSDB_PERIODIC_COMPACTION_SECONDS, 0 ); init( ROCKSDB_PREFIX_LEN, 0 ); - init( ROCKSDB_BLOCK_CACHE_SIZE, 0 ); + // If rocksdb block cache size is 0, the default 8MB is used. + int64_t blockCacheSize = isSimulated ? 0 : 1024 * 1024 * 1024 /* 1GB */; + init( ROCKSDB_BLOCK_CACHE_SIZE, blockCacheSize ); init( ROCKSDB_METRICS_DELAY, 60.0 ); - init( ROCKSDB_READ_VALUE_TIMEOUT, 5.0 ); - init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, 5.0 ); - init( ROCKSDB_READ_RANGE_TIMEOUT, 5.0 ); + init( ROCKSDB_READ_VALUE_TIMEOUT, isSimulated ? 5.0 : 200.0 ); + init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, isSimulated ? 5.0 : 200.0 ); + init( ROCKSDB_READ_RANGE_TIMEOUT, isSimulated ? 5.0 : 200.0 ); init( ROCKSDB_READ_QUEUE_WAIT, 1.0 ); init( ROCKSDB_READ_QUEUE_HARD_MAX, 1000 ); init( ROCKSDB_READ_QUEUE_SOFT_MAX, 500 ); @@ -391,6 +408,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // If true, enables dynamic adjustment of ROCKSDB_WRITE_RATE_LIMITER_BYTES according to the recent demand of background IO. init( ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE, true ); init( DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY, "fdb"); + init( ROCKSDB_DISABLE_AUTO_COMPACTIONS, false ); // RocksDB default init( ROCKSDB_PERFCONTEXT_ENABLE, false ); if( randomize && BUGGIFY ) ROCKSDB_PERFCONTEXT_ENABLE = deterministicRandom()->coinflip() ? false : true; init( ROCKSDB_PERFCONTEXT_SAMPLE_RATE, 0.0001 ); @@ -399,6 +417,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ROCKSDB_SOFT_PENDING_COMPACT_BYTES_LIMIT, 64000000000 ); // 64GB, Rocksdb option, Writes will slow down. init( ROCKSDB_HARD_PENDING_COMPACT_BYTES_LIMIT, 100000000000 ); // 100GB, Rocksdb option, Writes will stall. init( ROCKSDB_CAN_COMMIT_COMPACT_BYTES_LIMIT, 50000000000 ); // 50GB, Commit waits. + // Enable this knob only for experminatal purpose, never enable this in production. + // If enabled, all the committed in-memory memtable writes are lost on a crash. 
+ init( ROCKSDB_DISABLE_WAL_EXPERIMENTAL, false ); // Can commit will delay ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD seconds for // ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD times, if rocksdb overloaded. // Set ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD to 0, to disable @@ -408,9 +429,11 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ROCKSDB_BLOCK_SIZE, 32768 ); // 32 KB, size of the block in rocksdb cache. init( ENABLE_SHARDED_ROCKSDB, false ); init( ROCKSDB_WRITE_BUFFER_SIZE, 1 << 30 ); // 1G + init( ROCKSDB_CF_WRITE_BUFFER_SIZE, 64 << 20 ); // 64M, RocksDB default. init( ROCKSDB_MAX_TOTAL_WAL_SIZE, 0 ); // RocksDB default. init( ROCKSDB_MAX_BACKGROUND_JOBS, 2 ); // RocksDB default. init( ROCKSDB_DELETE_OBSOLETE_FILE_PERIOD, 21600 ); // 6h, RocksDB default. + init( ROCKSDB_PHYSICAL_SHARD_CLEAN_UP_DELAY, isSimulated ? 10.0 : 300.0 ); // Delays shard clean up, must be larger than ROCKSDB_READ_VALUE_TIMEOUT to prevent reading deleted shard. // Leader election bool longLeaderElection = randomize && BUGGIFY; @@ -474,7 +497,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REPORT_TRANSACTION_COST_ESTIMATION_DELAY, 0.1 ); init( PROXY_REJECT_BATCH_QUEUED_TOO_LONG, true ); - bool buggfyUseResolverPrivateMutations = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR_TLOG_UNICAST; + bool buggfyUseResolverPrivateMutations = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR_TLOG_UNICAST; init( PROXY_USE_RESOLVER_PRIVATE_MUTATIONS, false ); if( buggfyUseResolverPrivateMutations ) PROXY_USE_RESOLVER_PRIVATE_MUTATIONS = deterministicRandom()->coinflip(); init( RESET_MASTER_BATCHES, 200 ); @@ -534,6 +557,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ATTEMPT_RECRUITMENT_DELAY, 0.035 ); init( WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 1.0 ); init( WAIT_FOR_RATEKEEPER_JOIN_DELAY, 1.0 ); + init( WAIT_FOR_CONSISTENCYSCAN_JOIN_DELAY, 1.0 ); init( WAIT_FOR_BLOB_MANAGER_JOIN_DELAY, 1.0 ); init( WAIT_FOR_ENCRYPT_KEY_PROXY_JOIN_DELAY, 1.0 ); init( WORKER_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) WORKER_FAILURE_TIME = 10.0; @@ -544,6 +568,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( CHECK_REMOTE_HEALTH_INTERVAL, 60 ); init( FORCE_RECOVERY_CHECK_DELAY, 5.0 ); init( RATEKEEPER_FAILURE_TIME, 1.0 ); + init( CONSISTENCYSCAN_FAILURE_TIME, 1.0 ); init( BLOB_MANAGER_FAILURE_TIME, 1.0 ); init( REPLACE_INTERFACE_DELAY, 60.0 ); init( REPLACE_INTERFACE_CHECK_DELAY, 5.0 ); @@ -614,6 +639,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( RATEKEEPER_PRINT_LIMIT_REASON, false ); if( randomize && BUGGIFY ) RATEKEEPER_PRINT_LIMIT_REASON = true; init( RATEKEEPER_MIN_RATE, 0.0 ); init( RATEKEEPER_MAX_RATE, 1e9 ); + init( RATEKEEPER_BATCH_MIN_RATE, 0.0 ); + init( RATEKEEPER_BATCH_MAX_RATE, 1e9 ); bool smallStorageTarget = randomize && BUGGIFY; init( TARGET_BYTES_PER_STORAGE_SERVER, 1000e6 ); if( smallStorageTarget ) TARGET_BYTES_PER_STORAGE_SERVER = 3000e3; @@ -623,9 +650,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( SPRING_BYTES_STORAGE_SERVER_BATCH, 100e6 ); if( smallStorageTarget ) SPRING_BYTES_STORAGE_SERVER_BATCH = 150e3; init( STORAGE_HARD_LIMIT_BYTES, 1500e6 ); if( smallStorageTarget ) STORAGE_HARD_LIMIT_BYTES = 4500e3; init( STORAGE_HARD_LIMIT_BYTES_OVERAGE, 5000e3 ); if( smallStorageTarget ) STORAGE_HARD_LIMIT_BYTES_OVERAGE = 100e3; // byte+version overage ensures storage server 
makes enough progress on freeing up storage queue memory at hard limit by ensuring it advances desiredOldestVersion enough per commit cycle. + init( STORAGE_HARD_LIMIT_BYTES_SPEED_UP_SIM, STORAGE_HARD_LIMIT_BYTES ); if( smallStorageTarget ) STORAGE_HARD_LIMIT_BYTES_SPEED_UP_SIM *= 10; + init( STORAGE_HARD_LIMIT_BYTES_OVERAGE_SPEED_UP_SIM, STORAGE_HARD_LIMIT_BYTES_OVERAGE ); if( smallStorageTarget ) STORAGE_HARD_LIMIT_BYTES_OVERAGE_SPEED_UP_SIM *= 10; init( STORAGE_HARD_LIMIT_VERSION_OVERAGE, VERSIONS_PER_SECOND / 4.0 ); init( STORAGE_DURABILITY_LAG_HARD_MAX, 2000e6 ); if( smallStorageTarget ) STORAGE_DURABILITY_LAG_HARD_MAX = 100e6; init( STORAGE_DURABILITY_LAG_SOFT_MAX, 250e6 ); if( smallStorageTarget ) STORAGE_DURABILITY_LAG_SOFT_MAX = 10e6; + init( STORAGE_INCLUDE_FEED_STORAGE_QUEUE, true ); if ( randomize && BUGGIFY ) STORAGE_INCLUDE_FEED_STORAGE_QUEUE = false; //FIXME: Low priority reads are disabled by assigning very high knob values, reduce knobs for 7.0 init( LOW_PRIORITY_STORAGE_QUEUE_BYTES, 775e8 ); if( smallStorageTarget ) LOW_PRIORITY_STORAGE_QUEUE_BYTES = 1750e3; @@ -664,15 +694,19 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DURABILITY_LAG_INCREASE_RATE, 1.001 ); init( STORAGE_SERVER_LIST_FETCH_TIMEOUT, 20.0 ); init( BW_THROTTLING_ENABLED, true ); - init( TARGET_BW_LAG, 50.0 ); - init( TARGET_BW_LAG_BATCH, 20.0 ); - init( TARGET_BW_LAG_UPDATE, 9.0 ); + + bool buggifySmallBWLag = randomize && BUGGIFY; + init( TARGET_BW_LAG, 240.0 ); if(buggifySmallBWLag) TARGET_BW_LAG = 10.0; + init( TARGET_BW_LAG_BATCH, 200.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_BATCH = 4.0; + init( TARGET_BW_LAG_UPDATE, 9.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_UPDATE = 1.0; init( MIN_BW_HISTORY, 10 ); - init( BW_ESTIMATION_INTERVAL, 10.0 ); + init( BW_ESTIMATION_INTERVAL, 10.0 ); if(buggifySmallBWLag) BW_ESTIMATION_INTERVAL = 2.0; init( BW_LAG_INCREASE_AMOUNT, 1.1 ); init( BW_LAG_DECREASE_AMOUNT, 0.9 ); init( BW_FETCH_WORKERS_INTERVAL, 5.0 ); init( BW_RW_LOGGING_INTERVAL, 5.0 ); + init( BW_MAX_BLOCKED_INTERVAL, 10.0 ); if(buggifySmallBWLag) BW_MAX_BLOCKED_INTERVAL = 2.0; + init( BW_RK_SIM_QUIESCE_DELAY, 150.0 ); init( MAX_AUTO_THROTTLED_TRANSACTION_TAGS, 5 ); if(randomize && BUGGIFY) MAX_AUTO_THROTTLED_TRANSACTION_TAGS = 1; init( MAX_MANUAL_THROTTLED_TRANSACTION_TAGS, 40 ); if(randomize && BUGGIFY) MAX_MANUAL_THROTTLED_TRANSACTION_TAGS = 1; @@ -690,6 +724,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ENFORCE_TAG_THROTTLING_ON_PROXIES, false ); init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 ); init( GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 ); + init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO, 5.0 ); //Storage Metrics init( STORAGE_METRICS_AVERAGE_INTERVAL, 120.0 ); @@ -710,12 +745,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( STORAGE_LIMIT_BYTES, 500000 ); init( BUGGIFY_LIMIT_BYTES, 1000 ); init( FETCH_USING_STREAMING, false ); if( randomize && isSimulated && BUGGIFY ) FETCH_USING_STREAMING = true; //Determines if fetch keys uses streaming reads + init( FETCH_USING_BLOB, false ); init( FETCH_BLOCK_BYTES, 2e6 ); init( FETCH_KEYS_PARALLELISM_BYTES, 4e6 ); if( randomize && BUGGIFY ) FETCH_KEYS_PARALLELISM_BYTES = 3e6; init( FETCH_KEYS_PARALLELISM, 2 ); - init( FETCH_KEYS_PARALLELISM_FULL, 10 ); + init( FETCH_KEYS_PARALLELISM_FULL, 6 ); init( FETCH_KEYS_LOWER_PRIORITY, 0 ); init( SERVE_FETCH_CHECKPOINT_PARALLELISM, 4 ); + init( CHANGE_FEED_DISK_READS_PARALLELISM, 1000 ); if( 
randomize && BUGGIFY ) CHANGE_FEED_DISK_READS_PARALLELISM = 20; init( BUGGIFY_BLOCK_BYTES, 10000 ); init( STORAGE_RECOVERY_VERSION_LAG_LIMIT, 2 * MAX_READ_TRANSACTION_LIFE_VERSIONS ); init( STORAGE_COMMIT_BYTES, 10000000 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_BYTES = 2000000; @@ -738,7 +775,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MIN_TAG_READ_PAGES_RATE, 1.0e4 ); if( randomize && BUGGIFY ) MIN_TAG_READ_PAGES_RATE = 0; init( MIN_TAG_WRITE_PAGES_RATE, 3200 ); if( randomize && BUGGIFY ) MIN_TAG_WRITE_PAGES_RATE = 0; init( TAG_MEASUREMENT_INTERVAL, 30.0 ); if( randomize && BUGGIFY ) TAG_MEASUREMENT_INTERVAL = 1.0; - init( READ_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096; init( PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS, true ); if( randomize && BUGGIFY ) PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS = false; init( REPORT_DD_METRICS, true ); init( DD_METRICS_REPORT_INTERVAL, 30.0 ); @@ -754,7 +790,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MAX_PARALLEL_QUICK_GET_VALUE, 50 ); if ( randomize && BUGGIFY ) MAX_PARALLEL_QUICK_GET_VALUE = deterministicRandom()->randomInt(1, 100); init( QUICK_GET_KEY_VALUES_LIMIT, 2000 ); init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 ); - init( STORAGE_SERVER_SHARD_AWARE, true ); + init( STORAGE_FEED_QUERY_HARD_LIMIT, 100000 ); //Wait Failure init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2; @@ -775,13 +811,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ENABLE_WORKER_HEALTH_MONITOR, false ); init( WORKER_HEALTH_MONITOR_INTERVAL, 60.0 ); init( PEER_LATENCY_CHECK_MIN_POPULATION, 30 ); - init( PEER_LATENCY_DEGRADATION_PERCENTILE, 0.90 ); + init( PEER_LATENCY_DEGRADATION_PERCENTILE, 0.50 ); init( PEER_LATENCY_DEGRADATION_THRESHOLD, 0.05 ); - init( PEER_LATENCY_DEGRADATION_PERCENTILE_SATELLITE, 0.90 ); + init( PEER_LATENCY_DEGRADATION_PERCENTILE_SATELLITE, 0.50 ); init( PEER_LATENCY_DEGRADATION_THRESHOLD_SATELLITE, 0.1 ); init( PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD, 0.1 ); - init( PEER_DEGRADATION_CONNECTION_FAILURE_COUNT, 1 ); + init( PEER_DEGRADATION_CONNECTION_FAILURE_COUNT, 5 ); init( WORKER_HEALTH_REPORT_RECENT_DESTROYED_PEER, true ); + init( STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT, false ); if ( randomize && BUGGIFY ) STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT = true; // Test harness init( WORKER_POLL_DELAY, 1.0 ); @@ -794,6 +831,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // Dynamic Knobs (implementation) init( COMPACTION_INTERVAL, isSimulated ? 
5.0 : 300.0 ); + init( BROADCASTER_SELF_UPDATE_DELAY, 1.0 ); init( GET_COMMITTED_VERSION_TIMEOUT, 3.0 ); init( GET_SNAPSHOT_AND_CHANGES_TIMEOUT, 3.0 ); init( FETCH_CHANGES_TIMEOUT, 3.0 ); @@ -877,6 +915,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 ); init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; } init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); } + init( REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT, false ); // Server request latency measurement init( LATENCY_SAMPLE_SIZE, 100000 ); @@ -886,19 +925,19 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init ( CLUSTER_RECOVERY_EVENT_NAME_PREFIX, "Master" ); // Encryption - init( ENABLE_ENCRYPTION, false ); if ( randomize && BUGGIFY ) { ENABLE_ENCRYPTION = deterministicRandom()->coinflip(); } + init( ENABLE_ENCRYPTION, false ); if ( randomize && BUGGIFY ) ENABLE_ENCRYPTION = !ENABLE_ENCRYPTION; init( ENCRYPTION_MODE, "AES-256-CTR" ); init( SIM_KMS_MAX_KEYS, 4096 ); init( ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH, 100000 ); - init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY ) { ENABLE_TLOG_ENCRYPTION = (ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS && deterministicRandom()->coinflip()); } - init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_ENCRYPTION = (ENABLE_ENCRYPTION && deterministicRandom()->coinflip()); } + init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ) ENABLE_TLOG_ENCRYPTION = true; + init( ENABLE_STORAGE_SERVER_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_STORAGE_SERVER_ENCRYPTION = !ENABLE_STORAGE_SERVER_ENCRYPTION; + init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_BLOB_GRANULE_ENCRYPTION = !ENABLE_BLOB_GRANULE_ENCRYPTION; // encrypt key proxy init( ENABLE_BLOB_GRANULE_COMPRESSION, false ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_COMPRESSION = deterministicRandom()->coinflip(); } - init( BLOB_GRANULE_COMPRESSION_FILTER, "GZIP" ); if ( randomize && BUGGIFY ) { BLOB_GRANULE_COMPRESSION_FILTER = "NONE"; } + init( BLOB_GRANULE_COMPRESSION_FILTER, "NONE" ); if ( randomize && BUGGIFY ) { BLOB_GRANULE_COMPRESSION_FILTER = CompressionUtils::toString(CompressionUtils::getRandomFilter()); } - - // KMS connector type + // KMS connector type init( KMS_CONNECTOR_TYPE, "RESTKmsConnector" ); // Blob granlues @@ -932,11 +971,15 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BLOB_WORKER_BATCH_GRV_INTERVAL, 0.1 ); init( BLOB_WORKER_DO_REJECT_WHEN_FULL, true ); if ( randomize && BUGGIFY ) BLOB_WORKER_DO_REJECT_WHEN_FULL = false; init( BLOB_WORKER_REJECT_WHEN_FULL_THRESHOLD, 0.9 ); + init( BLOB_WORKER_FORCE_FLUSH_CLEANUP_DELAY, 30.0 ); if ( randomize && BUGGIFY ) BLOB_WORKER_FORCE_FLUSH_CLEANUP_DELAY = deterministicRandom()->randomInt(0, 10) - 1; init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN, 0.1 ); init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX, 5.0 ); init( BLOB_MANAGER_STATUS_EXP_BACKOFF_EXPONENT, 1.5 ); init( BLOB_MANAGER_CONCURRENT_MERGE_CHECKS, 64 ); if( randomize && BUGGIFY ) BLOB_MANAGER_CONCURRENT_MERGE_CHECKS = 1 << deterministicRandom()->randomInt(0, 7); + init( 
BLOB_MANIFEST_BACKUP, false ); + init( BLOB_MANIFEST_BACKUP_INTERVAL, isSimulated ? 5.0 : 30.0 ); + init( BLOB_FULL_RESTORE_MODE, false ); init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 ); init( BGCC_MIN_INTERVAL, isSimulated ? 1.0 : 10.0 ); diff --git a/fdbclient/SimpleConfigTransaction.actor.cpp b/fdbclient/SimpleConfigTransaction.actor.cpp index cea49019bc..dba5d327b7 100644 --- a/fdbclient/SimpleConfigTransaction.actor.cpp +++ b/fdbclient/SimpleConfigTransaction.actor.cpp @@ -43,11 +43,13 @@ class SimpleConfigTransactionImpl { state ConfigTransactionGetGenerationReply reply; if (self->cti.hostname.present()) { wait(store(reply, - retryGetReplyFromHostname(ConfigTransactionGetGenerationRequest{}, + retryGetReplyFromHostname(ConfigTransactionGetGenerationRequest{ 0, Optional() }, self->cti.hostname.get(), WLTOKEN_CONFIGTXN_GETGENERATION))); } else { - wait(store(reply, retryBrokenPromise(self->cti.getGeneration, ConfigTransactionGetGenerationRequest{}))); + wait(store(reply, + retryBrokenPromise(self->cti.getGeneration, + ConfigTransactionGetGenerationRequest{ 0, Optional() }))); } if (self->dID.present()) { TraceEvent("SimpleConfigTransactionGotReadVersion", self->dID.get()) @@ -70,11 +72,12 @@ class SimpleConfigTransactionImpl { state ConfigTransactionGetReply reply; if (self->cti.hostname.present()) { wait(store(reply, - retryGetReplyFromHostname(ConfigTransactionGetRequest{ generation, configKey }, + retryGetReplyFromHostname(ConfigTransactionGetRequest{ 0, generation, configKey }, self->cti.hostname.get(), WLTOKEN_CONFIGTXN_GET))); } else { - wait(store(reply, retryBrokenPromise(self->cti.get, ConfigTransactionGetRequest{ generation, configKey }))); + wait(store(reply, + retryBrokenPromise(self->cti.get, ConfigTransactionGetRequest{ 0, generation, configKey }))); } if (self->dID.present()) { TraceEvent("SimpleConfigTransactionGotValue", self->dID.get()) @@ -95,13 +98,13 @@ class SimpleConfigTransactionImpl { state ConfigTransactionGetConfigClassesReply reply; if (self->cti.hostname.present()) { wait(store(reply, - retryGetReplyFromHostname(ConfigTransactionGetConfigClassesRequest{ generation }, + retryGetReplyFromHostname(ConfigTransactionGetConfigClassesRequest{ 0, generation }, self->cti.hostname.get(), WLTOKEN_CONFIGTXN_GETCLASSES))); } else { wait(store( reply, - retryBrokenPromise(self->cti.getClasses, ConfigTransactionGetConfigClassesRequest{ generation }))); + retryBrokenPromise(self->cti.getClasses, ConfigTransactionGetConfigClassesRequest{ 0, generation }))); } RangeResult result; for (const auto& configClass : reply.configClasses) { @@ -118,13 +121,13 @@ class SimpleConfigTransactionImpl { state ConfigTransactionGetKnobsReply reply; if (self->cti.hostname.present()) { wait(store(reply, - retryGetReplyFromHostname(ConfigTransactionGetKnobsRequest{ generation, configClass }, + retryGetReplyFromHostname(ConfigTransactionGetKnobsRequest{ 0, generation, configClass }, self->cti.hostname.get(), WLTOKEN_CONFIGTXN_GETKNOBS))); } else { - wait(store( - reply, - retryBrokenPromise(self->cti.getKnobs, ConfigTransactionGetKnobsRequest{ generation, configClass }))); + wait(store(reply, + retryBrokenPromise(self->cti.getKnobs, + ConfigTransactionGetKnobsRequest{ 0, generation, configClass }))); } RangeResult result; for (const auto& knobName : reply.knobNames) { @@ -137,6 +140,7 @@ class SimpleConfigTransactionImpl { if (!self->getGenerationFuture.isValid()) { self->getGenerationFuture = getGeneration(self); } + self->toCommit.coordinatorsHash = 0; wait(store(self->toCommit.generation, 
self->getGenerationFuture)); self->toCommit.annotation.timestamp = now(); if (self->cti.hostname.present()) { diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index 719aff9fe8..b95a8da18f 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -21,10 +21,12 @@ #include "boost/lexical_cast.hpp" #include "boost/algorithm/string.hpp" +#include #include #include #include +#include #include "fdbclient/ActorLineageProfiler.h" #include "fdbclient/ClusterConnectionMemoryRecord.h" @@ -56,65 +58,46 @@ static bool isAlphaNumeric(const std::string& key) { } // namespace std::unordered_map SpecialKeySpace::moduleToBoundary = { - { SpecialKeySpace::MODULE::TRANSACTION, - KeyRangeRef(LiteralStringRef("\xff\xff/transaction/"), LiteralStringRef("\xff\xff/transaction0")) }, + { SpecialKeySpace::MODULE::TRANSACTION, KeyRangeRef("\xff\xff/transaction/"_sr, "\xff\xff/transaction0"_sr) }, { SpecialKeySpace::MODULE::WORKERINTERFACE, - KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0")) }, - { SpecialKeySpace::MODULE::STATUSJSON, singleKeyRange(LiteralStringRef("\xff\xff/status/json")) }, - { SpecialKeySpace::MODULE::CONNECTIONSTRING, singleKeyRange(LiteralStringRef("\xff\xff/connection_string")) }, - { SpecialKeySpace::MODULE::CLUSTERFILEPATH, singleKeyRange(LiteralStringRef("\xff\xff/cluster_file_path")) }, - { SpecialKeySpace::MODULE::METRICS, - KeyRangeRef(LiteralStringRef("\xff\xff/metrics/"), LiteralStringRef("\xff\xff/metrics0")) }, - { SpecialKeySpace::MODULE::MANAGEMENT, - KeyRangeRef(LiteralStringRef("\xff\xff/management/"), LiteralStringRef("\xff\xff/management0")) }, - { SpecialKeySpace::MODULE::ERRORMSG, singleKeyRange(LiteralStringRef("\xff\xff/error_message")) }, - { SpecialKeySpace::MODULE::CONFIGURATION, - KeyRangeRef(LiteralStringRef("\xff\xff/configuration/"), LiteralStringRef("\xff\xff/configuration0")) }, - { SpecialKeySpace::MODULE::GLOBALCONFIG, - KeyRangeRef(LiteralStringRef("\xff\xff/global_config/"), LiteralStringRef("\xff\xff/global_config0")) }, - { SpecialKeySpace::MODULE::TRACING, - KeyRangeRef(LiteralStringRef("\xff\xff/tracing/"), LiteralStringRef("\xff\xff/tracing0")) }, - { SpecialKeySpace::MODULE::ACTORLINEAGE, - KeyRangeRef(LiteralStringRef("\xff\xff/actor_lineage/"), LiteralStringRef("\xff\xff/actor_lineage0")) }, + KeyRangeRef("\xff\xff/worker_interfaces/"_sr, "\xff\xff/worker_interfaces0"_sr) }, + { SpecialKeySpace::MODULE::STATUSJSON, singleKeyRange("\xff\xff/status/json"_sr) }, + { SpecialKeySpace::MODULE::CONNECTIONSTRING, singleKeyRange("\xff\xff/connection_string"_sr) }, + { SpecialKeySpace::MODULE::CLUSTERFILEPATH, singleKeyRange("\xff\xff/cluster_file_path"_sr) }, + { SpecialKeySpace::MODULE::METRICS, KeyRangeRef("\xff\xff/metrics/"_sr, "\xff\xff/metrics0"_sr) }, + { SpecialKeySpace::MODULE::MANAGEMENT, KeyRangeRef("\xff\xff/management/"_sr, "\xff\xff/management0"_sr) }, + { SpecialKeySpace::MODULE::ERRORMSG, singleKeyRange("\xff\xff/error_message"_sr) }, + { SpecialKeySpace::MODULE::CONFIGURATION, KeyRangeRef("\xff\xff/configuration/"_sr, "\xff\xff/configuration0"_sr) }, + { SpecialKeySpace::MODULE::GLOBALCONFIG, KeyRangeRef("\xff\xff/global_config/"_sr, "\xff\xff/global_config0"_sr) }, + { SpecialKeySpace::MODULE::TRACING, KeyRangeRef("\xff\xff/tracing/"_sr, "\xff\xff/tracing0"_sr) }, + { SpecialKeySpace::MODULE::ACTORLINEAGE, KeyRangeRef("\xff\xff/actor_lineage/"_sr, "\xff\xff/actor_lineage0"_sr) }, { 
SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF, - KeyRangeRef(LiteralStringRef("\xff\xff/actor_profiler_conf/"), - LiteralStringRef("\xff\xff/actor_profiler_conf0")) }, - { SpecialKeySpace::MODULE::CLUSTERID, singleKeyRange(LiteralStringRef("\xff\xff/cluster_id")) }, + KeyRangeRef("\xff\xff/actor_profiler_conf/"_sr, "\xff\xff/actor_profiler_conf0"_sr) }, + { SpecialKeySpace::MODULE::CLUSTERID, singleKeyRange("\xff\xff/cluster_id"_sr) }, }; std::unordered_map SpecialKeySpace::managementApiCommandToRange = { - { "exclude", - KeyRangeRef(LiteralStringRef("excluded/"), LiteralStringRef("excluded0")) - .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, - { "failed", - KeyRangeRef(LiteralStringRef("failed/"), LiteralStringRef("failed0")) - .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + { "exclude", KeyRangeRef("excluded/"_sr, "excluded0"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + { "failed", KeyRangeRef("failed/"_sr, "failed0"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "excludedlocality", - KeyRangeRef(LiteralStringRef("excluded_locality/"), LiteralStringRef("excluded_locality0")) + KeyRangeRef("excluded_locality/"_sr, "excluded_locality0"_sr) .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "failedlocality", - KeyRangeRef(LiteralStringRef("failed_locality/"), LiteralStringRef("failed_locality0")) + KeyRangeRef("failed_locality/"_sr, "failed_locality0"_sr) .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, - { "lock", singleKeyRange(LiteralStringRef("db_locked")).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + { "lock", singleKeyRange("db_locked"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "consistencycheck", - singleKeyRange(LiteralStringRef("consistency_check_suspended")) - .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + singleKeyRange("consistency_check_suspended"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "coordinators", - KeyRangeRef(LiteralStringRef("coordinators/"), LiteralStringRef("coordinators0")) - .withPrefix(moduleToBoundary[MODULE::CONFIGURATION].begin) }, + KeyRangeRef("coordinators/"_sr, "coordinators0"_sr).withPrefix(moduleToBoundary[MODULE::CONFIGURATION].begin) }, { "advanceversion", - singleKeyRange(LiteralStringRef("min_required_commit_version")) - .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, - { "versionepoch", - singleKeyRange(LiteralStringRef("version_epoch")).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, - { "profile", - KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0")) - .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + singleKeyRange("min_required_commit_version"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + { "versionepoch", singleKeyRange("version_epoch"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + { "profile", KeyRangeRef("profiling/"_sr, "profiling0"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "maintenance", - KeyRangeRef(LiteralStringRef("maintenance/"), LiteralStringRef("maintenance0")) - .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + KeyRangeRef("maintenance/"_sr, "maintenance0"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "datadistribution", - KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0")) + KeyRangeRef("data_distribution/"_sr, "data_distribution0"_sr) .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "tenant", 
KeyRangeRef("tenant/"_sr, "tenant0"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "tenantmap", @@ -122,12 +105,8 @@ std::unordered_map SpecialKeySpace::managementApiCommandT }; std::unordered_map SpecialKeySpace::actorLineageApiCommandToRange = { - { "state", - KeyRangeRef(LiteralStringRef("state/"), LiteralStringRef("state0")) - .withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) }, - { "time", - KeyRangeRef(LiteralStringRef("time/"), LiteralStringRef("time0")) - .withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) } + { "state", KeyRangeRef("state/"_sr, "state0"_sr).withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) }, + { "time", KeyRangeRef("time/"_sr, "time0"_sr).withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) } }; std::set SpecialKeySpace::options = { "excluded/force", @@ -156,6 +135,11 @@ ACTOR Future moveKeySelectorOverRangeActor(const SpecialKeyRangeReadImpl* // never being called if KeySelector is already normalized ASSERT(ks->offset != 1); + // Throw error if module doesn't support tenants and we have a tenant + if (ryw->getTenant().present() && !skrImpl->supportsTenants()) { + throw illegal_tenant_access(); + } + state Key startKey(skrImpl->getKeyRange().begin); state Key endKey(skrImpl->getKeyRange().end); state RangeResult result; @@ -376,6 +360,21 @@ ACTOR Future SpecialKeySpace::getRangeAggregationActor(SpecialKeySp } state RangeMap::Ranges ranges = sks->getReadImpls().intersectingRanges(KeyRangeRef(begin.getKey(), end.getKey())); + + // Check tenant legality separately from below iterations + // because it may be partially completed and returned + // before illegal range is checked due to the limits handler + if (ryw->getTenant().present()) { + for (auto iter : ranges) { + if (iter->value() == nullptr) { + continue; + } + if (!iter->value()->supportsTenants()) { + throw illegal_tenant_access(); + } + } + } + // TODO : workaround to write this two together to make the code compact // The issue here is boost::iterator_range<> doest not provide rbegin(), rend() iter = reverse ? 
ranges.end() : ranges.begin(); @@ -455,7 +454,7 @@ Future SpecialKeySpace::getRange(ReadYourWritesTransaction* ryw, if (!limits.isValid()) return range_limits_invalid(); if (limits.isReached()) { - CODE_PROBE(true, "read limit 0"); + CODE_PROBE(true, "Special Key Space range read limit 0"); return RangeResult(); } // make sure orEqual == false @@ -501,6 +500,9 @@ void SpecialKeySpace::set(ReadYourWritesTransaction* ryw, const KeyRef& key, con .detail("Value", value.toString()); throw special_keys_no_write_module_found(); } + if (!impl->supportsTenants() && ryw->getTenant().present()) { + throw illegal_tenant_access(); + } return impl->set(ryw, key, value); } @@ -518,6 +520,9 @@ void SpecialKeySpace::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& r TraceEvent(SevDebug, "SpecialKeySpaceNoWriteModuleFound").detail("Range", range); throw special_keys_no_write_module_found(); } + if (!begin->supportsTenants() && ryw->getTenant().present()) { + throw illegal_tenant_access(); + } return begin->clear(ryw, range); } @@ -527,6 +532,9 @@ void SpecialKeySpace::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) { auto impl = writeImpls[key]; if (impl == nullptr) throw special_keys_no_write_module_found(); + if (!impl->supportsTenants() && ryw->getTenant().present()) { + throw illegal_tenant_access(); + } return impl->clear(ryw, key); } @@ -538,8 +546,8 @@ bool validateSnakeCaseNaming(const KeyRef& k) { // Suffix can be \xff\xff or \x00 in single key range if (key.endsWith(specialKeys.begin)) key = key.removeSuffix(specialKeys.end); - else if (key.endsWith(LiteralStringRef("\x00"))) - key = key.removeSuffix(LiteralStringRef("\x00")); + else if (key.endsWith("\x00"_sr)) + key = key.removeSuffix("\x00"_sr); for (const char& c : key.toString()) { // only small letters, numbers, '/', '_' is allowed ASSERT((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '/' || c == '_'); @@ -614,6 +622,16 @@ ACTOR Future commitActor(SpecialKeySpace* sks, ReadYourWritesTransaction* ++iter; } state std::vector::const_iterator it; + // Check validity of tenant support before iterating through + // module ptrs and potentially getting partial commits + if (ryw->getTenant().present()) { + for (it = writeModulePtrs.begin(); it != writeModulePtrs.end(); ++it) { + if (!(*it)->supportsTenants()) { + throw illegal_tenant_access(); + } + } + } + for (it = writeModulePtrs.begin(); it != writeModulePtrs.end(); ++it) { Optional msg = wait((*it)->commit(ryw)); if (msg.present()) { @@ -712,8 +730,8 @@ ACTOR Future ddMetricsGetRangeActor(ReadYourWritesTransaction* ryw, loop { try { auto keys = kr.removePrefix(ddStatsRange.begin); - Standalone> resultWithoutPrefix = wait( - waitDataDistributionMetricsList(ryw->getDatabase(), keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT)); + Standalone> resultWithoutPrefix = + wait(waitDataDistributionMetricsList(ryw->getDatabase(), keys, CLIENT_KNOBS->TOO_MANY)); RangeResult result; for (const auto& ddMetricsRef : resultWithoutPrefix) { // each begin key is the previous end key, thus we only encode the begin key in the result @@ -749,7 +767,7 @@ Future DDStatsRangeImpl::getRange(ReadYourWritesTransaction* ryw, } Key SpecialKeySpace::getManagementApiCommandOptionSpecialKey(const std::string& command, const std::string& option) { - Key prefix = LiteralStringRef("options/").withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin); + Key prefix = "options/"_sr.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin); auto pair = command + "/" + option; ASSERT(options.find(pair) != 
options.end()); return prefix.withSuffix(pair); @@ -880,11 +898,11 @@ void ExcludeServersRangeImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& Key ExcludeServersRangeImpl::decode(const KeyRef& key) const { return key.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) - .withPrefix(LiteralStringRef("\xff/conf/")); + .withPrefix("\xff/conf/"_sr); } Key ExcludeServersRangeImpl::encode(const KeyRef& key) const { - return key.removePrefix(LiteralStringRef("\xff/conf/")) + return key.removePrefix("\xff/conf/"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); } @@ -974,21 +992,13 @@ ACTOR Future checkExclusion(Database db, state int ssTotalCount = 0; state int ssExcludedCount = 0; - state double worstFreeSpaceRatio = 1.0; + + state std::unordered_set diskLocalities; + state int64_t totalKvStoreFreeBytes = 0; + state int64_t totalKvStoreUsedBytes = 0; + state int64_t totalKvStoreUsedBytesNonExcluded = 0; try { for (auto proc : processesMap.obj()) { - bool storageServer = false; - StatusArray rolesArray = proc.second.get_obj()["roles"].get_array(); - for (StatusObjectReader role : rolesArray) { - if (role["role"].get_str() == "storage") { - storageServer = true; - break; - } - } - // Skip non-storage servers in free space calculation - if (!storageServer) - continue; - StatusObjectReader process(proc.second); std::string addrStr; if (!process.get("address", addrStr)) { @@ -998,33 +1008,49 @@ ACTOR Future checkExclusion(Database db, NetworkAddress addr = NetworkAddress::parse(addrStr); bool excluded = (process.has("excluded") && process.last().get_bool()) || addressExcluded(*exclusions, addr); - ssTotalCount++; - if (excluded) - ssExcludedCount++; - if (!excluded) { - StatusObjectReader disk; - if (!process.get("disk", disk)) { - *msg = - ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", errorString); - return false; + StatusObjectReader localityObj; + std::string disk_id; + if (process.get("locality", localityObj)) { + process.get("disk_id", disk_id); // its ok if we don't have this field + } + + StatusArray rolesArray = proc.second.get_obj()["roles"].get_array(); + for (StatusObjectReader role : rolesArray) { + if (role["role"].get_str() == "storage") { + ssTotalCount++; + + int64_t used_bytes; + if (!role.get("kvstore_used_bytes", used_bytes)) { + *msg = ManagementAPIError::toJsonString( + false, markFailed ? "exclude failed" : "exclude", errorString); + return false; + } + + int64_t free_bytes; + if (!role.get("kvstore_free_bytes", free_bytes)) { + *msg = ManagementAPIError::toJsonString( + false, markFailed ? "exclude failed" : "exclude", errorString); + return false; + } + + totalKvStoreUsedBytes += used_bytes; + + if (!excluded) { + totalKvStoreUsedBytesNonExcluded += used_bytes; + + if (disk_id.empty() || diskLocalities.find(disk_id) == diskLocalities.end()) { + totalKvStoreFreeBytes += free_bytes; + if (!disk_id.empty()) { + diskLocalities.insert(disk_id); + } + } + } } - int64_t total_bytes; - if (!disk.get("total_bytes", total_bytes)) { - *msg = - ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", errorString); - return false; + if (excluded) { + ssExcludedCount++; } - - int64_t free_bytes; - if (!disk.get("free_bytes", free_bytes)) { - *msg = - ManagementAPIError::toJsonString(false, markFailed ? 
"exclude failed" : "exclude", errorString); - return false; - } - - worstFreeSpaceRatio = std::min(worstFreeSpaceRatio, double(free_bytes) / total_bytes); } } } catch (...) // std::exception @@ -1033,14 +1059,15 @@ ACTOR Future checkExclusion(Database db, return false; } - if (ssExcludedCount == ssTotalCount || - (1 - worstFreeSpaceRatio) * ssTotalCount / (ssTotalCount - ssExcludedCount) > 0.9) { + double finalFreeRatio = 1 - (totalKvStoreUsedBytes / (totalKvStoreUsedBytesNonExcluded + totalKvStoreFreeBytes)); + if (ssExcludedCount == ssTotalCount || finalFreeRatio <= 0.1) { std::string temp = "ERROR: This exclude may cause the total free space in the cluster to drop below 10%.\n" "Call set(\"0xff0xff/management/options/exclude/force\", ...) first to exclude without " "checking free space.\n"; *msg = ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", temp); return false; } + return true; } @@ -1123,11 +1150,11 @@ void FailedServersRangeImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& k Key FailedServersRangeImpl::decode(const KeyRef& key) const { return key.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) - .withPrefix(LiteralStringRef("\xff/conf/")); + .withPrefix("\xff/conf/"_sr); } Key FailedServersRangeImpl::encode(const KeyRef& key) const { - return key.removePrefix(LiteralStringRef("\xff/conf/")) + return key.removePrefix("\xff/conf/"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); } @@ -1289,8 +1316,7 @@ Future> ProcessClassRangeImpl::commit(ReadYourWritesTransa // validate class type ValueRef processClassType = entry.second.get(); ProcessClass processClass(processClassType.toString(), ProcessClass::DBSource); - if (processClass.classType() == ProcessClass::InvalidClass && - processClassType != LiteralStringRef("default")) { + if (processClass.classType() == ProcessClass::InvalidClass && processClassType != "default"_sr) { std::string error = "ERROR: \'" + processClassType.toString() + "\' is not a valid process class\n"; errorMsg = ManagementAPIError::toJsonString(false, "setclass", error); return errorMsg; @@ -1390,11 +1416,10 @@ ACTOR Future> lockDatabaseCommitActor(ReadYourWritesTransa throw database_locked(); } else if (!val.present()) { // lock database - ryw->getTransaction().atomicOp(databaseLockedKey, - BinaryWriter::toValue(uid, Unversioned()) - .withPrefix(LiteralStringRef("0123456789")) - .withSuffix(LiteralStringRef("\x00\x00\x00\x00")), - MutationRef::SetVersionstampedValue); + ryw->getTransaction().atomicOp( + databaseLockedKey, + BinaryWriter::toValue(uid, Unversioned()).withPrefix("0123456789"_sr).withSuffix("\x00\x00\x00\x00"_sr), + MutationRef::SetVersionstampedValue); ryw->getTransaction().addWriteConflictRange(normalKeys); } @@ -1648,7 +1673,7 @@ ACTOR Future coordinatorsGetRangeActor(ReadYourWritesTransaction* r state ClusterConnectionString cs = ryw->getDatabase()->getConnectionRecord()->getConnectionString(); state std::vector coordinator_processes = wait(cs.tryResolveHostnames()); RangeResult result; - Key cluster_decription_key = prefix.withSuffix(LiteralStringRef("cluster_description")); + Key cluster_decription_key = prefix.withSuffix("cluster_description"_sr); if (kr.contains(cluster_decription_key)) { result.push_back_deep(result.arena(), KeyValueRef(cluster_decription_key, cs.clusterKeyName())); } @@ -1663,7 +1688,7 @@ ACTOR Future coordinatorsGetRangeActor(ReadYourWritesTransaction* r processes_str += ","; processes_str += 
w.toString(); } - Key processes_key = prefix.withSuffix(LiteralStringRef("processes")); + Key processes_key = prefix.withSuffix("processes"_sr); if (kr.contains(processes_key)) { result.push_back_deep(result.arena(), KeyValueRef(processes_key, Value(processes_str))); } @@ -1685,7 +1710,7 @@ ACTOR static Future> coordinatorsCommitActor(ReadYourWrite state bool parse_error = false; // check update for coordinators - Key processes_key = LiteralStringRef("processes").withPrefix(kr.begin); + Key processes_key = "processes"_sr.withPrefix(kr.begin); auto processes_entry = ryw->getSpecialKeySpaceWriteMap()[processes_key]; if (processes_entry.first) { ASSERT(processes_entry.second.present()); // no clear should be seen here @@ -1725,7 +1750,7 @@ ACTOR static Future> coordinatorsCommitActor(ReadYourWrite std::string newName; // check update for cluster_description - Key cluster_decription_key = LiteralStringRef("cluster_description").withPrefix(kr.begin); + Key cluster_decription_key = "cluster_description"_sr.withPrefix(kr.begin); auto entry = ryw->getSpecialKeySpaceWriteMap()[cluster_decription_key]; if (entry.first) { // check valid description [a-zA-Z0-9_]+ @@ -1739,11 +1764,15 @@ ACTOR static Future> coordinatorsCommitActor(ReadYourWrite } } + auto configDBEntry = ryw->getSpecialKeySpaceWriteMap()["config_db"_sr.withPrefix(kr.begin)]; + TraceEvent(SevDebug, "SKSChangeCoordinatorsStart") .detail("NewConnectionString", conn.toString()) - .detail("Description", entry.first ? entry.second.get().toString() : ""); + .detail("Description", entry.first ? entry.second.get().toString() : "") + .detail("ConfigDBDisabled", configDBEntry.first); - Optional r = wait(changeQuorumChecker(&ryw->getTransaction(), &conn, newName)); + Optional r = + wait(changeQuorumChecker(&ryw->getTransaction(), &conn, newName, configDBEntry.first)); TraceEvent(SevDebug, "SKSChangeCoordinatorsFinish") .detail("Result", r.present() ? 
static_cast(r.get()) : -1); // -1 means success @@ -1966,7 +1995,7 @@ Future ClientProfilingImpl::getRange(ReadYourWritesTransaction* ryw KeyRef prefix = getKeyRange().begin; RangeResult result = RangeResult(); // client_txn_sample_rate - Key sampleRateKey = LiteralStringRef("client_txn_sample_rate").withPrefix(prefix); + Key sampleRateKey = "client_txn_sample_rate"_sr.withPrefix(prefix); ryw->getTransaction().setOption(FDBTransactionOptions::RAW_ACCESS); @@ -1987,7 +2016,7 @@ Future ClientProfilingImpl::getRange(ReadYourWritesTransaction* ryw } } // client_txn_size_limit - Key txnSizeLimitKey = LiteralStringRef("client_txn_size_limit").withPrefix(prefix); + Key txnSizeLimitKey = "client_txn_size_limit"_sr.withPrefix(prefix); if (kr.contains(txnSizeLimitKey)) { auto entry = ryw->getSpecialKeySpaceWriteMap()[txnSizeLimitKey]; if (!ryw->readYourWritesDisabled() && entry.first) { @@ -2013,7 +2042,7 @@ Future> ClientProfilingImpl::commit(ReadYourWritesTransact Standalone> clears; // client_txn_sample_rate - Key sampleRateKey = LiteralStringRef("client_txn_sample_rate").withPrefix(getKeyRange().begin); + Key sampleRateKey = "client_txn_sample_rate"_sr.withPrefix(getKeyRange().begin); auto rateEntry = ryw->getSpecialKeySpaceWriteMap()[sampleRateKey]; if (rateEntry.first && rateEntry.second.present()) { @@ -2033,7 +2062,7 @@ Future> ClientProfilingImpl::commit(ReadYourWritesTransact } } // client_txn_size_limit - Key txnSizeLimitKey = LiteralStringRef("client_txn_size_limit").withPrefix(getKeyRange().begin); + Key txnSizeLimitKey = "client_txn_size_limit"_sr.withPrefix(getKeyRange().begin); auto sizeLimitEntry = ryw->getSpecialKeySpaceWriteMap()[txnSizeLimitKey]; if (sizeLimitEntry.first && sizeLimitEntry.second.present()) { std::string sizeLimitStr = sizeLimitEntry.second.get().toString(); @@ -2078,11 +2107,11 @@ void parse(StringRef& val, double& d) { } void parse(StringRef& val, WaitState& w) { - if (val == LiteralStringRef("disk") || val == LiteralStringRef("Disk")) { + if (val == "disk"_sr || val == "Disk"_sr) { w = WaitState::Disk; - } else if (val == LiteralStringRef("network") || val == LiteralStringRef("Network")) { + } else if (val == "network"_sr || val == "Network"_sr) { w = WaitState::Network; - } else if (val == LiteralStringRef("running") || val == LiteralStringRef("Running")) { + } else if (val == "running"_sr || val == "Running"_sr) { w = WaitState::Running; } else { throw std::range_error("failed to parse run state"); @@ -2482,7 +2511,7 @@ ACTOR static Future DataDistributionGetRangeActor(ReadYourWritesTra KeyRangeRef kr) { state RangeResult result; // dataDistributionModeKey - state Key modeKey = LiteralStringRef("mode").withPrefix(prefix); + state Key modeKey = "mode"_sr.withPrefix(prefix); ryw->getTransaction().setOption(FDBTransactionOptions::RAW_ACCESS); @@ -2498,7 +2527,7 @@ ACTOR static Future DataDistributionGetRangeActor(ReadYourWritesTra } } // rebalanceDDIgnoreKey - state Key rebalanceIgnoredKey = LiteralStringRef("rebalance_ignored").withPrefix(prefix); + state Key rebalanceIgnoredKey = "rebalance_ignored"_sr.withPrefix(prefix); if (kr.contains(rebalanceIgnoredKey)) { auto entry = ryw->getSpecialKeySpaceWriteMap()[rebalanceIgnoredKey]; if (ryw->readYourWritesDisabled() || !entry.first) { @@ -2525,8 +2554,8 @@ Future> DataDistributionImpl::commit(ReadYourWritesTransac Optional msg; KeyRangeRef kr = getKeyRange(); - Key modeKey = LiteralStringRef("mode").withPrefix(kr.begin); - Key rebalanceIgnoredKey = LiteralStringRef("rebalance_ignored").withPrefix(kr.begin); + Key 
modeKey = "mode"_sr.withPrefix(kr.begin); + Key rebalanceIgnoredKey = "rebalance_ignored"_sr.withPrefix(kr.begin); auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(kr); for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) { if (!iter->value().first) @@ -2713,11 +2742,11 @@ void ExcludedLocalitiesRangeImpl::set(ReadYourWritesTransaction* ryw, const KeyR Key ExcludedLocalitiesRangeImpl::decode(const KeyRef& key) const { return key.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) - .withPrefix(LiteralStringRef("\xff/conf/")); + .withPrefix("\xff/conf/"_sr); } Key ExcludedLocalitiesRangeImpl::encode(const KeyRef& key) const { - return key.removePrefix(LiteralStringRef("\xff/conf/")) + return key.removePrefix("\xff/conf/"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); } @@ -2742,11 +2771,11 @@ void FailedLocalitiesRangeImpl::set(ReadYourWritesTransaction* ryw, const KeyRef Key FailedLocalitiesRangeImpl::decode(const KeyRef& key) const { return key.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) - .withPrefix(LiteralStringRef("\xff/conf/")); + .withPrefix("\xff/conf/"_sr); } Key FailedLocalitiesRangeImpl::encode(const KeyRef& key) const { - return key.removePrefix(LiteralStringRef("\xff/conf/")) + return key.removePrefix("\xff/conf/"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); } diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index f912d691bc..a720547b56 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -31,20 +31,20 @@ FDB_DEFINE_BOOLEAN_PARAM(AssignEmptyRange); FDB_DEFINE_BOOLEAN_PARAM(UnassignShard); -const KeyRef systemKeysPrefix = LiteralStringRef("\xff"); +const KeyRef systemKeysPrefix = "\xff"_sr; const KeyRangeRef normalKeys(KeyRef(), systemKeysPrefix); -const KeyRangeRef systemKeys(systemKeysPrefix, LiteralStringRef("\xff\xff")); -const KeyRangeRef nonMetadataSystemKeys(LiteralStringRef("\xff\x02"), LiteralStringRef("\xff\x03")); +const KeyRangeRef systemKeys(systemKeysPrefix, "\xff\xff"_sr); +const KeyRangeRef nonMetadataSystemKeys("\xff\x02"_sr, "\xff\x03"_sr); const KeyRangeRef allKeys = KeyRangeRef(normalKeys.begin, systemKeys.end); -const KeyRef afterAllKeys = LiteralStringRef("\xff\xff\x00"); -const KeyRangeRef specialKeys = KeyRangeRef(LiteralStringRef("\xff\xff"), LiteralStringRef("\xff\xff\xff")); +const KeyRef afterAllKeys = "\xff\xff\x00"_sr; +const KeyRangeRef specialKeys = KeyRangeRef("\xff\xff"_sr, "\xff\xff\xff"_sr); // keyServersKeys.contains(k) iff k.startsWith(keyServersPrefix) -const KeyRangeRef keyServersKeys(LiteralStringRef("\xff/keyServers/"), LiteralStringRef("\xff/keyServers0")); +const KeyRangeRef keyServersKeys("\xff/keyServers/"_sr, "\xff/keyServers0"_sr); const KeyRef keyServersPrefix = keyServersKeys.begin; const KeyRef keyServersEnd = keyServersKeys.end; -const KeyRangeRef keyServersKeyServersKeys(LiteralStringRef("\xff/keyServers/\xff/keyServers/"), - LiteralStringRef("\xff/keyServers/\xff/keyServers0")); +const KeyRangeRef keyServersKeyServersKeys("\xff/keyServers/\xff/keyServers/"_sr, + "\xff/keyServers/\xff/keyServers0"_sr); const KeyRef keyServersKeyServersKey = keyServersKeyServersKeys.begin; // These constants are selected to be easily recognized during debugging. 
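// Aside on the LiteralStringRef(...) -> "..."_sr rewrite running through SpecialKeySpace.actor.cpp
// and SystemData.cpp above: the _sr suffix is a user-defined string literal, so the compiler
// supplies the byte length and embedded '\x00' or '\xff' bytes survive without a runtime strlen().
// A minimal self-contained sketch of the idea (illustrative only -- MiniStringRef and the _msr
// suffix below are hypothetical stand-ins, not flow's actual StringRef or _sr):
#include <cassert>
#include <cstddef>
#include <cstring>

struct MiniStringRef {
	const unsigned char* data;
	size_t length;
	constexpr MiniStringRef(const unsigned char* d, size_t n) : data(d), length(n) {}
	bool startsWith(MiniStringRef prefix) const {
		return length >= prefix.length && memcmp(data, prefix.data, prefix.length) == 0;
	}
};

// The length comes from the compiler, so "\xff\x00abc"_msr keeps its embedded NUL byte.
inline MiniStringRef operator"" _msr(const char* str, size_t size) {
	return MiniStringRef(reinterpret_cast<const unsigned char*>(str), size);
}

int main() {
	auto wiggleKey = "\xff/conf/perpetual_storage_wiggle"_msr;
	assert(wiggleKey.startsWith("\xff/conf/"_msr));
	assert("\xff\x00"_msr.length == 2); // a plain C string would have stopped at the NUL
	return 0;
}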
@@ -274,20 +274,17 @@ void decodeKeyServersValue(std::map const& tag_uid, } const KeyRangeRef conflictingKeysRange = - KeyRangeRef(LiteralStringRef("\xff\xff/transaction/conflicting_keys/"), - LiteralStringRef("\xff\xff/transaction/conflicting_keys/\xff\xff")); -const ValueRef conflictingKeysTrue = LiteralStringRef("1"); -const ValueRef conflictingKeysFalse = LiteralStringRef("0"); + KeyRangeRef("\xff\xff/transaction/conflicting_keys/"_sr, "\xff\xff/transaction/conflicting_keys/\xff\xff"_sr); +const ValueRef conflictingKeysTrue = "1"_sr; +const ValueRef conflictingKeysFalse = "0"_sr; const KeyRangeRef readConflictRangeKeysRange = - KeyRangeRef(LiteralStringRef("\xff\xff/transaction/read_conflict_range/"), - LiteralStringRef("\xff\xff/transaction/read_conflict_range/\xff\xff")); + KeyRangeRef("\xff\xff/transaction/read_conflict_range/"_sr, "\xff\xff/transaction/read_conflict_range/\xff\xff"_sr); -const KeyRangeRef writeConflictRangeKeysRange = - KeyRangeRef(LiteralStringRef("\xff\xff/transaction/write_conflict_range/"), - LiteralStringRef("\xff\xff/transaction/write_conflict_range/\xff\xff")); +const KeyRangeRef writeConflictRangeKeysRange = KeyRangeRef("\xff\xff/transaction/write_conflict_range/"_sr, + "\xff\xff/transaction/write_conflict_range/\xff\xff"_sr); -const KeyRef clusterIdKey = LiteralStringRef("\xff/clusterId"); +const KeyRef clusterIdKey = "\xff/clusterId"_sr; const KeyRef checkpointPrefix = "\xff/checkpoint/"_sr; @@ -344,7 +341,7 @@ DataMoveMetaData decodeDataMoveValue(const ValueRef& value) { } // "\xff/cacheServer/[[UID]] := StorageServerInterface" -const KeyRangeRef storageCacheServerKeys(LiteralStringRef("\xff/cacheServer/"), LiteralStringRef("\xff/cacheServer0")); +const KeyRangeRef storageCacheServerKeys("\xff/cacheServer/"_sr, "\xff/cacheServer0"_sr); const KeyRef storageCacheServersPrefix = storageCacheServerKeys.begin; const KeyRef storageCacheServersEnd = storageCacheServerKeys.end; @@ -361,11 +358,11 @@ const Value storageCacheServerValue(const StorageServerInterface& ssi) { return ObjectWriter::toValue(ssi, IncludeVersion(protocolVersion)); } -const KeyRangeRef ddStatsRange = KeyRangeRef(LiteralStringRef("\xff\xff/metrics/data_distribution_stats/"), - LiteralStringRef("\xff\xff/metrics/data_distribution_stats/\xff\xff")); +const KeyRangeRef ddStatsRange = + KeyRangeRef("\xff\xff/metrics/data_distribution_stats/"_sr, "\xff\xff/metrics/data_distribution_stats/\xff\xff"_sr); // "\xff/storageCache/[[begin]]" := "[[vector]]" -const KeyRangeRef storageCacheKeys(LiteralStringRef("\xff/storageCache/"), LiteralStringRef("\xff/storageCache0")); +const KeyRangeRef storageCacheKeys("\xff/storageCache/"_sr, "\xff/storageCache0"_sr); const KeyRef storageCachePrefix = storageCacheKeys.begin; const Key storageCacheKey(const KeyRef& k) { @@ -427,7 +424,7 @@ const Key serverKeysKey(UID serverID, const KeyRef& key) { BinaryWriter wr(Unversioned()); wr.serializeBytes(serverKeysPrefix); wr << serverID; - wr.serializeBytes(LiteralStringRef("/")); + wr.serializeBytes("/"_sr); wr.serializeBytes(key); return wr.toValue(); } @@ -435,7 +432,7 @@ const Key serverKeysPrefixFor(UID serverID) { BinaryWriter wr(Unversioned()); wr.serializeBytes(serverKeysPrefix); wr << serverID; - wr.serializeBytes(LiteralStringRef("/")); + wr.serializeBytes("/"_sr); return wr.toValue(); } UID serverKeysDecodeServer(const KeyRef& key) { @@ -499,13 +496,13 @@ void decodeServerKeysValue(const ValueRef& value, bool& assigned, bool& emptyRan } } -const KeyRef cacheKeysPrefix = LiteralStringRef("\xff\x02/cacheKeys/"); 
+const KeyRef cacheKeysPrefix = "\xff\x02/cacheKeys/"_sr; const Key cacheKeysKey(uint16_t idx, const KeyRef& key) { BinaryWriter wr(Unversioned()); wr.serializeBytes(cacheKeysPrefix); wr << idx; - wr.serializeBytes(LiteralStringRef("/")); + wr.serializeBytes("/"_sr); wr.serializeBytes(key); return wr.toValue(); } @@ -513,7 +510,7 @@ const Key cacheKeysPrefixFor(uint16_t idx) { BinaryWriter wr(Unversioned()); wr.serializeBytes(cacheKeysPrefix); wr << idx; - wr.serializeBytes(LiteralStringRef("/")); + wr.serializeBytes("/"_sr); return wr.toValue(); } uint16_t cacheKeysDecodeIndex(const KeyRef& key) { @@ -526,9 +523,8 @@ KeyRef cacheKeysDecodeKey(const KeyRef& key) { return key.substr(cacheKeysPrefix.size() + sizeof(uint16_t) + 1); } -const KeyRef cacheChangeKey = LiteralStringRef("\xff\x02/cacheChangeKey"); -const KeyRangeRef cacheChangeKeys(LiteralStringRef("\xff\x02/cacheChangeKeys/"), - LiteralStringRef("\xff\x02/cacheChangeKeys0")); +const KeyRef cacheChangeKey = "\xff\x02/cacheChangeKey"_sr; +const KeyRangeRef cacheChangeKeys("\xff\x02/cacheChangeKeys/"_sr, "\xff\x02/cacheChangeKeys0"_sr); const KeyRef cacheChangePrefix = cacheChangeKeys.begin; const Key cacheChangeKeyFor(uint16_t idx) { BinaryWriter wr(Unversioned()); @@ -543,9 +539,9 @@ uint16_t cacheChangeKeyDecodeIndex(const KeyRef& key) { return idx; } -const KeyRangeRef tssMappingKeys(LiteralStringRef("\xff/tss/"), LiteralStringRef("\xff/tss0")); +const KeyRangeRef tssMappingKeys("\xff/tss/"_sr, "\xff/tss0"_sr); -const KeyRangeRef tssQuarantineKeys(LiteralStringRef("\xff/tssQ/"), LiteralStringRef("\xff/tssQ0")); +const KeyRangeRef tssQuarantineKeys("\xff/tssQ/"_sr, "\xff/tssQ0"_sr); const Key tssQuarantineKeyFor(UID serverID) { BinaryWriter wr(Unversioned()); @@ -561,22 +557,19 @@ UID decodeTssQuarantineKey(KeyRef const& key) { return serverID; } -const KeyRangeRef tssMismatchKeys(LiteralStringRef("\xff/tssMismatch/"), LiteralStringRef("\xff/tssMismatch0")); +const KeyRangeRef tssMismatchKeys("\xff/tssMismatch/"_sr, "\xff/tssMismatch0"_sr); -const KeyRangeRef serverMetadataKeys(LiteralStringRef("\xff/serverMetadata/"), - LiteralStringRef("\xff/serverMetadata0")); +const KeyRangeRef serverMetadataKeys("\xff/serverMetadata/"_sr, "\xff/serverMetadata0"_sr); -const KeyRangeRef serverTagKeys(LiteralStringRef("\xff/serverTag/"), LiteralStringRef("\xff/serverTag0")); +const KeyRangeRef serverTagKeys("\xff/serverTag/"_sr, "\xff/serverTag0"_sr); const KeyRef serverTagPrefix = serverTagKeys.begin; -const KeyRangeRef serverTagConflictKeys(LiteralStringRef("\xff/serverTagConflict/"), - LiteralStringRef("\xff/serverTagConflict0")); +const KeyRangeRef serverTagConflictKeys("\xff/serverTagConflict/"_sr, "\xff/serverTagConflict0"_sr); const KeyRef serverTagConflictPrefix = serverTagConflictKeys.begin; // serverTagHistoryKeys is the old tag a storage server uses before it is migrated to a different location. // For example, we can copy a SS file to a remote DC and start the SS there; // The new SS will need to consume the last bits of data from the old tag it is responsible for. 
-const KeyRangeRef serverTagHistoryKeys(LiteralStringRef("\xff/serverTagHistory/"), - LiteralStringRef("\xff/serverTagHistory0")); +const KeyRangeRef serverTagHistoryKeys("\xff/serverTagHistory/"_sr, "\xff/serverTagHistory0"_sr); const KeyRef serverTagHistoryPrefix = serverTagHistoryKeys.begin; const Key serverTagKeyFor(UID serverID) { @@ -661,8 +654,7 @@ const Key serverTagConflictKeyFor(Tag tag) { return wr.toValue(); } -const KeyRangeRef tagLocalityListKeys(LiteralStringRef("\xff/tagLocalityList/"), - LiteralStringRef("\xff/tagLocalityList0")); +const KeyRangeRef tagLocalityListKeys("\xff/tagLocalityList/"_sr, "\xff/tagLocalityList0"_sr); const KeyRef tagLocalityListPrefix = tagLocalityListKeys.begin; const Key tagLocalityListKeyFor(Optional dcID) { @@ -690,8 +682,7 @@ int8_t decodeTagLocalityListValue(ValueRef const& value) { return s; } -const KeyRangeRef datacenterReplicasKeys(LiteralStringRef("\xff\x02/datacenterReplicas/"), - LiteralStringRef("\xff\x02/datacenterReplicas0")); +const KeyRangeRef datacenterReplicasKeys("\xff\x02/datacenterReplicas/"_sr, "\xff\x02/datacenterReplicas0"_sr); const KeyRef datacenterReplicasPrefix = datacenterReplicasKeys.begin; const Key datacenterReplicasKeyFor(Optional dcID) { @@ -724,8 +715,7 @@ extern const KeyRangeRef tLogDatacentersKeys; extern const KeyRef tLogDatacentersPrefix; const Key tLogDatacentersKeyFor(Optional dcID); -const KeyRangeRef tLogDatacentersKeys(LiteralStringRef("\xff\x02/tLogDatacenters/"), - LiteralStringRef("\xff\x02/tLogDatacenters0")); +const KeyRangeRef tLogDatacentersKeys("\xff\x02/tLogDatacenters/"_sr, "\xff\x02/tLogDatacenters0"_sr); const KeyRef tLogDatacentersPrefix = tLogDatacentersKeys.begin; const Key tLogDatacentersKeyFor(Optional dcID) { @@ -741,10 +731,10 @@ Optional decodeTLogDatacentersKey(KeyRef const& key) { return dcID; } -const KeyRef primaryDatacenterKey = LiteralStringRef("\xff/primaryDatacenter"); +const KeyRef primaryDatacenterKey = "\xff/primaryDatacenter"_sr; // serverListKeys.contains(k) iff k.startsWith( serverListKeys.begin ) because '/'+1 == '0' -const KeyRangeRef serverListKeys(LiteralStringRef("\xff/serverList/"), LiteralStringRef("\xff/serverList0")); +const KeyRangeRef serverListKeys("\xff/serverList/"_sr, "\xff/serverList0"_sr); const KeyRef serverListPrefix = serverListKeys.begin; const Key serverListKeyFor(UID serverID) { @@ -800,11 +790,11 @@ SWVersion decodeSWVersionValue(ValueRef const& value) { } // processClassKeys.contains(k) iff k.startsWith( processClassKeys.begin ) because '/'+1 == '0' -const KeyRangeRef processClassKeys(LiteralStringRef("\xff/processClass/"), LiteralStringRef("\xff/processClass0")); +const KeyRangeRef processClassKeys("\xff/processClass/"_sr, "\xff/processClass0"_sr); const KeyRef processClassPrefix = processClassKeys.begin; -const KeyRef processClassChangeKey = LiteralStringRef("\xff/processClassChanges"); -const KeyRef processClassVersionKey = LiteralStringRef("\xff/processClassChangesVersion"); -const ValueRef processClassVersionValue = LiteralStringRef("1"); +const KeyRef processClassChangeKey = "\xff/processClassChanges"_sr; +const KeyRef processClassVersionKey = "\xff/processClassChangesVersion"_sr; +const ValueRef processClassVersionValue = "1"_sr; const Key processClassKeyFor(StringRef processID) { BinaryWriter wr(Unversioned()); @@ -840,21 +830,23 @@ ProcessClass decodeProcessClassValue(ValueRef const& value) { return s; } -const KeyRangeRef configKeys(LiteralStringRef("\xff/conf/"), LiteralStringRef("\xff/conf0")); +const KeyRangeRef 
configKeys("\xff/conf/"_sr, "\xff/conf0"_sr); const KeyRef configKeysPrefix = configKeys.begin; -const KeyRef perpetualStorageWiggleKey(LiteralStringRef("\xff/conf/perpetual_storage_wiggle")); -const KeyRef perpetualStorageWiggleLocalityKey(LiteralStringRef("\xff/conf/perpetual_storage_wiggle_locality")); -const KeyRef perpetualStorageWiggleIDPrefix( - LiteralStringRef("\xff/storageWiggleID/")); // withSuffix /primary or /remote -const KeyRef perpetualStorageWiggleStatsPrefix( - LiteralStringRef("\xff/storageWiggleStats/")); // withSuffix /primary or /remote +const KeyRef perpetualStorageWiggleKey("\xff/conf/perpetual_storage_wiggle"_sr); +const KeyRef perpetualStorageWiggleLocalityKey("\xff/conf/perpetual_storage_wiggle_locality"_sr); +const KeyRef perpetualStorageWiggleIDPrefix("\xff/storageWiggleID/"_sr); // withSuffix /primary or /remote +const KeyRef perpetualStorageWiggleStatsPrefix("\xff/storageWiggleStats/"_sr); // withSuffix /primary or /remote -const KeyRef triggerDDTeamInfoPrintKey(LiteralStringRef("\xff/triggerDDTeamInfoPrint")); +const KeyRef triggerDDTeamInfoPrintKey("\xff/triggerDDTeamInfoPrint"_sr); -const KeyRangeRef excludedServersKeys(LiteralStringRef("\xff/conf/excluded/"), LiteralStringRef("\xff/conf/excluded0")); +const KeyRef consistencyScanInfoKey = "\xff/consistencyScanInfo"_sr; + +const KeyRef encryptionAtRestModeConfKey("\xff/conf/encryption_at_rest_mode"_sr); + +const KeyRangeRef excludedServersKeys("\xff/conf/excluded/"_sr, "\xff/conf/excluded0"_sr); const KeyRef excludedServersPrefix = excludedServersKeys.begin; -const KeyRef excludedServersVersionKey = LiteralStringRef("\xff/conf/excluded"); +const KeyRef excludedServersVersionKey = "\xff/conf/excluded"_sr; AddressExclusion decodeExcludedServersKey(KeyRef const& key) { ASSERT(key.startsWith(excludedServersPrefix)); // Returns an invalid NetworkAddress if given an invalid key (within the prefix) @@ -869,10 +861,9 @@ std::string encodeExcludedServersKey(AddressExclusion const& addr) { return excludedServersPrefix.toString() + addr.toString(); } -const KeyRangeRef excludedLocalityKeys(LiteralStringRef("\xff/conf/excluded_locality/"), - LiteralStringRef("\xff/conf/excluded_locality0")); +const KeyRangeRef excludedLocalityKeys("\xff/conf/excluded_locality/"_sr, "\xff/conf/excluded_locality0"_sr); const KeyRef excludedLocalityPrefix = excludedLocalityKeys.begin; -const KeyRef excludedLocalityVersionKey = LiteralStringRef("\xff/conf/excluded_locality"); +const KeyRef excludedLocalityVersionKey = "\xff/conf/excluded_locality"_sr; std::string decodeExcludedLocalityKey(KeyRef const& key) { ASSERT(key.startsWith(excludedLocalityPrefix)); return key.removePrefix(excludedLocalityPrefix).toString(); @@ -881,9 +872,9 @@ std::string encodeExcludedLocalityKey(std::string const& locality) { return excludedLocalityPrefix.toString() + locality; } -const KeyRangeRef failedServersKeys(LiteralStringRef("\xff/conf/failed/"), LiteralStringRef("\xff/conf/failed0")); +const KeyRangeRef failedServersKeys("\xff/conf/failed/"_sr, "\xff/conf/failed0"_sr); const KeyRef failedServersPrefix = failedServersKeys.begin; -const KeyRef failedServersVersionKey = LiteralStringRef("\xff/conf/failed"); +const KeyRef failedServersVersionKey = "\xff/conf/failed"_sr; AddressExclusion decodeFailedServersKey(KeyRef const& key) { ASSERT(key.startsWith(failedServersPrefix)); // Returns an invalid NetworkAddress if given an invalid key (within the prefix) @@ -898,10 +889,9 @@ std::string encodeFailedServersKey(AddressExclusion const& addr) { return 
failedServersPrefix.toString() + addr.toString(); } -const KeyRangeRef failedLocalityKeys(LiteralStringRef("\xff/conf/failed_locality/"), - LiteralStringRef("\xff/conf/failed_locality0")); +const KeyRangeRef failedLocalityKeys("\xff/conf/failed_locality/"_sr, "\xff/conf/failed_locality0"_sr); const KeyRef failedLocalityPrefix = failedLocalityKeys.begin; -const KeyRef failedLocalityVersionKey = LiteralStringRef("\xff/conf/failed_locality"); +const KeyRef failedLocalityVersionKey = "\xff/conf/failed_locality"_sr; std::string decodeFailedLocalityKey(KeyRef const& key) { ASSERT(key.startsWith(failedLocalityPrefix)); return key.removePrefix(failedLocalityPrefix).toString(); @@ -910,20 +900,18 @@ std::string encodeFailedLocalityKey(std::string const& locality) { return failedLocalityPrefix.toString() + locality; } -// const KeyRangeRef globalConfigKeys( LiteralStringRef("\xff/globalConfig/"), LiteralStringRef("\xff/globalConfig0") ); +// const KeyRangeRef globalConfigKeys( "\xff/globalConfig/"_sr, "\xff/globalConfig0"_sr ); // const KeyRef globalConfigPrefix = globalConfigKeys.begin; -const KeyRangeRef globalConfigDataKeys(LiteralStringRef("\xff/globalConfig/k/"), - LiteralStringRef("\xff/globalConfig/k0")); +const KeyRangeRef globalConfigDataKeys("\xff/globalConfig/k/"_sr, "\xff/globalConfig/k0"_sr); const KeyRef globalConfigKeysPrefix = globalConfigDataKeys.begin; -const KeyRangeRef globalConfigHistoryKeys(LiteralStringRef("\xff/globalConfig/h/"), - LiteralStringRef("\xff/globalConfig/h0")); +const KeyRangeRef globalConfigHistoryKeys("\xff/globalConfig/h/"_sr, "\xff/globalConfig/h0"_sr); const KeyRef globalConfigHistoryPrefix = globalConfigHistoryKeys.begin; -const KeyRef globalConfigVersionKey = LiteralStringRef("\xff/globalConfig/v"); +const KeyRef globalConfigVersionKey = "\xff/globalConfig/v"_sr; -const KeyRangeRef workerListKeys(LiteralStringRef("\xff/worker/"), LiteralStringRef("\xff/worker0")); +const KeyRangeRef workerListKeys("\xff/worker/"_sr, "\xff/worker0"_sr); const KeyRef workerListPrefix = workerListKeys.begin; const Key workerListKeyFor(StringRef processID) { @@ -953,11 +941,10 @@ ProcessData decodeWorkerListValue(ValueRef const& value) { return s; } -const KeyRangeRef backupProgressKeys(LiteralStringRef("\xff\x02/backupProgress/"), - LiteralStringRef("\xff\x02/backupProgress0")); +const KeyRangeRef backupProgressKeys("\xff\x02/backupProgress/"_sr, "\xff\x02/backupProgress0"_sr); const KeyRef backupProgressPrefix = backupProgressKeys.begin; -const KeyRef backupStartedKey = LiteralStringRef("\xff\x02/backupStarted"); -extern const KeyRef backupPausedKey = LiteralStringRef("\xff\x02/backupPaused"); +const KeyRef backupStartedKey = "\xff\x02/backupStarted"_sr; +extern const KeyRef backupPausedKey = "\xff\x02/backupPaused"_sr; const Key backupProgressKeyFor(UID workerID) { BinaryWriter wr(Unversioned()); @@ -1000,98 +987,91 @@ std::vector> decodeBackupStartedValue(const ValueRef& va return ids; } -const KeyRef coordinatorsKey = LiteralStringRef("\xff/coordinators"); -const KeyRef logsKey = LiteralStringRef("\xff/logs"); -const KeyRef minRequiredCommitVersionKey = LiteralStringRef("\xff/minRequiredCommitVersion"); -const KeyRef versionEpochKey = LiteralStringRef("\xff/versionEpoch"); +const KeyRef previousCoordinatorsKey = "\xff/previousCoordinators"_sr; +const KeyRef coordinatorsKey = "\xff/coordinators"_sr; +const KeyRef logsKey = "\xff/logs"_sr; +const KeyRef minRequiredCommitVersionKey = "\xff/minRequiredCommitVersion"_sr; +const KeyRef versionEpochKey = "\xff/versionEpoch"_sr; 
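// ---------------------------------------------------------------------------
// Editor's note (illustration, not part of the patch): the hunks in this file are a mechanical
// migration from LiteralStringRef("...") to the equivalent "..."_sr literal form. Several of the
// ranges above also rely on the convention called out in the comments ("'/'+1 == '0'"): a range's
// end key is the begin prefix with its final byte incremented, so KeyRangeRef(prefix, end)
// contains exactly the keys that start with the prefix. The self-contained sketch below uses plain
// std::string (not FoundationDB types) to show why byte-wise ordering makes that work.
// ---------------------------------------------------------------------------
#include <cassert>
#include <string>

int main() {
    std::string prefix = "\xff/serverList/";
    std::string end = prefix;
    end.back() += 1; // '/' + 1 == '0'  =>  "\xff/serverList0"

    std::string inRange = prefix + "some-server-id";     // starts with the prefix
    std::string outOfRange = "\xff/serverList0trailing"; // does not start with the prefix

    // Byte-wise (lexicographic) comparison, the same ordering used for keys.
    assert(prefix <= inRange && inRange < end); // every key with the prefix falls in [prefix, end)
    assert(!(outOfRange < end));                // anything at or past the incremented byte is excluded
    return 0;
}
// ---------------------------------------------------------------------------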
-const KeyRef globalKeysPrefix = LiteralStringRef("\xff/globals"); -const KeyRef lastEpochEndKey = LiteralStringRef("\xff/globals/lastEpochEnd"); -const KeyRef lastEpochEndPrivateKey = LiteralStringRef("\xff\xff/globals/lastEpochEnd"); -const KeyRef killStorageKey = LiteralStringRef("\xff/globals/killStorage"); -const KeyRef killStoragePrivateKey = LiteralStringRef("\xff\xff/globals/killStorage"); -const KeyRef rebootWhenDurableKey = LiteralStringRef("\xff/globals/rebootWhenDurable"); -const KeyRef rebootWhenDurablePrivateKey = LiteralStringRef("\xff\xff/globals/rebootWhenDurable"); -const KeyRef primaryLocalityKey = LiteralStringRef("\xff/globals/primaryLocality"); -const KeyRef primaryLocalityPrivateKey = LiteralStringRef("\xff\xff/globals/primaryLocality"); -const KeyRef fastLoggingEnabled = LiteralStringRef("\xff/globals/fastLoggingEnabled"); -const KeyRef fastLoggingEnabledPrivateKey = LiteralStringRef("\xff\xff/globals/fastLoggingEnabled"); +const KeyRef globalKeysPrefix = "\xff/globals"_sr; +const KeyRef lastEpochEndKey = "\xff/globals/lastEpochEnd"_sr; +const KeyRef lastEpochEndPrivateKey = "\xff\xff/globals/lastEpochEnd"_sr; +const KeyRef killStorageKey = "\xff/globals/killStorage"_sr; +const KeyRef killStoragePrivateKey = "\xff\xff/globals/killStorage"_sr; +const KeyRef rebootWhenDurableKey = "\xff/globals/rebootWhenDurable"_sr; +const KeyRef rebootWhenDurablePrivateKey = "\xff\xff/globals/rebootWhenDurable"_sr; +const KeyRef primaryLocalityKey = "\xff/globals/primaryLocality"_sr; +const KeyRef primaryLocalityPrivateKey = "\xff\xff/globals/primaryLocality"_sr; +const KeyRef fastLoggingEnabled = "\xff/globals/fastLoggingEnabled"_sr; +const KeyRef fastLoggingEnabledPrivateKey = "\xff\xff/globals/fastLoggingEnabled"_sr; // Whenever configuration changes or DD related system keyspace is changed(e.g.., serverList), // actor must grab the moveKeysLockOwnerKey and update moveKeysLockWriteKey. // This prevents concurrent write to the same system keyspace. 
// When the owner of the DD related system keyspace changes, DD will reboot -const KeyRef moveKeysLockOwnerKey = LiteralStringRef("\xff/moveKeysLock/Owner"); -const KeyRef moveKeysLockWriteKey = LiteralStringRef("\xff/moveKeysLock/Write"); +const KeyRef moveKeysLockOwnerKey = "\xff/moveKeysLock/Owner"_sr; +const KeyRef moveKeysLockWriteKey = "\xff/moveKeysLock/Write"_sr; -const KeyRef dataDistributionModeKey = LiteralStringRef("\xff/dataDistributionMode"); +const KeyRef dataDistributionModeKey = "\xff/dataDistributionMode"_sr; const UID dataDistributionModeLock = UID(6345, 3425); // Keys to view and control tag throttling -const KeyRangeRef tagThrottleKeys = - KeyRangeRef(LiteralStringRef("\xff\x02/throttledTags/tag/"), LiteralStringRef("\xff\x02/throttledTags/tag0")); +const KeyRangeRef tagThrottleKeys = KeyRangeRef("\xff\x02/throttledTags/tag/"_sr, "\xff\x02/throttledTags/tag0"_sr); const KeyRef tagThrottleKeysPrefix = tagThrottleKeys.begin; -const KeyRef tagThrottleAutoKeysPrefix = LiteralStringRef("\xff\x02/throttledTags/tag/\x01"); -const KeyRef tagThrottleSignalKey = LiteralStringRef("\xff\x02/throttledTags/signal"); -const KeyRef tagThrottleAutoEnabledKey = LiteralStringRef("\xff\x02/throttledTags/autoThrottlingEnabled"); -const KeyRef tagThrottleLimitKey = LiteralStringRef("\xff\x02/throttledTags/manualThrottleLimit"); -const KeyRef tagThrottleCountKey = LiteralStringRef("\xff\x02/throttledTags/manualThrottleCount"); +const KeyRef tagThrottleAutoKeysPrefix = "\xff\x02/throttledTags/tag/\x01"_sr; +const KeyRef tagThrottleSignalKey = "\xff\x02/throttledTags/signal"_sr; +const KeyRef tagThrottleAutoEnabledKey = "\xff\x02/throttledTags/autoThrottlingEnabled"_sr; +const KeyRef tagThrottleLimitKey = "\xff\x02/throttledTags/manualThrottleLimit"_sr; +const KeyRef tagThrottleCountKey = "\xff\x02/throttledTags/manualThrottleCount"_sr; // Client status info prefix -const KeyRangeRef fdbClientInfoPrefixRange(LiteralStringRef("\xff\x02/fdbClientInfo/"), - LiteralStringRef("\xff\x02/fdbClientInfo0")); +const KeyRangeRef fdbClientInfoPrefixRange("\xff\x02/fdbClientInfo/"_sr, "\xff\x02/fdbClientInfo0"_sr); // See remaining fields in GlobalConfig.actor.h // ConsistencyCheck settings -const KeyRef fdbShouldConsistencyCheckBeSuspended = LiteralStringRef("\xff\x02/ConsistencyCheck/Suspend"); +const KeyRef fdbShouldConsistencyCheckBeSuspended = "\xff\x02/ConsistencyCheck/Suspend"_sr; // Request latency measurement key -const KeyRef latencyBandConfigKey = LiteralStringRef("\xff\x02/latencyBandConfig"); +const KeyRef latencyBandConfigKey = "\xff\x02/latencyBandConfig"_sr; // Keyspace to maintain wall clock to version map -const KeyRangeRef timeKeeperPrefixRange(LiteralStringRef("\xff\x02/timeKeeper/map/"), - LiteralStringRef("\xff\x02/timeKeeper/map0")); -const KeyRef timeKeeperVersionKey = LiteralStringRef("\xff\x02/timeKeeper/version"); -const KeyRef timeKeeperDisableKey = LiteralStringRef("\xff\x02/timeKeeper/disable"); +const KeyRangeRef timeKeeperPrefixRange("\xff\x02/timeKeeper/map/"_sr, "\xff\x02/timeKeeper/map0"_sr); +const KeyRef timeKeeperVersionKey = "\xff\x02/timeKeeper/version"_sr; +const KeyRef timeKeeperDisableKey = "\xff\x02/timeKeeper/disable"_sr; // Backup Log Mutation constant variables -const KeyRef backupEnabledKey = LiteralStringRef("\xff/backupEnabled"); -const KeyRangeRef backupLogKeys(LiteralStringRef("\xff\x02/blog/"), LiteralStringRef("\xff\x02/blog0")); -const KeyRangeRef applyLogKeys(LiteralStringRef("\xff\x02/alog/"), LiteralStringRef("\xff\x02/alog0")); +const KeyRef 
backupEnabledKey = "\xff/backupEnabled"_sr; +const KeyRangeRef backupLogKeys("\xff\x02/blog/"_sr, "\xff\x02/blog0"_sr); +const KeyRangeRef applyLogKeys("\xff\x02/alog/"_sr, "\xff\x02/alog0"_sr); // static_assert( backupLogKeys.begin.size() == backupLogPrefixBytes, "backupLogPrefixBytes incorrect" ); -const KeyRef backupVersionKey = LiteralStringRef("\xff/backupDataFormat"); -const ValueRef backupVersionValue = LiteralStringRef("4"); +const KeyRef backupVersionKey = "\xff/backupDataFormat"_sr; +const ValueRef backupVersionValue = "4"_sr; const int backupVersion = 4; // Log Range constant variables // \xff/logRanges/[16-byte UID][begin key] := serialize( make_pair([end key], [destination key prefix]), // IncludeVersion() ) -const KeyRangeRef logRangesRange(LiteralStringRef("\xff/logRanges/"), LiteralStringRef("\xff/logRanges0")); +const KeyRangeRef logRangesRange("\xff/logRanges/"_sr, "\xff/logRanges0"_sr); // Layer status metadata prefix -const KeyRangeRef layerStatusMetaPrefixRange(LiteralStringRef("\xff\x02/status/"), - LiteralStringRef("\xff\x02/status0")); +const KeyRangeRef layerStatusMetaPrefixRange("\xff\x02/status/"_sr, "\xff\x02/status0"_sr); // Backup agent status root -const KeyRangeRef backupStatusPrefixRange(LiteralStringRef("\xff\x02/backupstatus/"), - LiteralStringRef("\xff\x02/backupstatus0")); +const KeyRangeRef backupStatusPrefixRange("\xff\x02/backupstatus/"_sr, "\xff\x02/backupstatus0"_sr); // Restore configuration constant variables -const KeyRangeRef fileRestorePrefixRange(LiteralStringRef("\xff\x02/restore-agent/"), - LiteralStringRef("\xff\x02/restore-agent0")); +const KeyRangeRef fileRestorePrefixRange("\xff\x02/restore-agent/"_sr, "\xff\x02/restore-agent0"_sr); // Backup Agent configuration constant variables -const KeyRangeRef fileBackupPrefixRange(LiteralStringRef("\xff\x02/backup-agent/"), - LiteralStringRef("\xff\x02/backup-agent0")); +const KeyRangeRef fileBackupPrefixRange("\xff\x02/backup-agent/"_sr, "\xff\x02/backup-agent0"_sr); // DR Agent configuration constant variables -const KeyRangeRef databaseBackupPrefixRange(LiteralStringRef("\xff\x02/db-backup-agent/"), - LiteralStringRef("\xff\x02/db-backup-agent0")); +const KeyRangeRef databaseBackupPrefixRange("\xff\x02/db-backup-agent/"_sr, "\xff\x02/db-backup-agent0"_sr); // \xff\x02/sharedLogRangesConfig/destUidLookup/[keyRange] -const KeyRef destUidLookupPrefix = LiteralStringRef("\xff\x02/sharedLogRangesConfig/destUidLookup/"); +const KeyRef destUidLookupPrefix = "\xff\x02/sharedLogRangesConfig/destUidLookup/"_sr; // \xff\x02/sharedLogRangesConfig/backuplatestVersions/[destUid]/[logUid] -const KeyRef backupLatestVersionsPrefix = LiteralStringRef("\xff\x02/sharedLogRangesConfig/backupLatestVersions/"); +const KeyRef backupLatestVersionsPrefix = "\xff\x02/sharedLogRangesConfig/backupLatestVersions/"_sr; // Returns the encoded key comprised of begin key and log uid Key logRangesEncodeKey(KeyRef keyBegin, UID logUid) { @@ -1146,31 +1126,27 @@ Key uidPrefixKey(KeyRef keyPrefix, UID logUid) { // Apply mutations constant variables // \xff/applyMutationsEnd/[16-byte UID] := serialize( endVersion, Unversioned() ) // This indicates what is the highest version the mutation log can be applied -const KeyRangeRef applyMutationsEndRange(LiteralStringRef("\xff/applyMutationsEnd/"), - LiteralStringRef("\xff/applyMutationsEnd0")); +const KeyRangeRef applyMutationsEndRange("\xff/applyMutationsEnd/"_sr, "\xff/applyMutationsEnd0"_sr); // \xff/applyMutationsBegin/[16-byte UID] := serialize( beginVersion, Unversioned() ) -const 
KeyRangeRef applyMutationsBeginRange(LiteralStringRef("\xff/applyMutationsBegin/"), - LiteralStringRef("\xff/applyMutationsBegin0")); +const KeyRangeRef applyMutationsBeginRange("\xff/applyMutationsBegin/"_sr, "\xff/applyMutationsBegin0"_sr); // \xff/applyMutationsAddPrefix/[16-byte UID] := addPrefix -const KeyRangeRef applyMutationsAddPrefixRange(LiteralStringRef("\xff/applyMutationsAddPrefix/"), - LiteralStringRef("\xff/applyMutationsAddPrefix0")); +const KeyRangeRef applyMutationsAddPrefixRange("\xff/applyMutationsAddPrefix/"_sr, "\xff/applyMutationsAddPrefix0"_sr); // \xff/applyMutationsRemovePrefix/[16-byte UID] := removePrefix -const KeyRangeRef applyMutationsRemovePrefixRange(LiteralStringRef("\xff/applyMutationsRemovePrefix/"), - LiteralStringRef("\xff/applyMutationsRemovePrefix0")); +const KeyRangeRef applyMutationsRemovePrefixRange("\xff/applyMutationsRemovePrefix/"_sr, + "\xff/applyMutationsRemovePrefix0"_sr); -const KeyRangeRef applyMutationsKeyVersionMapRange(LiteralStringRef("\xff/applyMutationsKeyVersionMap/"), - LiteralStringRef("\xff/applyMutationsKeyVersionMap0")); -const KeyRangeRef applyMutationsKeyVersionCountRange(LiteralStringRef("\xff\x02/applyMutationsKeyVersionCount/"), - LiteralStringRef("\xff\x02/applyMutationsKeyVersionCount0")); +const KeyRangeRef applyMutationsKeyVersionMapRange("\xff/applyMutationsKeyVersionMap/"_sr, + "\xff/applyMutationsKeyVersionMap0"_sr); +const KeyRangeRef applyMutationsKeyVersionCountRange("\xff\x02/applyMutationsKeyVersionCount/"_sr, + "\xff\x02/applyMutationsKeyVersionCount0"_sr); -const KeyRef systemTuplesPrefix = LiteralStringRef("\xff/a/"); -const KeyRef metricConfChangeKey = LiteralStringRef("\x01TDMetricConfChanges\x00"); +const KeyRef systemTuplesPrefix = "\xff/a/"_sr; +const KeyRef metricConfChangeKey = "\x01TDMetricConfChanges\x00"_sr; -const KeyRangeRef metricConfKeys(LiteralStringRef("\x01TDMetricConf\x00\x01"), - LiteralStringRef("\x01TDMetricConf\x00\x02")); +const KeyRangeRef metricConfKeys("\x01TDMetricConf\x00\x01"_sr, "\x01TDMetricConf\x00\x02"_sr); const KeyRef metricConfPrefix = metricConfKeys.begin; /* @@ -1179,15 +1155,15 @@ const Key metricConfKey( KeyRef const& prefix, MetricNameRef const& name, KeyRef wr.serializeBytes( prefix ); wr.serializeBytes( metricConfPrefix ); wr.serializeBytes( name.type ); - wr.serializeBytes( LiteralStringRef("\x00\x01") ); + wr.serializeBytes( "\x00\x01"_sr ); wr.serializeBytes( name.name ); - wr.serializeBytes( LiteralStringRef("\x00\x01") ); + wr.serializeBytes( "\x00\x01"_sr ); wr.serializeBytes( name.address ); - wr.serializeBytes( LiteralStringRef("\x00\x01") ); + wr.serializeBytes( "\x00\x01"_sr ); wr.serializeBytes( name.id ); - wr.serializeBytes( LiteralStringRef("\x00\x01") ); + wr.serializeBytes( "\x00\x01"_sr ); wr.serializeBytes( key ); - wr.serializeBytes( LiteralStringRef("\x00") ); + wr.serializeBytes( "\x00"_sr ); return wr.toValue(); } @@ -1210,23 +1186,22 @@ std::pair decodeMetricConfKey( KeyRef const& prefix, KeyR } */ -const KeyRef maxUIDKey = LiteralStringRef("\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"); +const KeyRef maxUIDKey = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"_sr; -const KeyRef databaseLockedKey = LiteralStringRef("\xff/dbLocked"); -const KeyRef databaseLockedKeyEnd = LiteralStringRef("\xff/dbLocked\x00"); -const KeyRef metadataVersionKey = LiteralStringRef("\xff/metadataVersion"); -const KeyRef metadataVersionKeyEnd = LiteralStringRef("\xff/metadataVersion\x00"); -const KeyRef 
metadataVersionRequiredValue = - LiteralStringRef("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"); -const KeyRef mustContainSystemMutationsKey = LiteralStringRef("\xff/mustContainSystemMutations"); +const KeyRef databaseLockedKey = "\xff/dbLocked"_sr; +const KeyRef databaseLockedKeyEnd = "\xff/dbLocked\x00"_sr; +const KeyRef metadataVersionKey = "\xff/metadataVersion"_sr; +const KeyRef metadataVersionKeyEnd = "\xff/metadataVersion\x00"_sr; +const KeyRef metadataVersionRequiredValue = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"_sr; +const KeyRef mustContainSystemMutationsKey = "\xff/mustContainSystemMutations"_sr; -const KeyRangeRef monitorConfKeys(LiteralStringRef("\xff\x02/monitorConf/"), LiteralStringRef("\xff\x02/monitorConf0")); +const KeyRangeRef monitorConfKeys("\xff\x02/monitorConf/"_sr, "\xff\x02/monitorConf0"_sr); -const KeyRef restoreRequestDoneKey = LiteralStringRef("\xff\x02/restoreRequestDone"); +const KeyRef restoreRequestDoneKey = "\xff\x02/restoreRequestDone"_sr; -const KeyRef healthyZoneKey = LiteralStringRef("\xff\x02/healthyZone"); -const StringRef ignoreSSFailuresZoneString = LiteralStringRef("IgnoreSSFailures"); -const KeyRef rebalanceDDIgnoreKey = LiteralStringRef("\xff\x02/rebalanceDDIgnored"); +const KeyRef healthyZoneKey = "\xff\x02/healthyZone"_sr; +const StringRef ignoreSSFailuresZoneString = "IgnoreSSFailures"_sr; +const KeyRef rebalanceDDIgnoreKey = "\xff\x02/rebalanceDDIgnored"_sr; const Value healthyZoneValue(StringRef const& zoneId, Version version) { BinaryWriter wr(IncludeVersion(ProtocolVersion::withHealthyZoneValue())); @@ -1243,16 +1218,15 @@ std::pair decodeHealthyZoneValue(ValueRef const& value) { return std::make_pair(zoneId, version); } -const KeyRangeRef testOnlyTxnStateStorePrefixRange(LiteralStringRef("\xff/TESTONLYtxnStateStore/"), - LiteralStringRef("\xff/TESTONLYtxnStateStore0")); +const KeyRangeRef testOnlyTxnStateStorePrefixRange("\xff/TESTONLYtxnStateStore/"_sr, "\xff/TESTONLYtxnStateStore0"_sr); -const KeyRef writeRecoveryKey = LiteralStringRef("\xff/writeRecovery"); -const ValueRef writeRecoveryKeyTrue = LiteralStringRef("1"); -const KeyRef snapshotEndVersionKey = LiteralStringRef("\xff/snapshotEndVersion"); +const KeyRef writeRecoveryKey = "\xff/writeRecovery"_sr; +const ValueRef writeRecoveryKeyTrue = "1"_sr; +const KeyRef snapshotEndVersionKey = "\xff/snapshotEndVersion"_sr; -const KeyRangeRef changeFeedKeys(LiteralStringRef("\xff\x02/feed/"), LiteralStringRef("\xff\x02/feed0")); +const KeyRangeRef changeFeedKeys("\xff\x02/feed/"_sr, "\xff\x02/feed0"_sr); const KeyRef changeFeedPrefix = changeFeedKeys.begin; -const KeyRef changeFeedPrivatePrefix = LiteralStringRef("\xff\xff\x02/feed/"); +const KeyRef changeFeedPrivatePrefix = "\xff\xff\x02/feed/"_sr; const Value changeFeedValue(KeyRangeRef const& range, Version popVersion, ChangeFeedStatus status) { BinaryWriter wr(IncludeVersion(ProtocolVersion::withChangeFeed())); @@ -1273,7 +1247,7 @@ std::tuple decodeChangeFeedValue(ValueRef c return std::make_tuple(range, version, status); } -const KeyRangeRef changeFeedDurableKeys(LiteralStringRef("\xff\xff/cf/"), LiteralStringRef("\xff\xff/cf0")); +const KeyRangeRef changeFeedDurableKeys("\xff\xff/cf/"_sr, "\xff\xff/cf0"_sr); const KeyRef changeFeedDurablePrefix = changeFeedDurableKeys.begin; const Value changeFeedDurableKey(Key const& feed, Version version) { @@ -1313,9 +1287,9 @@ const KeyRangeRef configClassKeys("\xff\xff/configClasses/"_sr, "\xff\xff/config // key to watch for changes in active blob ranges + 
KeyRangeMap of active blob ranges // Blob Manager + Worker stuff is all \xff\x02 to avoid Transaction State Store -const KeyRef blobRangeChangeKey = LiteralStringRef("\xff\x02/blobRangeChange"); -const KeyRangeRef blobRangeKeys(LiteralStringRef("\xff\x02/blobRange/"), LiteralStringRef("\xff\x02/blobRange0")); -const KeyRef blobManagerEpochKey = LiteralStringRef("\xff\x02/blobManagerEpoch"); +const KeyRef blobRangeChangeKey = "\xff\x02/blobRangeChange"_sr; +const KeyRangeRef blobRangeKeys("\xff\x02/blobRange/"_sr, "\xff\x02/blobRange0"_sr); +const KeyRef blobManagerEpochKey = "\xff\x02/blobManagerEpoch"_sr; const Value blobManagerEpochValueFor(int64_t epoch) { BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule())); @@ -1331,21 +1305,19 @@ int64_t decodeBlobManagerEpochValue(ValueRef const& value) { } // blob granule data -const KeyRef blobRangeActive = LiteralStringRef("1"); -const KeyRef blobRangeInactive = LiteralStringRef("0"); +const KeyRef blobRangeActive = "1"_sr; +const KeyRef blobRangeInactive = StringRef(); -const KeyRangeRef blobGranuleFileKeys(LiteralStringRef("\xff\x02/bgf/"), LiteralStringRef("\xff\x02/bgf0")); -const KeyRangeRef blobGranuleMappingKeys(LiteralStringRef("\xff\x02/bgm/"), LiteralStringRef("\xff\x02/bgm0")); -const KeyRangeRef blobGranuleLockKeys(LiteralStringRef("\xff\x02/bgl/"), LiteralStringRef("\xff\x02/bgl0")); -const KeyRangeRef blobGranuleSplitKeys(LiteralStringRef("\xff\x02/bgs/"), LiteralStringRef("\xff\x02/bgs0")); -const KeyRangeRef blobGranuleMergeKeys(LiteralStringRef("\xff\x02/bgmerge/"), LiteralStringRef("\xff\x02/bgmerge0")); -const KeyRangeRef blobGranuleMergeBoundaryKeys(LiteralStringRef("\xff\x02/bgmergebounds/"), - LiteralStringRef("\xff\x02/bgmergebounds0")); -const KeyRangeRef blobGranuleHistoryKeys(LiteralStringRef("\xff\x02/bgh/"), LiteralStringRef("\xff\x02/bgh0")); -const KeyRangeRef blobGranulePurgeKeys(LiteralStringRef("\xff\x02/bgp/"), LiteralStringRef("\xff\x02/bgp0")); -const KeyRangeRef blobGranuleForcePurgedKeys(LiteralStringRef("\xff\x02/bgpforce/"), - LiteralStringRef("\xff\x02/bgpforce0")); -const KeyRef blobGranulePurgeChangeKey = LiteralStringRef("\xff\x02/bgpChange"); +const KeyRangeRef blobGranuleFileKeys("\xff\x02/bgf/"_sr, "\xff\x02/bgf0"_sr); +const KeyRangeRef blobGranuleMappingKeys("\xff\x02/bgm/"_sr, "\xff\x02/bgm0"_sr); +const KeyRangeRef blobGranuleLockKeys("\xff\x02/bgl/"_sr, "\xff\x02/bgl0"_sr); +const KeyRangeRef blobGranuleSplitKeys("\xff\x02/bgs/"_sr, "\xff\x02/bgs0"_sr); +const KeyRangeRef blobGranuleMergeKeys("\xff\x02/bgmerge/"_sr, "\xff\x02/bgmerge0"_sr); +const KeyRangeRef blobGranuleMergeBoundaryKeys("\xff\x02/bgmergebounds/"_sr, "\xff\x02/bgmergebounds0"_sr); +const KeyRangeRef blobGranuleHistoryKeys("\xff\x02/bgh/"_sr, "\xff\x02/bgh0"_sr); +const KeyRangeRef blobGranulePurgeKeys("\xff\x02/bgp/"_sr, "\xff\x02/bgp0"_sr); +const KeyRangeRef blobGranuleForcePurgedKeys("\xff\x02/bgpforce/"_sr, "\xff\x02/bgpforce0"_sr); +const KeyRef blobGranulePurgeChangeKey = "\xff\x02/bgpChange"_sr; const uint8_t BG_FILE_TYPE_DELTA = 'D'; const uint8_t BG_FILE_TYPE_SNAPSHOT = 'S'; @@ -1623,7 +1595,7 @@ Standalone decodeBlobGranuleHistoryValue(const ValueRef return historyValue; } -const KeyRangeRef blobWorkerListKeys(LiteralStringRef("\xff\x02/bwList/"), LiteralStringRef("\xff\x02/bwList0")); +const KeyRangeRef blobWorkerListKeys("\xff\x02/bwList/"_sr, "\xff\x02/bwList0"_sr); const Key blobWorkerListKeyFor(UID workerID) { BinaryWriter wr(AssumeVersion(ProtocolVersion::withBlobGranule())); @@ -1650,7 +1622,7 @@ 
BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value) { return interf; } -const KeyRangeRef storageQuotaKeys(LiteralStringRef("\xff/storageQuota/"), LiteralStringRef("\xff/storageQuota0")); +const KeyRangeRef storageQuotaKeys("\xff/storageQuota/"_sr, "\xff/storageQuota0"_sr); const KeyRef storageQuotaPrefix = storageQuotaKeys.begin; Key storageQuotaKey(StringRef tenantName) { diff --git a/fdbclient/TagThrottle.actor.cpp b/fdbclient/TagThrottle.actor.cpp index 856c111dea..7a1712c4df 100644 --- a/fdbclient/TagThrottle.actor.cpp +++ b/fdbclient/TagThrottle.actor.cpp @@ -136,12 +136,11 @@ Key ThrottleApi::getTagQuotaKey(TransactionTagRef tag) { } bool ThrottleApi::TagQuotaValue::isValid() const { - return reservedReadQuota <= totalReadQuota && reservedWriteQuota <= totalWriteQuota && reservedReadQuota >= 0 && - reservedWriteQuota >= 0; + return reservedQuota <= totalQuota && reservedQuota >= 0; } Value ThrottleApi::TagQuotaValue::toValue() const { - return Tuple::makeTuple(reservedReadQuota, totalReadQuota, reservedWriteQuota, totalWriteQuota).pack(); + return Tuple::makeTuple(reservedQuota, totalQuota).pack(); } ThrottleApi::TagQuotaValue ThrottleApi::TagQuotaValue::fromValue(ValueRef value) { @@ -151,20 +150,16 @@ ThrottleApi::TagQuotaValue ThrottleApi::TagQuotaValue::fromValue(ValueRef value) } TagQuotaValue result; try { - result.reservedReadQuota = tuple.getDouble(0); - result.totalReadQuota = tuple.getDouble(1); - result.reservedWriteQuota = tuple.getDouble(2); - result.totalWriteQuota = tuple.getDouble(3); + result.reservedQuota = tuple.getDouble(0); + result.totalQuota = tuple.getDouble(1); } catch (Error& e) { TraceEvent(SevWarnAlways, "TagQuotaValueFailedToDeserialize").error(e); throw invalid_throttle_quota_value(); } if (!result.isValid()) { TraceEvent(SevWarnAlways, "TagQuotaValueInvalidQuotas") - .detail("ReservedReadQuota", result.reservedReadQuota) - .detail("TotalReadQuota", result.totalReadQuota) - .detail("ReservedWriteQuota", result.reservedWriteQuota) - .detail("TotalWriteQuota", result.totalWriteQuota); + .detail("ReservedQuota", result.reservedQuota) + .detail("TotalQuota", result.totalQuota); throw invalid_throttle_quota_value(); } return result; diff --git a/fdbclient/TaskBucket.actor.cpp b/fdbclient/TaskBucket.actor.cpp index d5bab71971..2e72b301c0 100644 --- a/fdbclient/TaskBucket.actor.cpp +++ b/fdbclient/TaskBucket.actor.cpp @@ -66,7 +66,7 @@ struct UnblockFutureTaskFunc : TaskFuncBase { return Void(); } }; -StringRef UnblockFutureTaskFunc::name = LiteralStringRef("UnblockFuture"); +StringRef UnblockFutureTaskFunc::name = "UnblockFuture"_sr; REGISTER_TASKFUNC(UnblockFutureTaskFunc); struct AddTaskFunc : TaskFuncBase { @@ -88,7 +88,7 @@ struct AddTaskFunc : TaskFuncBase { return Void(); }; }; -StringRef AddTaskFunc::name = LiteralStringRef("AddTask"); +StringRef AddTaskFunc::name = "AddTask"_sr; REGISTER_TASKFUNC(AddTaskFunc); struct IdleTaskFunc : TaskFuncBase { @@ -109,18 +109,18 @@ struct IdleTaskFunc : TaskFuncBase { return tb->finish(tr, task); }; }; -StringRef IdleTaskFunc::name = LiteralStringRef("idle"); +StringRef IdleTaskFunc::name = "idle"_sr; REGISTER_TASKFUNC(IdleTaskFunc); -Key Task::reservedTaskParamKeyType = LiteralStringRef("type"); -Key Task::reservedTaskParamKeyAddTask = LiteralStringRef("_add_task"); -Key Task::reservedTaskParamKeyDone = LiteralStringRef("done"); -Key Task::reservedTaskParamKeyPriority = LiteralStringRef("priority"); -Key Task::reservedTaskParamKeyFuture = LiteralStringRef("future"); -Key 
Task::reservedTaskParamKeyBlockID = LiteralStringRef("blockid"); -Key Task::reservedTaskParamKeyVersion = LiteralStringRef("version"); -Key Task::reservedTaskParamValidKey = LiteralStringRef("_validkey"); -Key Task::reservedTaskParamValidValue = LiteralStringRef("_validvalue"); +Key Task::reservedTaskParamKeyType = "type"_sr; +Key Task::reservedTaskParamKeyAddTask = "_add_task"_sr; +Key Task::reservedTaskParamKeyDone = "done"_sr; +Key Task::reservedTaskParamKeyPriority = "priority"_sr; +Key Task::reservedTaskParamKeyFuture = "future"_sr; +Key Task::reservedTaskParamKeyBlockID = "blockid"_sr; +Key Task::reservedTaskParamKeyVersion = "version"_sr; +Key Task::reservedTaskParamValidKey = "_validkey"_sr; +Key Task::reservedTaskParamValidValue = "_validvalue"_sr; // IMPORTANT: Task() must result in an EMPTY parameter set, so params should only // be set for non-default constructor arguments. To change this behavior look at all @@ -412,7 +412,7 @@ public: Reference futureBucket, Reference task) { state Reference taskFunc; - state VerifyTask verifyTask = false; + state VerifyTask verifyTask(false); if (!task || !TaskFuncBase::isValidTask(task)) return false; @@ -722,7 +722,7 @@ public: Reference taskBucket) { taskBucket->setOptions(tr); - Optional val = wait(tr->get(taskBucket->prefix.pack(LiteralStringRef("task_count")))); + Optional val = wait(tr->get(taskBucket->prefix.pack("task_count"_sr))); if (!val.present()) return 0; @@ -873,10 +873,10 @@ TaskBucket::TaskBucket(const Subspace& subspace, : cc("TaskBucket"), dispatchSlotChecksStarted("DispatchSlotChecksStarted", cc), dispatchErrors("DispatchErrors", cc), dispatchDoTasks("DispatchDoTasks", cc), dispatchEmptyTasks("DispatchEmptyTasks", cc), dispatchSlotChecksComplete("DispatchSlotChecksComplete", cc), dbgid(deterministicRandom()->randomUniqueID()), - prefix(subspace), active(prefix.get(LiteralStringRef("ac"))), pauseKey(prefix.pack(LiteralStringRef("pause"))), - available(prefix.get(LiteralStringRef("av"))), available_prioritized(prefix.get(LiteralStringRef("avp"))), - timeouts(prefix.get(LiteralStringRef("to"))), timeout(CLIENT_KNOBS->TASKBUCKET_TIMEOUT_VERSIONS), - system_access(sysAccess), priority_batch(priorityBatch), lockAware(lockAware) {} + prefix(subspace), active(prefix.get("ac"_sr)), pauseKey(prefix.pack("pause"_sr)), available(prefix.get("av"_sr)), + available_prioritized(prefix.get("avp"_sr)), timeouts(prefix.get("to"_sr)), + timeout(CLIENT_KNOBS->TASKBUCKET_TIMEOUT_VERSIONS), system_access(sysAccess), priority_batch(priorityBatch), + lockAware(lockAware) {} TaskBucket::~TaskBucket() {} @@ -919,9 +919,7 @@ Key TaskBucket::addTask(Reference tr, Reference for (auto& param : task->params) tr->set(taskSpace.pack(param.key), param.value); - tr->atomicOp(prefix.pack(LiteralStringRef("task_count")), - LiteralStringRef("\x01\x00\x00\x00\x00\x00\x00\x00"), - MutationRef::AddValue); + tr->atomicOp(prefix.pack("task_count"_sr), "\x01\x00\x00\x00\x00\x00\x00\x00"_sr, MutationRef::AddValue); return key; } @@ -995,9 +993,7 @@ Future TaskBucket::finish(Reference tr, Referen Tuple t = Tuple::makeTuple(task->timeoutVersion, task->key); - tr->atomicOp(prefix.pack(LiteralStringRef("task_count")), - LiteralStringRef("\xff\xff\xff\xff\xff\xff\xff\xff"), - MutationRef::AddValue); + tr->atomicOp(prefix.pack("task_count"_sr), "\xff\xff\xff\xff\xff\xff\xff\xff"_sr, MutationRef::AddValue); tr->clear(timeouts.range(t)); return Void(); @@ -1028,7 +1024,7 @@ Future TaskBucket::getTaskCount(Reference tr } Future TaskBucket::watchTaskCount(Reference tr) { - 
return tr->watch(prefix.pack(LiteralStringRef("task_count"))); + return tr->watch(prefix.pack("task_count"_sr)); } Future TaskBucket::debugPrintRange(Reference tr, Subspace subspace, Key msg) { @@ -1103,7 +1099,7 @@ public: Key key = StringRef(deterministicRandom()->randomUniqueID().toString()); taskFuture->addBlock(tr, key); auto task = makeReference(); - task->params[Task::reservedTaskParamKeyType] = LiteralStringRef("UnblockFuture"); + task->params[Task::reservedTaskParamKeyType] = "UnblockFuture"_sr; task->params[Task::reservedTaskParamKeyFuture] = taskFuture->key; task->params[Task::reservedTaskParamKeyBlockID] = key; onSetFutures.push_back(vectorFuture[i]->onSet(tr, taskBucket, task)); @@ -1217,7 +1213,7 @@ public: taskFuture->futureBucket->setOptions(tr); task->params[Task::reservedTaskParamKeyAddTask] = task->params[Task::reservedTaskParamKeyType]; - task->params[Task::reservedTaskParamKeyType] = LiteralStringRef("AddTask"); + task->params[Task::reservedTaskParamKeyType] = "AddTask"_sr; wait(onSet(tr, taskBucket, taskFuture, task)); return Void(); @@ -1282,14 +1278,14 @@ TaskFuture::TaskFuture(const Reference bucket, Key k) : futureBuck } prefix = futureBucket->prefix.get(key); - blocks = prefix.get(LiteralStringRef("bl")); - callbacks = prefix.get(LiteralStringRef("cb")); + blocks = prefix.get("bl"_sr); + callbacks = prefix.get("cb"_sr); } TaskFuture::~TaskFuture() {} void TaskFuture::addBlock(Reference tr, StringRef block_id) { - tr->set(blocks.pack(block_id), LiteralStringRef("")); + tr->set(blocks.pack(block_id), ""_sr); } Future TaskFuture::set(Reference tr, Reference taskBucket) { diff --git a/fdbclient/Tenant.cpp b/fdbclient/Tenant.cpp index 8ab6ab6701..2ad1989fd0 100644 --- a/fdbclient/Tenant.cpp +++ b/fdbclient/Tenant.cpp @@ -21,19 +21,30 @@ #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/SystemData.h" #include "fdbclient/Tenant.h" +#include "fdbrpc/TenantInfo.h" +#include "flow/BooleanParam.h" +#include "flow/IRandom.h" +#include "libb64/decode.h" #include "libb64/encode.h" +#include "flow/ApiVersion.h" #include "flow/UnitTest.h" +FDB_DEFINE_BOOLEAN_PARAM(EnforceValidTenantId); + Key TenantMapEntry::idToPrefix(int64_t id) { int64_t swapped = bigEndian64(id); - return StringRef(reinterpret_cast(&swapped), 8); + return StringRef(reinterpret_cast(&swapped), TENANT_PREFIX_SIZE); } -int64_t TenantMapEntry::prefixToId(KeyRef prefix) { - ASSERT(prefix.size() == 8); +int64_t TenantMapEntry::prefixToId(KeyRef prefix, EnforceValidTenantId enforceValidTenantId) { + ASSERT(prefix.size() == TENANT_PREFIX_SIZE); int64_t id = *reinterpret_cast(prefix.begin()); id = bigEndian64(id); - ASSERT(id >= 0); + if (enforceValidTenantId) { + ASSERT(id >= 0); + } else if (id < 0) { + return TenantInfo::INVALID_TENANT; + } return id; } @@ -78,6 +89,31 @@ TenantState TenantMapEntry::stringToTenantState(std::string stateStr) { UNREACHABLE(); } +std::string TenantMapEntry::tenantLockStateToString(TenantLockState tenantState) { + switch (tenantState) { + case TenantLockState::UNLOCKED: + return "unlocked"; + case TenantLockState::READ_ONLY: + return "read only"; + case TenantLockState::LOCKED: + return "locked"; + default: + UNREACHABLE(); + } +} + +TenantLockState TenantMapEntry::stringToTenantLockState(std::string stateStr) { + if (stateStr == "unlocked") { + return TenantLockState::UNLOCKED; + } else if (stateStr == "read only") { + return TenantLockState::READ_ONLY; + } else if (stateStr == "locked") { + return TenantLockState::LOCKED; + } + + UNREACHABLE(); +} + 
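// ---------------------------------------------------------------------------
// Editor's note (illustration, not part of the patch): TenantMapEntry::idToPrefix above stores the
// 64-bit tenant id as an 8-byte big-endian prefix (TENANT_PREFIX_SIZE), which presumably keeps the
// lexicographic order of tenant prefixes consistent with the numeric order of ids. A minimal
// round-trip sketch in plain C++ (names here are illustrative, not the real API):
// ---------------------------------------------------------------------------
#include <cassert>
#include <cstdint>
#include <string>

static std::string idToPrefixSketch(int64_t id) {
    std::string prefix(8, '\0');
    for (int i = 0; i < 8; ++i) // most significant byte first
        prefix[i] = char((uint64_t(id) >> (56 - 8 * i)) & 0xff);
    return prefix;
}

static int64_t prefixToIdSketch(const std::string& prefix) {
    assert(prefix.size() == 8);
    uint64_t id = 0;
    for (int i = 0; i < 8; ++i)
        id = (id << 8) | uint8_t(prefix[i]);
    return int64_t(id);
}

int main() {
    assert(prefixToIdSketch(idToPrefixSketch(42)) == 42);
    assert(idToPrefixSketch(41) < idToPrefixSketch(42)); // numeric order matches key order
    return 0;
}
// ---------------------------------------------------------------------------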
TenantMapEntry::TenantMapEntry() {} TenantMapEntry::TenantMapEntry(int64_t id, TenantState tenantState, bool encrypted) : tenantState(tenantState), encrypted(encrypted) { @@ -97,24 +133,19 @@ void TenantMapEntry::setId(int64_t id) { prefix = idToPrefix(id); } -std::string TenantMapEntry::toJson(int apiVersion) const { +std::string TenantMapEntry::toJson() const { json_spirit::mObject tenantEntry; tenantEntry["id"] = id; tenantEntry["encrypted"] = encrypted; - if (apiVersion >= 720 || apiVersion == Database::API_VERSION_LATEST) { - json_spirit::mObject prefixObject; - std::string encodedPrefix = base64::encoder::from_string(prefix.toString()); - // Remove trailing newline - encodedPrefix.resize(encodedPrefix.size() - 1); + json_spirit::mObject prefixObject; + std::string encodedPrefix = base64::encoder::from_string(prefix.toString()); + // Remove trailing newline + encodedPrefix.resize(encodedPrefix.size() - 1); - prefixObject["base64"] = encodedPrefix; - prefixObject["printable"] = printable(prefix); - tenantEntry["prefix"] = prefixObject; - } else { - // This is not a standard encoding in JSON, and some libraries may not be able to easily decode it - tenantEntry["prefix"] = prefix.toString(); - } + prefixObject["base64"] = encodedPrefix; + prefixObject["printable"] = printable(prefix); + tenantEntry["prefix"] = prefixObject; tenantEntry["tenant_state"] = TenantMapEntry::tenantStateToString(tenantState); if (assignedCluster.present()) { @@ -141,12 +172,23 @@ bool TenantMapEntry::matchesConfiguration(TenantMapEntry const& other) const { void TenantMapEntry::configure(Standalone parameter, Optional value) { if (parameter == "tenant_group"_sr) { tenantGroup = value; + } else if (parameter == "assigned_cluster"_sr) { + assignedCluster = value; } else { TraceEvent(SevWarnAlways, "UnknownTenantConfigurationParameter").detail("Parameter", parameter); throw invalid_tenant_configuration(); } } +json_spirit::mObject TenantGroupEntry::toJson() const { + json_spirit::mObject tenantGroupEntry; + if (assignedCluster.present()) { + tenantGroupEntry["assigned_cluster"] = assignedCluster.get().toString(); + } + + return tenantGroupEntry; +} + TenantMetadataSpecification& TenantMetadata::instance() { static TenantMetadataSpecification _instance = TenantMetadataSpecification("\xff/"_sr); return _instance; @@ -157,6 +199,24 @@ Key TenantMetadata::tenantMapPrivatePrefix() { return _prefix; } +TEST_CASE("/fdbclient/libb64/base64decoder") { + Standalone buf = makeString(100); + for (int i = 0; i < 1000; ++i) { + int length = deterministicRandom()->randomInt(0, 100); + deterministicRandom()->randomBytes(mutateString(buf), length); + + StringRef str = buf.substr(0, length); + std::string encodedStr = base64::encoder::from_string(str.toString()); + // Remove trailing newline + encodedStr.resize(encodedStr.size() - 1); + + std::string decodedStr = base64::decoder::from_string(encodedStr); + ASSERT(decodedStr == str.toString()); + } + + return Void(); +} + TEST_CASE("/fdbclient/TenantMapEntry/Serialization") { TenantMapEntry entry1(1, TenantState::READY, false); ASSERT(entry1.prefix == "\x00\x00\x00\x00\x00\x00\x00\x01"_sr); diff --git a/fdbclient/TenantSpecialKeys.cpp b/fdbclient/TenantSpecialKeys.cpp deleted file mode 100644 index 5570816684..0000000000 --- a/fdbclient/TenantSpecialKeys.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * TenantSpecialKeys.cpp - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2022 Apple Inc. 
and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "fdbclient/TenantSpecialKeys.actor.h" - -template <> -const KeyRangeRef TenantRangeImpl::submoduleRange = KeyRangeRef("tenant/"_sr, "tenant0"_sr); - -template <> -const KeyRangeRef TenantRangeImpl::mapSubRange = KeyRangeRef("map/"_sr, "map0"_sr); - -template <> -const KeyRangeRef TenantRangeImpl::submoduleRange = KeyRangeRef(""_sr, "\xff"_sr); - -template <> -const KeyRangeRef TenantRangeImpl::mapSubRange = KeyRangeRef("tenant_map/"_sr, "tenant_map0"_sr); - -template <> -bool TenantRangeImpl::subRangeIntersects(KeyRangeRef subRange, KeyRangeRef range) { - return subRange.intersects(range); -} - -template <> -bool TenantRangeImpl::subRangeIntersects(KeyRangeRef subRange, KeyRangeRef range) { - return subRange == mapSubRange; -} \ No newline at end of file diff --git a/fdbclient/ThreadSafeTransaction.cpp b/fdbclient/ThreadSafeTransaction.cpp index 7684980c7b..eec04e46e3 100644 --- a/fdbclient/ThreadSafeTransaction.cpp +++ b/fdbclient/ThreadSafeTransaction.cpp @@ -55,7 +55,7 @@ Reference ThreadSafeDatabase::openTenant(TenantNameRef tenantName) { } Reference ThreadSafeDatabase::createTransaction() { - auto type = isConfigDB ? ISingleThreadTransaction::Type::SIMPLE_CONFIG : ISingleThreadTransaction::Type::RYW; + auto type = isConfigDB ? 
ISingleThreadTransaction::Type::PAXOS_CONFIG : ISingleThreadTransaction::Type::RYW; return Reference(new ThreadSafeTransaction(db, type, Optional())); } @@ -88,6 +88,7 @@ ThreadFuture ThreadSafeDatabase::rebootWorker(const StringRef& address, DatabaseContext* db = this->db; Key addressKey = address; return onMainThread([db, addressKey, check, duration]() -> Future { + db->checkDeferredError(); return db->rebootWorker(addressKey, check, duration); }); } @@ -95,14 +96,20 @@ ThreadFuture ThreadSafeDatabase::rebootWorker(const StringRef& address, ThreadFuture ThreadSafeDatabase::forceRecoveryWithDataLoss(const StringRef& dcid) { DatabaseContext* db = this->db; Key dcidKey = dcid; - return onMainThread([db, dcidKey]() -> Future { return db->forceRecoveryWithDataLoss(dcidKey); }); + return onMainThread([db, dcidKey]() -> Future { + db->checkDeferredError(); + return db->forceRecoveryWithDataLoss(dcidKey); + }); } ThreadFuture ThreadSafeDatabase::createSnapshot(const StringRef& uid, const StringRef& snapshot_command) { DatabaseContext* db = this->db; Key snapUID = uid; Key cmd = snapshot_command; - return onMainThread([db, snapUID, cmd]() -> Future { return db->createSnapshot(snapUID, cmd); }); + return onMainThread([db, snapUID, cmd]() -> Future { + db->checkDeferredError(); + return db->createSnapshot(snapUID, cmd); + }); } ThreadFuture ThreadSafeDatabase::createSharedState() { @@ -126,14 +133,17 @@ double ThreadSafeDatabase::getMainThreadBusyness() { // Note: this will never return if the server is running a protocol from FDB 5.0 or older ThreadFuture ThreadSafeDatabase::getServerProtocol(Optional expectedVersion) { DatabaseContext* db = this->db; - return onMainThread( - [db, expectedVersion]() -> Future { return db->getClusterProtocol(expectedVersion); }); + return onMainThread([db, expectedVersion]() -> Future { + db->checkDeferredError(); + return db->getClusterProtocol(expectedVersion); + }); } ThreadFuture ThreadSafeDatabase::purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) { DatabaseContext* db = this->db; KeyRange range = keyRange; return onMainThread([db, range, purgeVersion, force]() -> Future { + db->checkDeferredError(); return db->purgeBlobGranules(range, purgeVersion, {}, force); }); } @@ -141,33 +151,47 @@ ThreadFuture ThreadSafeDatabase::purgeBlobGranules(const KeyRangeRef& keyRa ThreadFuture ThreadSafeDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) { DatabaseContext* db = this->db; Key key = purgeKey; - return onMainThread([db, key]() -> Future { return db->waitPurgeGranulesComplete(key); }); + return onMainThread([db, key]() -> Future { + db->checkDeferredError(); + return db->waitPurgeGranulesComplete(key); + }); } ThreadFuture ThreadSafeDatabase::blobbifyRange(const KeyRangeRef& keyRange) { DatabaseContext* db = this->db; KeyRange range = keyRange; - return onMainThread([=]() -> Future { return db->blobbifyRange(range); }); + return onMainThread([=]() -> Future { + db->checkDeferredError(); + return db->blobbifyRange(range); + }); } ThreadFuture ThreadSafeDatabase::unblobbifyRange(const KeyRangeRef& keyRange) { DatabaseContext* db = this->db; KeyRange range = keyRange; - return onMainThread([=]() -> Future { return db->blobbifyRange(range); }); + return onMainThread([=]() -> Future { + db->checkDeferredError(); + return db->unblobbifyRange(range); + }); } ThreadFuture>> ThreadSafeDatabase::listBlobbifiedRanges(const KeyRangeRef& keyRange, int rangeLimit) { DatabaseContext* db = this->db; KeyRange range = keyRange; - return 
onMainThread( - [=]() -> Future>> { return db->listBlobbifiedRanges(range, rangeLimit); }); + return onMainThread([=]() -> Future>> { + db->checkDeferredError(); + return db->listBlobbifiedRanges(range, rangeLimit); + }); } ThreadFuture ThreadSafeDatabase::verifyBlobRange(const KeyRangeRef& keyRange, Optional version) { DatabaseContext* db = this->db; KeyRange range = keyRange; - return onMainThread([=]() -> Future { return db->verifyBlobRange(range, version); }); + return onMainThread([=]() -> Future { + db->checkDeferredError(); + return db->verifyBlobRange(range, version); + }); } ThreadSafeDatabase::ThreadSafeDatabase(ConnectionRecordType connectionRecordType, @@ -200,7 +224,7 @@ ThreadSafeDatabase::~ThreadSafeDatabase() { } Reference ThreadSafeTenant::createTransaction() { - auto type = db->isConfigDB ? ISingleThreadTransaction::Type::SIMPLE_CONFIG : ISingleThreadTransaction::Type::RYW; + auto type = db->isConfigDB ? ISingleThreadTransaction::Type::PAXOS_CONFIG : ISingleThreadTransaction::Type::RYW; return Reference(new ThreadSafeTransaction(db->db, type, name)); } @@ -216,7 +240,10 @@ ThreadFuture ThreadSafeTenant::purgeBlobGranules(const KeyRangeRef& keyRang ThreadFuture ThreadSafeTenant::waitPurgeGranulesComplete(const KeyRef& purgeKey) { DatabaseContext* db = this->db->db; Key key = purgeKey; - return onMainThread([db, key]() -> Future { return db->waitPurgeGranulesComplete(key); }); + return onMainThread([db, key]() -> Future { + db->checkDeferredError(); + return db->waitPurgeGranulesComplete(key); + }); } ThreadSafeTenant::~ThreadSafeTenant() {} @@ -400,34 +427,53 @@ ThreadResult ThreadSafeTransaction::readBlobGranules(const KeyRange Version beginVersion, Optional readVersion, ReadBlobGranuleContext granule_context) { - // FIXME: prevent from calling this from another main thread! + // This should not be called directly, bypassMultiversionApi should not be set + return ThreadResult(unsupported_operation()); +} +ThreadFuture>> ThreadSafeTransaction::readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) { ISingleThreadTransaction* tr = this->tr; KeyRange r = keyRange; - int64_t readVersionOut; - ThreadFuture>> getFilesFuture = onMainThread( - [tr, r, beginVersion, readVersion, &readVersionOut]() -> Future>> { + return onMainThread( + [tr, r, beginVersion, readVersion, readVersionOut]() -> Future>> { tr->checkDeferredError(); - return tr->readBlobGranules(r, beginVersion, readVersion, &readVersionOut); + return tr->readBlobGranules(r, beginVersion, readVersion, readVersionOut); }); +} - // FIXME: can this safely avoid another main thread jump? - getFilesFuture.blockUntilReadyCheckOnMainThread(); - - // propagate error to client - if (getFilesFuture.isError()) { - return ThreadResult(getFilesFuture.getError()); - } - - Standalone> files = getFilesFuture.get(); - +ThreadResult ThreadSafeTransaction::readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) { // do this work off of fdb network threads for performance! 
- if (granule_context.debugNoMaterialize) { - return ThreadResult(blob_granule_not_materialized()); - } else { - return loadAndMaterializeBlobGranules(files, keyRange, beginVersion, readVersionOut, granule_context); + Standalone> files = startFuture.get(); + GranuleMaterializeStats stats; + auto ret = loadAndMaterializeBlobGranules(files, keyRange, beginVersion, readVersion, granuleContext, stats); + if (!ret.isError()) { + ISingleThreadTransaction* tr = this->tr; + onMainThreadVoid([tr, stats]() { tr->addGranuleMaterializeStats(stats); }); } + return ret; +} + +ThreadFuture>> ThreadSafeTransaction::summarizeBlobGranules( + const KeyRangeRef& keyRange, + Optional summaryVersion, + int rangeLimit) { + ISingleThreadTransaction* tr = this->tr; + KeyRange r = keyRange; + + return onMainThread([=]() -> Future>> { + tr->checkDeferredError(); + return tr->summarizeBlobGranules(r, summaryVersion, rangeLimit); + }); } void ThreadSafeTransaction::addReadConflictRange(const KeyRangeRef& keys) { @@ -525,22 +571,34 @@ Version ThreadSafeTransaction::getCommittedVersion() { ThreadFuture ThreadSafeTransaction::getVersionVector() { ISingleThreadTransaction* tr = this->tr; - return onMainThread([tr]() -> Future { return tr->getVersionVector(); }); + return onMainThread([tr]() -> Future { + tr->checkDeferredError(); + return tr->getVersionVector(); + }); } ThreadFuture ThreadSafeTransaction::getSpanContext() { ISingleThreadTransaction* tr = this->tr; - return onMainThread([tr]() -> Future { return tr->getSpanContext(); }); + return onMainThread([tr]() -> Future { + tr->checkDeferredError(); + return tr->getSpanContext(); + }); } ThreadFuture ThreadSafeTransaction::getApproximateSize() { ISingleThreadTransaction* tr = this->tr; - return onMainThread([tr]() -> Future { return tr->getApproximateSize(); }); + return onMainThread([tr]() -> Future { + tr->checkDeferredError(); + return tr->getApproximateSize(); + }); } ThreadFuture> ThreadSafeTransaction::getVersionstamp() { ISingleThreadTransaction* tr = this->tr; - return onMainThread([tr]() -> Future> { return tr->getVersionstamp(); }); + return onMainThread([tr]() -> Future> { + tr->checkDeferredError(); + return tr->getVersionstamp(); + }); } void ThreadSafeTransaction::setOption(FDBTransactionOptions::Option option, Optional value) { @@ -602,7 +660,7 @@ extern const char* getSourceVersion(); ThreadSafeApi::ThreadSafeApi() : apiVersion(-1), transportId(0) {} void ThreadSafeApi::selectApiVersion(int apiVersion) { - this->apiVersion = apiVersion; + this->apiVersion = ApiVersion(apiVersion); } const char* ThreadSafeApi::getClientVersion() { @@ -674,12 +732,12 @@ void ThreadSafeApi::stopNetwork() { Reference ThreadSafeApi::createDatabase(const char* clusterFilePath) { return Reference( - new ThreadSafeDatabase(ThreadSafeDatabase::ConnectionRecordType::FILE, clusterFilePath, apiVersion)); + new ThreadSafeDatabase(ThreadSafeDatabase::ConnectionRecordType::FILE, clusterFilePath, apiVersion.version())); } Reference ThreadSafeApi::createDatabaseFromConnectionString(const char* connectionString) { return Reference(new ThreadSafeDatabase( - ThreadSafeDatabase::ConnectionRecordType::CONNECTION_STRING, connectionString, apiVersion)); + ThreadSafeDatabase::ConnectionRecordType::CONNECTION_STRING, connectionString, apiVersion.version())); } void ThreadSafeApi::addNetworkThreadCompletionHook(void (*hook)(void*), void* hookParameter) { diff --git a/fdbclient/Tracing.actor.cpp b/fdbclient/Tracing.actor.cpp index 815e568acb..3100b208fc 100644 --- a/fdbclient/Tracing.actor.cpp 
+++ b/fdbclient/Tracing.actor.cpp @@ -497,7 +497,7 @@ TEST_CASE("/flow/Tracing/AddEvents") { auto arena = span1.arena; SmallVectorRef attrs; attrs.push_back(arena, KeyValueRef("foo"_sr, "bar"_sr)); - span1.addEvent(LiteralStringRef("read_version"), 1.0, attrs); + span1.addEvent("read_version"_sr, 1.0, attrs); ASSERT(span1.events[0].name.toString() == "read_version"); ASSERT(span1.events[0].time == 1.0); ASSERT(span1.events[0].attributes.begin()->key.toString() == "foo"); @@ -505,7 +505,7 @@ TEST_CASE("/flow/Tracing/AddEvents") { // Use helper method to add an OTELEventRef with no attributes to an OTELSpan Span span2("span_with_event"_loc); - span2.addEvent(StringRef(span2.arena, LiteralStringRef("commit_succeed")), 1234567.100); + span2.addEvent(StringRef(span2.arena, "commit_succeed"_sr), 1234567.100); ASSERT(span2.events[0].name.toString() == "commit_succeed"); ASSERT(span2.events[0].time == 1234567.100); ASSERT(span2.events[0].attributes.size() == 0); @@ -537,8 +537,8 @@ TEST_CASE("/flow/Tracing/AddAttributes") { IKnobCollection::getMutableGlobalKnobCollection().setKnob("tracing_span_attributes_enabled", KnobValueRef::create(bool{ true })); auto arena = span1.arena; - span1.addAttribute(StringRef(arena, LiteralStringRef("foo")), StringRef(arena, LiteralStringRef("bar"))); - span1.addAttribute(StringRef(arena, LiteralStringRef("operation")), StringRef(arena, LiteralStringRef("grv"))); + span1.addAttribute(StringRef(arena, "foo"_sr), StringRef(arena, "bar"_sr)); + span1.addAttribute(StringRef(arena, "operation"_sr), StringRef(arena, "grv"_sr)); ASSERT_EQ(span1.attributes.size(), 3); // Includes default attribute of "address" ASSERT(span1.attributes[1] == KeyValueRef("foo"_sr, "bar"_sr)); ASSERT(span1.attributes[2] == KeyValueRef("operation"_sr, "grv"_sr)); @@ -548,9 +548,9 @@ TEST_CASE("/flow/Tracing/AddAttributes") { deterministicRandom()->randomUInt64(), TraceFlags::sampled)); auto s2Arena = span2.arena; - span2.addAttribute(StringRef(s2Arena, LiteralStringRef("a")), StringRef(s2Arena, LiteralStringRef("1"))) - .addAttribute(StringRef(s2Arena, LiteralStringRef("b")), LiteralStringRef("2")) - .addAttribute(StringRef(s2Arena, LiteralStringRef("c")), LiteralStringRef("3")); + span2.addAttribute(StringRef(s2Arena, "a"_sr), StringRef(s2Arena, "1"_sr)) + .addAttribute(StringRef(s2Arena, "b"_sr), "2"_sr) + .addAttribute(StringRef(s2Arena, "c"_sr), "3"_sr); ASSERT_EQ(span2.attributes.size(), 4); // Includes default attribute of "address" ASSERT(span2.attributes[1] == KeyValueRef("a"_sr, "1"_sr)); @@ -718,7 +718,7 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") { attrs.push_back(s3Arena, KeyValueRef("foo"_sr, "bar"_sr)); span3.addAttribute("operation"_sr, "grv"_sr) .addLink(SpanContext(UID(300, 301), 400, TraceFlags::sampled)) - .addEvent(StringRef(s3Arena, LiteralStringRef("event1")), 100.101, attrs); + .addEvent(StringRef(s3Arena, "event1"_sr), 100.101, attrs); tracer.serialize_span(span3, request); data = request.buffer.get(); ASSERT(data[0] == 0b10011100); // 12 element array. diff --git a/fdbclient/Tuple.cpp b/fdbclient/Tuple.cpp index d3c3416b88..042793a757 100644 --- a/fdbclient/Tuple.cpp +++ b/fdbclient/Tuple.cpp @@ -22,6 +22,8 @@ #include "flow/UnitTest.h" const uint8_t VERSIONSTAMP_96_CODE = 0x33; +const uint8_t USER_TYPE_START = 0x40; +const uint8_t USER_TYPE_END = 0x4f; // TODO: Many functions copied from bindings/flow/Tuple.cpp. Merge at some point. 
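// ---------------------------------------------------------------------------
// Editor's note (illustration, not part of the patch): USER_TYPE_START/USER_TYPE_END above reserve
// type codes 0x40-0x4f for user-defined tuple elements. As the hunk below notes, these elements
// carry no length or terminator, so they are only decodable as the final element of a tuple: once
// the decoder sees such a code it has to treat the rest of the buffer as the user payload. A small
// self-contained sketch of that decoding rule (plain C++, not the real Tuple API):
// ---------------------------------------------------------------------------
#include <cassert>
#include <cstdint>
#include <string>
#include <utility>

static bool isUserTypeCodeSketch(uint8_t code) {
    return code >= 0x40 && code <= 0x4f; // USER_TYPE_START .. USER_TYPE_END
}

// Decode the trailing user-defined element: the byte at 'offset' is the code,
// everything after it is the (undelimited) payload.
static std::pair<uint8_t, std::string> decodeTrailingUserType(const std::string& packed, size_t offset) {
    uint8_t code = uint8_t(packed[offset]);
    assert(isUserTypeCodeSketch(code));
    return { code, packed.substr(offset + 1) };
}

int main() {
    // Two bytes standing in for an earlier, fixed-size element, then user code 0x41 and its payload.
    std::string packed = std::string("\x15\x01", 2) + char(0x41) + "12345678";
    auto [code, payload] = decodeTrailingUserType(packed, 2);
    assert(code == 0x41 && payload == "12345678");
    return 0;
}
// ---------------------------------------------------------------------------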
static float bigEndianFloat(float orig) { @@ -59,7 +61,7 @@ static void adjustFloatingPoint(uint8_t* bytes, size_t size, bool encode) { } } -Tuple::Tuple(StringRef const& str, bool exclude_incomplete) { +Tuple::Tuple(StringRef const& str, bool exclude_incomplete, bool include_user_type) { data.append(data.arena(), str.begin(), str.size()); size_t i = 0; @@ -80,6 +82,9 @@ Tuple::Tuple(StringRef const& str, bool exclude_incomplete) { i += 1; } else if (data[i] == VERSIONSTAMP_96_CODE) { i += VERSIONSTAMP_TUPLE_SIZE + 1; + } else if (include_user_type && isUserType(data[i])) { + // User defined codes must come at the end of a Tuple and are not delimited. + i = data.size(); } else { throw invalid_tuple_data_type(); } @@ -94,6 +99,14 @@ Tuple Tuple::unpack(StringRef const& str, bool exclude_incomplete) { return Tuple(str, exclude_incomplete); } +Tuple Tuple::unpackUserType(StringRef const& str, bool exclude_incomplete) { + return Tuple(str, exclude_incomplete, true); +} + +bool Tuple::isUserType(uint8_t code) const { + return code >= USER_TYPE_START && code <= USER_TYPE_END; +} + Tuple& Tuple::append(Tuple const& tuple) { for (size_t offset : tuple.offsets) { offsets.push_back(offset + data.size()); @@ -104,7 +117,7 @@ Tuple& Tuple::append(Tuple const& tuple) { return *this; } -Tuple& Tuple::append(Versionstamp const& vs) { +Tuple& Tuple::append(TupleVersionstamp const& vs) { offsets.push_back(data.size()); data.push_back(data.arena(), VERSIONSTAMP_96_CODE); @@ -208,7 +221,7 @@ Tuple& Tuple::append(double value) { return *this; } -Tuple& Tuple::append(nullptr_t) { +Tuple& Tuple::append(std::nullptr_t) { offsets.push_back(data.size()); data.push_back(data.arena(), (uint8_t)'\x00'); return *this; @@ -218,6 +231,15 @@ Tuple& Tuple::appendNull() { return append(nullptr); } +Tuple& Tuple::append(Tuple::UserTypeStr const& udt) { + offsets.push_back(data.size()); + ASSERT(isUserType(udt.code)); + data.push_back(data.arena(), udt.code); + data.append(data.arena(), udt.str.begin(), udt.str.size()); + + return *this; +} + Tuple::ElementType Tuple::getType(size_t index) const { if (index >= offsets.size()) { throw invalid_tuple_index(); @@ -241,6 +263,8 @@ Tuple::ElementType Tuple::getType(size_t index) const { return ElementType::BOOL; } else if (code == VERSIONSTAMP_96_CODE) { return ElementType::VERSIONSTAMP; + } else if (isUserType(code)) { + return ElementType::USER_TYPE; } else { throw invalid_tuple_data_type(); } @@ -389,7 +413,7 @@ double Tuple::getDouble(size_t index) const { return bigEndianDouble(swap); } -Versionstamp Tuple::getVersionstamp(size_t index) const { +TupleVersionstamp Tuple::getVersionstamp(size_t index) const { if (index >= offsets.size()) { throw invalid_tuple_index(); } @@ -398,7 +422,30 @@ Versionstamp Tuple::getVersionstamp(size_t index) const { if (code != VERSIONSTAMP_96_CODE) { throw invalid_tuple_data_type(); } - return Versionstamp(StringRef(data.begin() + offsets[index] + 1, VERSIONSTAMP_TUPLE_SIZE)); + return TupleVersionstamp(StringRef(data.begin() + offsets[index] + 1, VERSIONSTAMP_TUPLE_SIZE)); +} + +Tuple::UserTypeStr Tuple::getUserType(size_t index) const { + // Valid index. + if (index >= offsets.size()) { + throw invalid_tuple_index(); + } + + // Valid user type code. 
+ ASSERT_LT(offsets[index], data.size()); + uint8_t code = data[offsets[index]]; + if (!isUserType(code)) { + throw invalid_tuple_data_type(); + } + + size_t start = offsets[index] + 1; + + Standalone str; + VectorRef staging; + staging.append(str.arena(), data.begin() + start, data.size() - start); + str.StringRef::operator=(StringRef(staging.begin(), staging.size())); + + return Tuple::UserTypeStr(code, str); } KeyRange Tuple::range(Tuple const& tuple) const { @@ -440,9 +487,16 @@ StringRef Tuple::subTupleRawString(size_t index) const { return StringRef(data.begin() + offsets[index], endPos - offsets[index]); } -TEST_CASE("fdbclient/Tuple/makeTuple") { - Tuple t1 = Tuple::makeTuple( - 1, 1.0f, 1.0, false, "byteStr"_sr, Tuple::UnicodeStr("str"_sr), nullptr, Versionstamp("000000000000"_sr)); +TEST_CASE("/fdbclient/Tuple/makeTuple") { + Tuple t1 = Tuple::makeTuple(1, + 1.0f, + 1.0, + false, + "byteStr"_sr, + Tuple::UnicodeStr("str"_sr), + nullptr, + TupleVersionstamp("000000000000"_sr), + Tuple::UserTypeStr(0x41, "12345678"_sr)); Tuple t2 = Tuple() .append(1) .append(1.0f) @@ -451,7 +505,8 @@ TEST_CASE("fdbclient/Tuple/makeTuple") { .append("byteStr"_sr) .append(Tuple::UnicodeStr("str"_sr)) .append(nullptr) - .append(Versionstamp("000000000000"_sr)); + .append(TupleVersionstamp("000000000000"_sr)) + .append(Tuple::UserTypeStr(0x41, "12345678"_sr)); ASSERT(t1.pack() == t2.pack()); ASSERT(t1.getType(0) == Tuple::INT); @@ -462,7 +517,45 @@ TEST_CASE("fdbclient/Tuple/makeTuple") { ASSERT(t1.getType(5) == Tuple::UTF8); ASSERT(t1.getType(6) == Tuple::NULL_TYPE); ASSERT(t1.getType(7) == Tuple::VERSIONSTAMP); - ASSERT(t1.size() == 8); + ASSERT(t1.getType(8) == Tuple::USER_TYPE); + ASSERT(t1.size() == 9); + + return Void(); +} + +TEST_CASE("/fdbclient/Tuple/unpack") { + Tuple t1 = Tuple::makeTuple(1, + 1.0f, + 1.0, + false, + "byteStr"_sr, + Tuple::UnicodeStr("str"_sr), + nullptr, + TupleVersionstamp("000000000000"_sr), + Tuple::UserTypeStr(0x41, "12345678"_sr)); + + Standalone packed = t1.pack(); + Tuple t2 = Tuple::unpackUserType(packed); + ASSERT(t2.pack() == t1.pack()); + ASSERT(t2.getInt(0) == t1.getInt(0)); + ASSERT(t2.getFloat(1) == t1.getFloat(1)); + ASSERT(t2.getDouble(2) == t1.getDouble(2)); + ASSERT(t2.getBool(3) == t1.getBool(3)); + ASSERT(t2.getString(4) == t1.getString(4)); + ASSERT(t2.getString(5) == t1.getString(5)); + ASSERT(t2.getType(6) == Tuple::NULL_TYPE); + ASSERT(t2.getVersionstamp(7) == t1.getVersionstamp(7)); + ASSERT(t2.getUserType(8) == t1.getUserType(8)); + ASSERT(t2.size() == 9); + + try { + Tuple t3 = Tuple::unpack(packed); + ASSERT(false); + } catch (Error& e) { + if (e.code() != error_code_invalid_tuple_data_type) { + throw e; + } + } return Void(); } diff --git a/fdbclient/Versionstamp.cpp b/fdbclient/TupleVersionstamp.cpp similarity index 64% rename from fdbclient/Versionstamp.cpp rename to fdbclient/TupleVersionstamp.cpp index 7a61936040..91e0182f4f 100644 --- a/fdbclient/Versionstamp.cpp +++ b/fdbclient/TupleVersionstamp.cpp @@ -1,13 +1,13 @@ -#include "fdbclient/Versionstamp.h" +#include "fdbclient/TupleVersionstamp.h" -Versionstamp::Versionstamp(StringRef str) { +TupleVersionstamp::TupleVersionstamp(StringRef str) { if (str.size() != VERSIONSTAMP_TUPLE_SIZE) { throw invalid_versionstamp_size(); } data = str; } -int16_t Versionstamp::getBatchNumber() const { +int16_t TupleVersionstamp::getBatchNumber() const { const uint8_t* begin = data.begin(); begin += 8; int16_t batchNumber = *(int16_t*)(begin); @@ -15,7 +15,7 @@ int16_t Versionstamp::getBatchNumber() const { 
return batchNumber; } -int16_t Versionstamp::getUserVersion() const { +int16_t TupleVersionstamp::getUserVersion() const { const uint8_t* begin = data.begin(); begin += 10; int16_t userVersion = *(int16_t*)(begin); @@ -23,22 +23,22 @@ int16_t Versionstamp::getUserVersion() const { return userVersion; } -const uint8_t* Versionstamp::begin() const { +const uint8_t* TupleVersionstamp::begin() const { return data.begin(); } -int64_t Versionstamp::getVersion() const { +int64_t TupleVersionstamp::getVersion() const { const uint8_t* begin = data.begin(); int64_t version = *(int64_t*)begin; version = bigEndian64(version); return version; } -size_t Versionstamp::size() const { +size_t TupleVersionstamp::size() const { return VERSIONSTAMP_TUPLE_SIZE; } -bool Versionstamp::operator==(const Versionstamp& other) const { +bool TupleVersionstamp::operator==(const TupleVersionstamp& other) const { return getVersion() == other.getVersion() && getBatchNumber() == other.getBatchNumber() && getUserVersion() == other.getUserVersion(); } \ No newline at end of file diff --git a/fdbclient/include/fdbclient/Atomic.h b/fdbclient/include/fdbclient/Atomic.h index 1d19150b04..61f948e38f 100644 --- a/fdbclient/include/fdbclient/Atomic.h +++ b/fdbclient/include/fdbclient/Atomic.h @@ -120,7 +120,7 @@ inline ValueRef doAppendIfFits(const Optional& existingValueOptional, if (!otherOperand.size()) return existingValue; if (existingValue.size() + otherOperand.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT) { - CODE_PROBE(true, "AppendIfFIts resulted in truncation"); + CODE_PROBE(true, "AppendIfFits resulted in truncation"); return existingValue; } diff --git a/fdbclient/include/fdbclient/BackupAgent.actor.h b/fdbclient/include/fdbclient/BackupAgent.actor.h index e0eb69ede6..314f151fd0 100644 --- a/fdbclient/include/fdbclient/BackupAgent.actor.h +++ b/fdbclient/include/fdbclient/BackupAgent.actor.h @@ -143,7 +143,7 @@ public: futureBucket = std::move(r.futureBucket); } - KeyBackedProperty lastBackupTimestamp() { return config.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty lastBackupTimestamp() { return config.pack(__FUNCTION__sr); } Future run(Database cx, double pollDelay, int maxConcurrentTasks) { return taskBucket->run(cx, futureBucket, std::make_shared(pollDelay), maxConcurrentTasks); @@ -208,33 +208,14 @@ public: WaitForComplete waitForComplete = WaitForComplete::True, Version targetVersion = ::invalidVersion, Verbose verbose = Verbose::True, - KeyRange range = normalKeys, + KeyRange range = KeyRange(), Key addPrefix = Key(), Key removePrefix = Key(), LockDB lockDB = LockDB::True, OnlyApplyMutationLogs onlyApplyMutationLogs = OnlyApplyMutationLogs::False, InconsistentSnapshotOnly inconsistentSnapshotOnly = InconsistentSnapshotOnly::False, Version beginVersion = ::invalidVersion, - Optional const& encryptionKeyFileName = {}) { - Standalone> rangeRef; - rangeRef.push_back_deep(rangeRef.arena(), range); - return restore(cx, - cxOrig, - tagName, - url, - proxy, - rangeRef, - waitForComplete, - targetVersion, - verbose, - addPrefix, - removePrefix, - lockDB, - onlyApplyMutationLogs, - inconsistentSnapshotOnly, - beginVersion, - encryptionKeyFileName); - } + Optional const& encryptionKeyFileName = {}); Future atomicRestore(Database cx, Key tagName, Standalone> ranges, @@ -242,13 +223,10 @@ public: Key removePrefix = Key()); Future atomicRestore(Database cx, Key tagName, - KeyRange range = normalKeys, + KeyRange range = KeyRange(), Key addPrefix = Key(), - Key removePrefix = Key()) { - Standalone> rangeRef; - 
rangeRef.push_back_deep(rangeRef.arena(), range); - return atomicRestore(cx, tagName, rangeRef, addPrefix, removePrefix); - } + Key removePrefix = Key()); + // Tries to abort the restore for a tag. Returns the final (stable) state of the tag. Future abortRestore(Reference tr, Key tagName); Future abortRestore(Database cx, Key tagName); @@ -272,6 +250,7 @@ public: int snapshotIntervalSeconds, std::string const& tagName, Standalone> backupRanges, + bool encryptionEnabled, StopWhenDone = StopWhenDone::True, UsePartitionedLog = UsePartitionedLog::False, IncrementalBackupOnly = IncrementalBackupOnly::False, @@ -283,6 +262,7 @@ public: int snapshotIntervalSeconds, std::string const& tagName, Standalone> backupRanges, + bool encryptionEnabled, StopWhenDone stopWhenDone = StopWhenDone::True, UsePartitionedLog partitionedLog = UsePartitionedLog::False, IncrementalBackupOnly incrementalBackupOnly = IncrementalBackupOnly::False, @@ -295,6 +275,7 @@ public: snapshotIntervalSeconds, tagName, backupRanges, + encryptionEnabled, stopWhenDone, partitionedLog, incrementalBackupOnly, @@ -621,7 +602,7 @@ class TagUidMap : public KeyBackedMap { Snapshot snapshot); public: - TagUidMap(const StringRef& prefix) : TagMap(LiteralStringRef("tag->uid/").withPrefix(prefix)), prefix(prefix) {} + TagUidMap(const StringRef& prefix) : TagMap("tag->uid/"_sr.withPrefix(prefix)), prefix(prefix) {} Future> getAll(Reference tr, Snapshot snapshot = Snapshot::False) { @@ -652,11 +633,11 @@ static inline Future> getAllBackupTags(Reference uid() { return LiteralStringRef(__FUNCTION__); } + static TaskParam uid() { return __FUNCTION__sr; } } TaskParams; KeyBackedConfig(StringRef prefix, UID uid = UID()) - : uid(uid), prefix(prefix), configSpace(uidPrefixKey(LiteralStringRef("uid->config/").withPrefix(prefix), uid)) {} + : uid(uid), prefix(prefix), configSpace(uidPrefixKey("uid->config/"_sr.withPrefix(prefix), uid)) {} KeyBackedConfig(StringRef prefix, Reference task) : KeyBackedConfig(prefix, TaskParams.uid().get(task)) {} @@ -685,7 +666,7 @@ public: }); } - KeyBackedProperty tag() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty tag() { return configSpace.pack(__FUNCTION__sr); } UID getUid() { return uid; } @@ -694,12 +675,10 @@ public: void clear(Reference tr) { tr->clear(configSpace.range()); } // lastError is a pair of error message and timestamp expressed as an int64_t - KeyBackedProperty> lastError() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty> lastError() { return configSpace.pack(__FUNCTION__sr); } KeyBackedMap> lastErrorPerType() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); + return configSpace.pack(__FUNCTION__sr); } // Updates the error per type map and the last error property @@ -788,47 +767,41 @@ public: // Map of range end boundaries to info about the backup file written for that range. typedef KeyBackedMap RangeFileMapT; - RangeFileMapT snapshotRangeFileMap() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + RangeFileMapT snapshotRangeFileMap() { return configSpace.pack(__FUNCTION__sr); } // Number of kv range files that were both committed to persistent storage AND inserted into // the snapshotRangeFileMap. Note that since insertions could replace 1 or more existing // map entries this is not necessarily the number of entries currently in the map. // This value exists to help with sizing of kv range folders for BackupContainers that // require it. 
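An aside on the LiteralStringRef(__FUNCTION__) to __FUNCTION__sr conversions running through this header: assuming __FUNCTION__sr is simply a StringRef-producing wrapper around the enclosing function's name (the plain _sr literal suffix is already used in this patch for "tag->uid/"_sr and "uid->config/"_sr), each accessor keys its KeyBackedProperty by its own name, so the rewrite should not change any stored keys. A minimal sketch of the idiom, with Key used as an illustrative value type:

    // Hedged sketch, not part of the patch: the property key is derived from the
    // enclosing function's name, so, under the assumption above, these two
    // definitions address the same key in configSpace.
    KeyBackedProperty<Key> tag() { return configSpace.pack(__FUNCTION__sr); }
    // ...expected to behave like:
    KeyBackedProperty<Key> tag() { return configSpace.pack("tag"_sr); }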
- KeyBackedBinaryValue snapshotRangeFileCount() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue snapshotRangeFileCount() { return configSpace.pack(__FUNCTION__sr); } // Coalesced set of ranges already dispatched for writing. typedef KeyBackedMap RangeDispatchMapT; - RangeDispatchMapT snapshotRangeDispatchMap() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + RangeDispatchMapT snapshotRangeDispatchMap() { return configSpace.pack(__FUNCTION__sr); } // Interval to use for the first (initial) snapshot. - KeyBackedProperty initialSnapshotIntervalSeconds() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty initialSnapshotIntervalSeconds() { return configSpace.pack(__FUNCTION__sr); } // Interval to use for determining the target end version for new snapshots - KeyBackedProperty snapshotIntervalSeconds() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty snapshotIntervalSeconds() { return configSpace.pack(__FUNCTION__sr); } // When the current snapshot began - KeyBackedProperty snapshotBeginVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty snapshotBeginVersion() { return configSpace.pack(__FUNCTION__sr); } // When the current snapshot is desired to end. // This can be changed at runtime to speed up or slow down a snapshot - KeyBackedProperty snapshotTargetEndVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty snapshotTargetEndVersion() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty snapshotBatchSize() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty snapshotBatchSize() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty snapshotBatchFuture() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty snapshotBatchFuture() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty snapshotBatchDispatchDoneKey() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty snapshotBatchDispatchDoneKey() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty snapshotDispatchLastShardsBehind() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty snapshotDispatchLastShardsBehind() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty snapshotDispatchLastVersion() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty snapshotDispatchLastVersion() { return configSpace.pack(__FUNCTION__sr); } Future initNewSnapshot(Reference tr, int64_t intervalSeconds = -1) { BackupConfig& copy = *this; // Capture this by value instead of this ptr @@ -862,51 +835,50 @@ public: }); } - KeyBackedBinaryValue rangeBytesWritten() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue rangeBytesWritten() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedBinaryValue logBytesWritten() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue logBytesWritten() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty stateEnum() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty stateEnum() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty> backupContainer() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty> backupContainer() { return configSpace.pack(__FUNCTION__sr); } // Set to true when all backup 
workers for saving mutation logs have been started. - KeyBackedProperty allWorkerStarted() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty allWorkerStarted() { return configSpace.pack(__FUNCTION__sr); } // Each backup worker adds its (epoch, tag.id) to this property. KeyBackedProperty>> startedBackupWorkers() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); + return configSpace.pack(__FUNCTION__sr); } // Set to true if backup worker is enabled. - KeyBackedProperty backupWorkerEnabled() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty backupWorkerEnabled() { return configSpace.pack(__FUNCTION__sr); } // Set to true if partitioned log is enabled (only useful if backup worker is also enabled). - KeyBackedProperty partitionedLogEnabled() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty partitionedLogEnabled() { return configSpace.pack(__FUNCTION__sr); } // Set to true if only requesting incremental backup without base snapshot. - KeyBackedProperty incrementalBackupOnly() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty incrementalBackupOnly() { return configSpace.pack(__FUNCTION__sr); } // Latest version for which all prior versions have saved by backup workers. - KeyBackedProperty latestBackupWorkerSavedVersion() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty latestBackupWorkerSavedVersion() { return configSpace.pack(__FUNCTION__sr); } // Stop differntial logging if already started or don't start after completing KV ranges - KeyBackedProperty stopWhenDone() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty stopWhenDone() { return configSpace.pack(__FUNCTION__sr); } + + // Enable snapshot backup file encryption + KeyBackedProperty enableSnapshotBackupEncryption() { return configSpace.pack(__FUNCTION__sr); } // Latest version for which all prior versions have had their log copy tasks completed - KeyBackedProperty latestLogEndVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty latestLogEndVersion() { return configSpace.pack(__FUNCTION__sr); } // The end version of the last complete snapshot - KeyBackedProperty latestSnapshotEndVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty latestSnapshotEndVersion() { return configSpace.pack(__FUNCTION__sr); } // The end version of the first complete snapshot - KeyBackedProperty firstSnapshotEndVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty firstSnapshotEndVersion() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty destUidValue() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty destUidValue() { return configSpace.pack(__FUNCTION__sr); } Future> getLatestRestorableVersion(Reference tr) { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); @@ -937,7 +909,7 @@ public: }); } - KeyBackedProperty> backupRanges() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty> backupRanges() { return configSpace.pack(__FUNCTION__sr); } void startMutationLogs(Reference tr, KeyRangeRef backupRange, Key destUidValue) { Key mutationLogsDestKey = destUidValue.withPrefix(backupLogKeys.begin); @@ -1005,7 +977,8 @@ struct StringRefReader { namespace fileBackup { ACTOR Future>> decodeRangeFileBlock(Reference file, int64_t offset, - int len); + int len, + Optional cx); // Reads a 
mutation log block from file and parses into batch mutation blocks for further parsing. ACTOR Future>> decodeMutationLogFileBlock(Reference file, @@ -1027,5 +1000,41 @@ ACTOR Future transformRestoredDatabase(Database cx, void simulateBlobFailure(); +// Add the set of ranges that are backed up in a default backup to the given vector. This consists of all normal keys +// and the system backup ranges. +void addDefaultBackupRanges(Standalone>& backupKeys); + +// Return a vector containing the key ranges in system key-space that should be backed up in a default backup. +VectorRef const& getSystemBackupRanges(); + +// Return a key-range map that can be used to check whether a system key is a candidate backup key (i.e. whether it is +// part of any system backup ranges). +KeyRangeMap const& systemBackupMutationMask(); + +// Returns true if the given set of ranges exactly matches the set of ranges included in a default backup. +template +bool isDefaultBackup(Container ranges) { + std::unordered_set uniqueRanges(ranges.begin(), ranges.end()); + auto& systemBackupRanges = getSystemBackupRanges(); + + if (uniqueRanges.size() != systemBackupRanges.size() + 1) { + return false; + } + + if (!uniqueRanges.count(normalKeys)) { + return false; + } + for (auto range : getSystemBackupRanges()) { + if (!uniqueRanges.count(range)) { + return false; + } + } + + return true; +} + +// Returns a key-range used to denote that a shared mutation stream belongs to the default backup set. +KeyRangeRef const& getDefaultBackupSharedRange(); + #include "flow/unactorcompiler.h" #endif diff --git a/fdbclient/include/fdbclient/BackupContainer.h b/fdbclient/include/fdbclient/BackupContainer.h index 11d5c2ba27..95a1072a05 100644 --- a/fdbclient/include/fdbclient/BackupContainer.h +++ b/fdbclient/include/fdbclient/BackupContainer.h @@ -67,6 +67,9 @@ static const uint32_t PARTITIONED_MLOG_VERSION = 4110; // Snapshot file version written by FileBackupAgent static const uint32_t BACKUP_AGENT_SNAPSHOT_FILE_VERSION = 1001; +// Encrypted Snapshot file version written by FileBackupAgent +static const uint32_t BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION = 1002; + struct LogFile { Version beginVersion; Version endVersion; @@ -250,7 +253,7 @@ public: // Returns the key ranges in the snapshot file. This is an expensive function // and should only be used in simulation for sanity check. - virtual Future getSnapshotFileKeyRange(const RangeFile& file) = 0; + virtual Future getSnapshotFileKeyRange(const RangeFile& file, Optional cx) = 0; struct ExpireProgress { std::string step; @@ -289,6 +292,7 @@ public: // If logsOnly is set, only use log files in [beginVersion, targetVervions) in restore set. // Returns non-present if restoring to the given version is not possible. 
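Stepping back to the default-backup helpers declared above (addDefaultBackupRanges, getSystemBackupRanges, isDefaultBackup): a hedged sketch of how they compose follows, where the container type and the ASSERT are illustrative assumptions rather than code from this patch:

    // A range set seeded by addDefaultBackupRanges contains normalKeys plus every
    // system backup range, so it should satisfy isDefaultBackup.
    Standalone<VectorRef<KeyRangeRef>> backupKeys;
    addDefaultBackupRanges(backupKeys);
    ASSERT(isDefaultBackup(backupKeys));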
virtual Future> getRestoreSet(Version targetVersion, + Optional cx, VectorRef keyRangesFilter = {}, bool logsOnly = false, Version beginVersion = -1) = 0; diff --git a/fdbclient/include/fdbclient/BackupContainerFileSystem.h b/fdbclient/include/fdbclient/BackupContainerFileSystem.h index 17245c9c39..c6819a864e 100644 --- a/fdbclient/include/fdbclient/BackupContainerFileSystem.h +++ b/fdbclient/include/fdbclient/BackupContainerFileSystem.h @@ -152,9 +152,10 @@ public: ExpireProgress* progress, Version restorableBeginVersion) final; - Future getSnapshotFileKeyRange(const RangeFile& file) final; + Future getSnapshotFileKeyRange(const RangeFile& file, Optional cx) final; Future> getRestoreSet(Version targetVersion, + Optional cx, VectorRef keyRangesFilter, bool logsOnly, Version beginVersion) final; diff --git a/flow/include/flow/BlobCipher.h b/fdbclient/include/fdbclient/BlobCipher.h similarity index 82% rename from flow/include/flow/BlobCipher.h rename to fdbclient/include/fdbclient/BlobCipher.h index 27cbbc6c5f..7fc4519ccc 100644 --- a/flow/include/flow/BlobCipher.h +++ b/fdbclient/include/fdbclient/BlobCipher.h @@ -17,10 +17,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef FLOW_BLOB_CIPHER_H -#define FLOW_BLOB_CIPHER_H +#ifndef FDBCLIENT_BLOB_CIPHER_H +#define FDBCLIENT_BLOB_CIPHER_H #pragma once +#include "fdbrpc/Stats.h" #include "flow/Arena.h" #include "flow/EncryptUtils.h" #include "flow/FastRef.h" @@ -28,6 +29,7 @@ #include "flow/genericactors.actor.h" #include "flow/Knobs.h" #include "flow/network.h" +#include "flow/Platform.h" #include "flow/ProtocolVersion.h" #include "flow/serialize.h" @@ -36,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +53,59 @@ #define AES_256_KEY_LENGTH 32 #define AES_256_IV_LENGTH 16 +class BlobCipherMetrics : public NonCopyable { +public: + static BlobCipherMetrics* getInstance() { + static BlobCipherMetrics* instance = nullptr; + if (instance == nullptr) { + instance = new BlobCipherMetrics; + } + return instance; + } + + // Order of this enum has to match initializer of counterSets. + enum UsageType : int { + TLOG = 0, + KV_MEMORY, + KV_REDWOOD, + BLOB_GRANULE, + BACKUP, + TEST, + MAX, + }; + + struct CounterSet { + Counter encryptCPUTimeNS; + Counter decryptCPUTimeNS; + LatencySample getCipherKeysLatency; + LatencySample getLatestCipherKeysLatency; + + CounterSet(CounterCollection& cc, std::string name); + }; + + static CounterSet& counters(UsageType t) { + ASSERT(t < UsageType::MAX); + return getInstance()->counterSets[int(t)]; + } + +private: + BlobCipherMetrics(); + + CounterCollection cc; + Future traceFuture; + +public: + Counter cipherKeyCacheHit; + Counter cipherKeyCacheMiss; + Counter cipherKeyCacheExpired; + Counter latestCipherKeyCacheHit; + Counter latestCipherKeyCacheMiss; + Counter latestCipherKeyCacheNeedsRefresh; + LatencySample getCipherKeysLatency; + LatencySample getLatestCipherKeysLatency; + std::array counterSets; +}; + // Encryption operations buffer management // Approach limits number of copies needed during encryption or decryption operations. 
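For context on the BlobCipherMetrics singleton introduced above: a call site picks a UsageType and updates the matching CounterSet. A hedged sketch of the intended pattern, where the elapsed-time variable is illustrative only and Counter is assumed to support += as in fdbrpc/Stats.h:

    // Attribute encryption CPU time to the BACKUP usage class.
    int64_t encryptElapsedNS = 0; // measured by the caller, illustrative only
    BlobCipherMetrics::CounterSet& cs = BlobCipherMetrics::counters(BlobCipherMetrics::UsageType::BACKUP);
    cs.encryptCPUTimeNS += encryptElapsedNS;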
// For encryption EncryptBuf is allocated using client supplied Arena and provided to AES library to capture @@ -61,7 +117,7 @@ class EncryptBuf : public ReferenceCounted, NonCopyable { public: EncryptBuf(int size, Arena& arena) : allocSize(size), logicalSize(size) { if (size > 0) { - buffer = new (arena) uint8_t[size]; + buffer = new (arena) uint8_t[size](); } else { buffer = nullptr; } @@ -85,9 +141,9 @@ private: #pragma pack(push, 1) // exact fit - no padding struct BlobCipherDetails { // Encryption domain boundary identifier. - EncryptCipherDomainId encryptDomainId = ENCRYPT_INVALID_DOMAIN_ID; + EncryptCipherDomainId encryptDomainId = INVALID_ENCRYPT_DOMAIN_ID; // BaseCipher encryption key identifier - EncryptCipherBaseKeyId baseCipherId = ENCRYPT_INVALID_CIPHER_KEY_ID; + EncryptCipherBaseKeyId baseCipherId = INVALID_ENCRYPT_CIPHER_KEY_ID; // Random salt EncryptCipherRandomSalt salt{}; @@ -137,7 +193,8 @@ typedef struct BlobCipherEncryptHeader { uint8_t headerVersion{}; uint8_t encryptMode{}; uint8_t authTokenMode{}; - uint8_t _reserved[4]{}; + uint8_t authTokenAlgo{}; + uint8_t _reserved[3]{}; } flags; uint64_t _padding{}; }; @@ -168,12 +225,12 @@ typedef struct BlobCipherEncryptHeader { struct { // Cipher text authentication token - uint8_t cipherTextAuthToken[AUTH_TOKEN_SIZE]{}; - uint8_t headerAuthToken[AUTH_TOKEN_SIZE]{}; + uint8_t cipherTextAuthToken[AUTH_TOKEN_MAX_SIZE]{}; + uint8_t headerAuthToken[AUTH_TOKEN_MAX_SIZE]{}; } multiAuthTokens; struct { - uint8_t authToken[AUTH_TOKEN_SIZE]{}; - uint8_t _reserved[AUTH_TOKEN_SIZE]{}; + uint8_t authToken[AUTH_TOKEN_MAX_SIZE]{}; + uint8_t _reserved[AUTH_TOKEN_MAX_SIZE]{}; } singleAuthToken; }; @@ -254,14 +311,14 @@ public: if (refreshAtTS == std::numeric_limits::max()) { return false; } - return now() >= refreshAtTS ? true : false; + return now() + INetwork::TIME_EPS >= refreshAtTS ? true : false; } inline bool isExpired() { if (expireAtTS == std::numeric_limits::max()) { return false; } - return now() >= expireAtTS ? true : false; + return now() + INetwork::TIME_EPS >= expireAtTS ? true : false; } void reset(); @@ -324,8 +381,7 @@ using BlobCipherKeyIdCacheMapCItr = struct BlobCipherKeyIdCache : ReferenceCounted { public: - BlobCipherKeyIdCache(); - explicit BlobCipherKeyIdCache(EncryptCipherDomainId dId); + explicit BlobCipherKeyIdCache(EncryptCipherDomainId dId, size_t* sizeStat); BlobCipherKeyIdCacheKey getCacheKey(const EncryptCipherBaseKeyId& baseCipherId, const EncryptCipherRandomSalt& salt); @@ -378,11 +434,15 @@ public: // API returns list of all 'cached' cipherKeys std::vector> getAllCipherKeys(); + // Return number of cipher keys in the cache. + size_t getSize() const { return keyIdCache.size(); } + private: EncryptCipherDomainId domainId; BlobCipherKeyIdCacheMap keyIdCache; Optional latestBaseCipherKeyId; Optional latestRandomSalt; + size_t* sizeStat; // pointer to the outer BlobCipherKeyCache size count. }; using BlobCipherDomainCacheMap = std::unordered_map>; @@ -447,10 +507,19 @@ public: // API enables dropping all 'cached' cipherKeys for a given encryption domain Id. // Useful to cleanup cache if an encryption domain gets removed/destroyed etc. - void resetEncryptDomainId(const EncryptCipherDomainId domainId); + // Total number of cipher keys in the cache. + size_t getSize() const { return size; } + static Reference getInstance() { + static bool cleanupRegistered = false; + if (!cleanupRegistered) { + // We try to avoid cipher keys appearing in core dumps, so we clean them up before crash.
+ // TODO(yiwu): use of MADV_DONTDUMP instead of the crash handler. + registerCrashHandlerCallback(BlobCipherKeyCache::cleanup); + cleanupRegistered = true; + } if (g_network->isSimulated()) { return FlowSingleton::getInstance( []() { return makeReference(g_network->isSimulated()); }); @@ -466,6 +535,7 @@ public: private: BlobCipherDomainCacheMap domainCacheMap; + size_t size = 0; BlobCipherKeyCache() {} }; @@ -483,17 +553,30 @@ public: Reference hCipherKey, const uint8_t* iv, const int ivLen, - const EncryptAuthTokenMode mode); + const EncryptAuthTokenMode mode, + BlobCipherMetrics::UsageType usageType); EncryptBlobCipherAes265Ctr(Reference tCipherKey, Reference hCipherKey, - const EncryptAuthTokenMode mode); + const uint8_t* iv, + const int ivLen, + const EncryptAuthTokenMode mode, + const EncryptAuthTokenAlgo algo, + BlobCipherMetrics::UsageType usageType); + EncryptBlobCipherAes265Ctr(Reference tCipherKey, + Reference hCipherKey, + const EncryptAuthTokenMode mode, + BlobCipherMetrics::UsageType usageType); + EncryptBlobCipherAes265Ctr(Reference tCipherKey, + Reference hCipherKey, + const EncryptAuthTokenMode mode, + const EncryptAuthTokenAlgo algo, + BlobCipherMetrics::UsageType usageType); ~EncryptBlobCipherAes265Ctr(); Reference encrypt(const uint8_t* plaintext, const int plaintextLen, BlobCipherEncryptHeader* header, Arena&); - Standalone encryptBlobGranuleChunk(const uint8_t* plaintext, const int plaintextLen); private: EVP_CIPHER_CTX* ctx; @@ -501,6 +584,8 @@ private: Reference headerCipherKey; EncryptAuthTokenMode authTokenMode; uint8_t iv[AES_256_IV_LENGTH]; + BlobCipherMetrics::UsageType usageType; + EncryptAuthTokenAlgo authTokenAlgo; void init(); }; @@ -512,7 +597,8 @@ class DecryptBlobCipherAes256Ctr final : NonCopyable, public ReferenceCounted tCipherKey, Reference hCipherKey, - const uint8_t* iv); + const uint8_t* iv, + BlobCipherMetrics::UsageType usageType); ~DecryptBlobCipherAes256Ctr(); Reference decrypt(const uint8_t* ciphertext, @@ -531,22 +617,20 @@ private: Reference headerCipherKey; bool headerAuthTokenValidationDone; bool authTokensValidationDone; + BlobCipherMetrics::UsageType usageType; void verifyEncryptHeaderMetadata(const BlobCipherEncryptHeader& header); void verifyAuthTokens(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeader& header, - uint8_t* buff, Arena& arena); void verifyHeaderSingleAuthToken(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeader& header, - uint8_t* buff, Arena& arena); void verifyHeaderMultiAuthToken(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeader& header, - uint8_t* buff, Arena& arena); }; @@ -555,16 +639,32 @@ public: HmacSha256DigestGen(const unsigned char* key, size_t len); ~HmacSha256DigestGen(); HMAC_CTX* getCtx() const { return ctx; } - StringRef digest(unsigned char const* data, size_t len, Arena&); + unsigned int digest(const std::vector>& payload, + unsigned char* buf, + unsigned int bufLen); private: HMAC_CTX* ctx; }; -StringRef computeAuthToken(const uint8_t* payload, - const int payloadLen, - const uint8_t* key, - const int keyLen, - Arena& arena); +class Aes256CmacDigestGen final : NonCopyable { +public: + Aes256CmacDigestGen(const unsigned char* key, size_t len); + ~Aes256CmacDigestGen(); + CMAC_CTX* getCtx() const { return ctx; } + size_t digest(const std::vector>& payload, uint8_t* digest, int digestlen); -#endif // FLOW_BLOB_CIPHER_H \ No newline at end of file +private: + CMAC_CTX* ctx; +}; + +void computeAuthToken(const 
std::vector>& payload, + const uint8_t* key, + const int keyLen, + unsigned char* digestBuf, + const EncryptAuthTokenAlgo algo, + unsigned int digestMaxBufSz); + +EncryptAuthTokenMode getEncryptAuthTokenMode(const EncryptAuthTokenMode mode); + +#endif // FDBCLIENT_BLOB_CIPHER_H diff --git a/fdbclient/include/fdbclient/BlobGranuleCommon.h b/fdbclient/include/fdbclient/BlobGranuleCommon.h index 698a54886a..6f530f020d 100644 --- a/fdbclient/include/fdbclient/BlobGranuleCommon.h +++ b/fdbclient/include/fdbclient/BlobGranuleCommon.h @@ -22,10 +22,10 @@ #define FDBCLIENT_BLOBGRANULECOMMON_H #pragma once +#include "fdbclient/BlobCipher.h" #include "fdbclient/CommitTransaction.h" #include "fdbclient/FDBTypes.h" -#include "flow/BlobCipher.h" #include "flow/EncryptUtils.h" #include "flow/IRandom.h" #include "flow/serialize.h" @@ -35,7 +35,6 @@ #define BG_ENCRYPT_COMPRESS_DEBUG false // file format of actual blob files -// FIXME: use VecSerStrategy::String serialization for this struct GranuleSnapshot : VectorRef { constexpr static FileIdentifier file_identifier = 1300395; @@ -56,6 +55,13 @@ struct GranuleDeltas : VectorRef { } }; +struct GranuleMaterializeStats { + int64_t inputBytes; + int64_t outputBytes; + + GranuleMaterializeStats() : inputBytes(0), outputBytes(0) {} +}; + struct BlobGranuleCipherKeysMeta { EncryptCipherDomainId textDomainId; EncryptCipherBaseKeyId textBaseCipherId; @@ -234,6 +240,22 @@ struct BlobGranuleChunkRef { } }; +struct BlobGranuleSummaryRef { + constexpr static FileIdentifier file_identifier = 9774587; + KeyRangeRef keyRange; + Version snapshotVersion; + int64_t snapshotSize; + Version deltaVersion; + int64_t deltaSize; + + template + void serialize(Ar& ar) { + serializer(ar, keyRange, snapshotVersion, snapshotSize, deltaVersion, deltaSize); + } +}; + +BlobGranuleSummaryRef summarizeGranuleChunk(Arena& ar, const BlobGranuleChunkRef& chunk); + enum BlobGranuleSplitState { Unknown = 0, Initialized = 1, Assigned = 2, Done = 3 }; // Boundary metadata for each range indexed by the beginning of the range. 
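A note on the new BlobGranuleSummaryRef and summarizeGranuleChunk declared above: a reader that only needs sizes and versions can summarize chunks instead of materializing them. A hedged sketch of consuming the summary, where the chunks collection is assumed to come from an existing granule read path:

    // Sum the stored footprint of a set of granule chunks via their summaries.
    Arena arena;
    int64_t totalBytes = 0;
    for (const BlobGranuleChunkRef& chunk : chunks) {
        BlobGranuleSummaryRef summary = summarizeGranuleChunk(arena, chunk);
        totalBytes += summary.snapshotSize + summary.deltaSize;
    }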
@@ -261,4 +283,26 @@ struct BlobGranuleHistoryValue { } }; +struct GranuleHistory { + KeyRange range; + Version version; + Standalone value; + + GranuleHistory() {} + + GranuleHistory(KeyRange range, Version version, Standalone value) + : range(range), version(version), value(value) {} +}; + +// A manifest to assist full fdb restore from blob granule files +struct BlobManifest { + constexpr static FileIdentifier file_identifier = 298872; + VectorRef rows; + + template + void serialize(Ar& ar) { + serializer(ar, rows); + } +}; + #endif diff --git a/fdbclient/include/fdbclient/BlobGranuleFiles.h b/fdbclient/include/fdbclient/BlobGranuleFiles.h index f6a159a7fa..23faff3d03 100644 --- a/fdbclient/include/fdbclient/BlobGranuleFiles.h +++ b/fdbclient/include/fdbclient/BlobGranuleFiles.h @@ -43,7 +43,8 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone readBlobGranules(BlobGranuleFileRequest request, Reference bstore, PromiseStream results); +bool isRangeFullyCovered(KeyRange range, Standalone> blobChunks); + #include "flow/unactorcompiler.h" #endif diff --git a/fdbclient/include/fdbclient/BlobWorkerCommon.h b/fdbclient/include/fdbclient/BlobWorkerCommon.h index 7c6fd91fe9..9539db459b 100644 --- a/fdbclient/include/fdbclient/BlobWorkerCommon.h +++ b/fdbclient/include/fdbclient/BlobWorkerCommon.h @@ -30,7 +30,7 @@ struct BlobWorkerStats { Counter deltaBytesWritten, snapshotBytesWritten; Counter bytesReadFromFDBForInitialSnapshot; Counter bytesReadFromS3ForCompaction; - Counter rangeAssignmentRequests, readRequests; + Counter rangeAssignmentRequests, readRequests, summaryReads; Counter wrongShardServer; Counter changeFeedInputBytes; Counter readReqTotalFilesReturned; @@ -44,12 +44,14 @@ struct BlobWorkerStats { Counter compressionBytesRaw; Counter compressionBytesFinal; Counter fullRejections; + Counter forceFlushCleanups; int numRangesAssigned; int mutationBytesBuffered; int activeReadRequests; int granulesPendingSplitCheck; Version minimumCFVersion; + Version cfVersionLag; int notAtLatestChangeFeeds; int64_t lastResidentMemory; int64_t estimatedMaxResidentMemory; @@ -74,14 +76,15 @@ struct BlobWorkerStats { bytesReadFromFDBForInitialSnapshot("BytesReadFromFDBForInitialSnapshot", cc), bytesReadFromS3ForCompaction("BytesReadFromS3ForCompaction", cc), rangeAssignmentRequests("RangeAssignmentRequests", cc), readRequests("ReadRequests", cc), - wrongShardServer("WrongShardServer", cc), changeFeedInputBytes("ChangeFeedInputBytes", cc), - readReqTotalFilesReturned("ReadReqTotalFilesReturned", cc), + summaryReads("SummaryReads", cc), wrongShardServer("WrongShardServer", cc), + changeFeedInputBytes("ChangeFeedInputBytes", cc), readReqTotalFilesReturned("ReadReqTotalFilesReturned", cc), readReqDeltaBytesReturned("ReadReqDeltaBytesReturned", cc), commitVersionChecks("CommitVersionChecks", cc), granuleUpdateErrors("GranuleUpdateErrors", cc), granuleRequestTimeouts("GranuleRequestTimeouts", cc), readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc), flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc), - compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc), numRangesAssigned(0), - mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), + compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc), + forceFlushCleanups("ForceFlushCleanups", cc), numRangesAssigned(0), mutationBytesBuffered(0), + 
activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0), estimatedMaxResidentMemory(0), initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), deltaWritesLock(deltaWritesLock) { specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; }); @@ -89,6 +92,7 @@ struct BlobWorkerStats { specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; }); specialCounter(cc, "GranulesPendingSplitCheck", [this]() { return this->granulesPendingSplitCheck; }); specialCounter(cc, "MinimumChangeFeedVersion", [this]() { return this->minimumCFVersion; }); + specialCounter(cc, "CFVersionLag", [this]() { return this->cfVersionLag; }); specialCounter(cc, "NotAtLatestChangeFeeds", [this]() { return this->notAtLatestChangeFeeds; }); specialCounter(cc, "LastResidentMemory", [this]() { return this->lastResidentMemory; }); specialCounter(cc, "EstimatedMaxResidentMemory", [this]() { return this->estimatedMaxResidentMemory; }); diff --git a/fdbclient/include/fdbclient/BlobWorkerInterface.h b/fdbclient/include/fdbclient/BlobWorkerInterface.h index b7313db1fc..69d938300e 100644 --- a/fdbclient/include/fdbclient/BlobWorkerInterface.h +++ b/fdbclient/include/fdbclient/BlobWorkerInterface.h @@ -30,7 +30,6 @@ struct BlobWorkerInterface { constexpr static FileIdentifier file_identifier = 8358753; - // TODO: mimic what StorageServerInterface does with sequential endpoint IDs RequestStream> waitFailure; PublicRequestStream blobGranuleFileRequest; RequestStream assignBlobRangeRequest; @@ -114,6 +113,7 @@ struct BlobGranuleFileRequest { Version readVersion; bool canCollapseBegin = true; TenantInfo tenantInfo; + bool summarize = false; ReplyPromise reply; BlobGranuleFileRequest() {} @@ -122,7 +122,7 @@ struct BlobGranuleFileRequest { template void serialize(Ar& ar) { - serializer(ar, keyRange, beginVersion, readVersion, canCollapseBegin, tenantInfo, reply, arena); + serializer(ar, keyRange, beginVersion, readVersion, canCollapseBegin, tenantInfo, summarize, reply, arena); } }; diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index 6075b486cc..61f0359539 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -78,6 +78,7 @@ public: int64_t CHANGE_FEED_CACHE_SIZE; double CHANGE_FEED_POP_TIMEOUT; int64_t CHANGE_FEED_STREAM_MIN_BYTES; + double CHANGE_FEED_START_INTERVAL; int MAX_BATCH_SIZE; double GRV_BATCH_TIMEOUT; @@ -253,12 +254,13 @@ public: int MAX_TRANSACTION_TAG_LENGTH; int MAX_TAGS_PER_TRANSACTION; int COMMIT_SAMPLE_COST; // The expectation of sampling is every COMMIT_SAMPLE_COST sample once - int WRITE_COST_BYTE_FACTOR; int INCOMPLETE_SHARD_PLUS; // The size of (possible) incomplete shard when estimate clear range double READ_TAG_SAMPLE_RATE; // Communicated to clients from cluster double TAG_THROTTLE_SMOOTHING_WINDOW; double TAG_THROTTLE_RECHECK_INTERVAL; double TAG_THROTTLE_EXPIRATION_INTERVAL; + int64_t WRITE_COST_BYTE_FACTOR; // Used to round up the cost of write operations + int64_t READ_COST_BYTE_FACTOR; // Used to round up the cost of read operations // busyness reporting double BUSYNESS_SPIKE_START_THRESHOLD; @@ -283,6 +285,10 @@ public: int METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK; double METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY; double METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT; + int TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantEntryCache is 
refreshed + + // Encryption-at-rest + bool ENABLE_ENCRYPTION_CPU_TIME_LOGGING; ClientKnobs(Randomize randomize); void initialize(Randomize randomize); diff --git a/fdbclient/include/fdbclient/ClientVersion.h b/fdbclient/include/fdbclient/ClientVersion.h index fe3068affc..c395a69cc2 100644 --- a/fdbclient/include/fdbclient/ClientVersion.h +++ b/fdbclient/include/fdbclient/ClientVersion.h @@ -37,7 +37,7 @@ struct ClientVersionRef { ClientVersionRef(StringRef clientVersion, StringRef sourceVersion, StringRef protocolVersion) : clientVersion(clientVersion), sourceVersion(sourceVersion), protocolVersion(protocolVersion) {} ClientVersionRef(StringRef versionString) { - std::vector parts = versionString.splitAny(LiteralStringRef(",")); + std::vector parts = versionString.splitAny(","_sr); if (parts.size() != 3) { initUnknown(); return; @@ -48,9 +48,9 @@ struct ClientVersionRef { } void initUnknown() { - clientVersion = LiteralStringRef("Unknown"); - sourceVersion = LiteralStringRef("Unknown"); - protocolVersion = LiteralStringRef("Unknown"); + clientVersion = "Unknown"_sr; + sourceVersion = "Unknown"_sr; + protocolVersion = "Unknown"_sr; } template diff --git a/fdbclient/include/fdbclient/ClusterInterface.h b/fdbclient/include/fdbclient/ClusterInterface.h index a88a721757..a4e3da44f3 100644 --- a/fdbclient/include/fdbclient/ClusterInterface.h +++ b/fdbclient/include/fdbclient/ClusterInterface.h @@ -114,7 +114,7 @@ struct OpenDatabaseRequest { template void serialize(Ar& ar) { - serializer(ar, samples); + serializer(ar, count, samples); } // Merges a set of Samples into *this diff --git a/fdbclient/include/fdbclient/CommitProxyInterface.h b/fdbclient/include/fdbclient/CommitProxyInterface.h index 93247ec678..1614aeacf0 100644 --- a/fdbclient/include/fdbclient/CommitProxyInterface.h +++ b/fdbclient/include/fdbclient/CommitProxyInterface.h @@ -25,16 +25,17 @@ #include #include -#include "fdbclient/FDBTypes.h" -#include "fdbclient/StorageServerInterface.h" #include "fdbclient/CommitTransaction.h" -#include "fdbclient/TagThrottle.actor.h" +#include "fdbclient/EncryptKeyProxyInterface.h" +#include "fdbclient/FDBTypes.h" #include "fdbclient/GlobalConfig.h" +#include "fdbclient/GrvProxyInterface.h" +#include "fdbclient/StorageServerInterface.h" +#include "fdbclient/TagThrottle.actor.h" #include "fdbclient/VersionVector.h" #include "fdbrpc/Stats.h" #include "fdbrpc/TimedRequest.h" -#include "GrvProxyInterface.h" struct CommitProxyInterface { constexpr static FileIdentifier file_identifier = 8954922; @@ -118,6 +119,7 @@ struct ClientDBInfo { std::vector history; UID clusterId; bool isEncryptionEnabled = false; + Optional encryptKeyProxy; TenantMode tenantMode; ClusterType clusterType = ClusterType::STANDALONE; @@ -141,6 +143,7 @@ struct ClientDBInfo { history, tenantMode, isEncryptionEnabled, + encryptKeyProxy, clusterId, clusterType, metaclusterName); @@ -194,7 +197,7 @@ struct CommitTransactionRequest : TimedRequest { template void serialize(Ar& ar) { serializer( - ar, transaction, reply, arena, flags, debugID, commitCostEstimation, tagSet, spanContext, tenantInfo); + ar, transaction, reply, flags, debugID, commitCostEstimation, tagSet, spanContext, tenantInfo, arena); } }; @@ -336,7 +339,7 @@ struct GetKeyServerLocationsReply { template void serialize(Ar& ar) { - serializer(ar, results, resultsTssMapping, tenantEntry, arena, resultsTagMapping); + serializer(ar, results, resultsTssMapping, tenantEntry, resultsTagMapping, arena); } }; @@ -540,7 +543,7 @@ struct ProxySnapRequest { template void 
serialize(Ar& ar) { - serializer(ar, snapPayload, snapUID, reply, arena, debugID); + serializer(ar, snapPayload, snapUID, reply, debugID, arena); } }; diff --git a/fdbclient/include/fdbclient/CommitTransaction.h b/fdbclient/include/fdbclient/CommitTransaction.h index dc26df4fa4..2c7c62659b 100644 --- a/fdbclient/include/fdbclient/CommitTransaction.h +++ b/fdbclient/include/fdbclient/CommitTransaction.h @@ -22,10 +22,13 @@ #define FLOW_FDBCLIENT_COMMITTRANSACTION_H #pragma once +#include "fdbclient/BlobCipher.h" #include "fdbclient/FDBTypes.h" +#include "fdbclient/GetEncryptCipherKeys.actor.h" #include "fdbclient/Knobs.h" #include "fdbclient/Tracing.h" -#include "flow/BlobCipher.h" +#include "flow/EncryptUtils.h" +#include "flow/Knobs.h" // The versioned message has wire format : -1, version, messages static const int32_t VERSION_HEADER = -1; @@ -141,8 +144,9 @@ struct MutationRef { MutationRef encrypt(const std::unordered_map>& cipherKeys, const EncryptCipherDomainId& domainId, - Arena& arena) const { - ASSERT_NE(domainId, ENCRYPT_INVALID_DOMAIN_ID); + Arena& arena, + BlobCipherMetrics::UsageType usageType) const { + ASSERT_NE(domainId, INVALID_ENCRYPT_DOMAIN_ID); auto textCipherItr = cipherKeys.find(domainId); auto headerCipherItr = cipherKeys.find(ENCRYPT_HEADER_DOMAIN_ID); ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); @@ -151,11 +155,13 @@ struct MutationRef { deterministicRandom()->randomBytes(iv, AES_256_IV_LENGTH); BinaryWriter bw(AssumeVersion(ProtocolVersion::withEncryptionAtRest())); bw << *this; - EncryptBlobCipherAes265Ctr cipher(textCipherItr->second, - headerCipherItr->second, - iv, - AES_256_IV_LENGTH, - ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + EncryptBlobCipherAes265Ctr cipher( + textCipherItr->second, + headerCipherItr->second, + iv, + AES_256_IV_LENGTH, + getEncryptAuthTokenMode(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE), + usageType); BlobCipherEncryptHeader* header = new (arena) BlobCipherEncryptHeader; StringRef headerRef(reinterpret_cast(header), sizeof(BlobCipherEncryptHeader)); StringRef payload = @@ -164,19 +170,17 @@ struct MutationRef { } MutationRef encryptMetadata(const std::unordered_map>& cipherKeys, - Arena& arena) const { - return encrypt(cipherKeys, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, arena); + Arena& arena, + BlobCipherMetrics::UsageType usageType) const { + return encrypt(cipherKeys, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, arena, usageType); } - MutationRef decrypt(const std::unordered_map>& cipherKeys, + MutationRef decrypt(TextAndHeaderCipherKeys cipherKeys, Arena& arena, + BlobCipherMetrics::UsageType usageType, StringRef* buf = nullptr) const { const BlobCipherEncryptHeader* header = encryptionHeader(); - auto textCipherItr = cipherKeys.find(header->cipherTextDetails); - auto headerCipherItr = cipherKeys.find(header->cipherHeaderDetails); - ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); - ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid()); - DecryptBlobCipherAes256Ctr cipher(textCipherItr->second, headerCipherItr->second, header->iv); + DecryptBlobCipherAes256Ctr cipher(cipherKeys.cipherTextKey, cipherKeys.cipherHeaderKey, header->iv, usageType); StringRef plaintext = cipher.decrypt(param2.begin(), param2.size(), *header, arena)->toStringRef(); if (buf != nullptr) { *buf = plaintext; @@ -187,6 +191,21 @@ struct MutationRef { return mutation; } + MutationRef decrypt(const std::unordered_map>& cipherKeys, + Arena& arena, + 
BlobCipherMetrics::UsageType usageType, + StringRef* buf = nullptr) const { + const BlobCipherEncryptHeader* header = encryptionHeader(); + auto textCipherItr = cipherKeys.find(header->cipherTextDetails); + auto headerCipherItr = cipherKeys.find(header->cipherHeaderDetails); + ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); + ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid()); + TextAndHeaderCipherKeys textAndHeaderKeys; + textAndHeaderKeys.cipherHeaderKey = headerCipherItr->second; + textAndHeaderKeys.cipherTextKey = textCipherItr->second; + return decrypt(textAndHeaderKeys, arena, usageType, buf); + } + // These masks define which mutation types have particular properties (they are used to implement // isSingleKeyMutation() etc) enum { @@ -249,6 +268,11 @@ struct CommitTransactionRef { VectorRef read_conflict_ranges; VectorRef write_conflict_ranges; VectorRef mutations; // metadata mutations + // encryptedMutations should be in 1-1 correspondence with the mutations field above. That is either + encryptedMutations.size() == 0 or encryptedMutations.size() == mutations.size() and encryptedMutations[i] = + mutations[i].encrypt(). Currently this field is not serialized so clients should NOT set this field during a + usual commit path. It is currently only used during backup mutation log restores. + VectorRef> encryptedMutations; Version read_snapshot = 0; bool report_conflicting_keys = false; bool lock_aware = false; // set when metadata mutations are present @@ -271,7 +295,19 @@ struct CommitTransactionRef { serializer(ar, report_conflicting_keys); } if (ar.protocolVersion().hasResolverPrivateMutations()) { - serializer(ar, lock_aware, spanContext); + serializer(ar, lock_aware); + if (!ar.protocolVersion().hasOTELSpanContext()) { + Optional context; + serializer(ar, context); + if (context.present()) { + SpanContext res; + res.traceID = context.get(); + spanContext = res; + } + } + } + if (ar.protocolVersion().hasOTELSpanContext()) { + serializer(ar, spanContext); + } } } } diff --git a/fdbclient/include/fdbclient/ConfigKnobs.h b/fdbclient/include/fdbclient/ConfigKnobs.h index 536bca16f3..168e2fed16 100644 --- a/fdbclient/include/fdbclient/ConfigKnobs.h +++ b/fdbclient/include/fdbclient/ConfigKnobs.h @@ -25,6 +25,8 @@ #include "fdbclient/FDBTypes.h" +typedef uint64_t CoordinatorsHash; + /* * KnobValueRefs are stored in the configuration database, and in local configuration files.
They are created from * ParsedKnobValue objects, so it is assumed that the value type is correct for the corresponding knob name diff --git a/fdbclient/include/fdbclient/ConfigTransactionInterface.h b/fdbclient/include/fdbclient/ConfigTransactionInterface.h index 98b65e4c4b..dad60f2d04 100644 --- a/fdbclient/include/fdbclient/ConfigTransactionInterface.h +++ b/fdbclient/include/fdbclient/ConfigTransactionInterface.h @@ -65,16 +65,18 @@ struct ConfigTransactionGetGenerationReply { struct ConfigTransactionGetGenerationRequest { static constexpr FileIdentifier file_identifier = 138941; + CoordinatorsHash coordinatorsHash{ 0 }; // A hint to catch up lagging nodes: Optional lastSeenLiveVersion; ReplyPromise reply; ConfigTransactionGetGenerationRequest() = default; - explicit ConfigTransactionGetGenerationRequest(Optional const& lastSeenLiveVersion) - : lastSeenLiveVersion(lastSeenLiveVersion) {} + explicit ConfigTransactionGetGenerationRequest(CoordinatorsHash coordinatorsHash, + Optional const& lastSeenLiveVersion) + : coordinatorsHash(coordinatorsHash), lastSeenLiveVersion(lastSeenLiveVersion) {} template void serialize(Ar& ar) { - serializer(ar, lastSeenLiveVersion, reply); + serializer(ar, coordinatorsHash, lastSeenLiveVersion, reply); } }; @@ -92,39 +94,43 @@ struct ConfigTransactionGetReply { struct ConfigTransactionGetRequest { static constexpr FileIdentifier file_identifier = 923040; + CoordinatorsHash coordinatorsHash{ 0 }; ConfigGeneration generation; ConfigKey key; ReplyPromise reply; ConfigTransactionGetRequest() = default; - explicit ConfigTransactionGetRequest(ConfigGeneration generation, ConfigKey key) - : generation(generation), key(key) {} + explicit ConfigTransactionGetRequest(CoordinatorsHash coordinatorsHash, ConfigGeneration generation, ConfigKey key) + : coordinatorsHash(coordinatorsHash), generation(generation), key(key) {} template void serialize(Ar& ar) { - serializer(ar, generation, key, reply); + serializer(ar, coordinatorsHash, generation, key, reply); } }; struct ConfigTransactionCommitRequest { static constexpr FileIdentifier file_identifier = 103841; Arena arena; + CoordinatorsHash coordinatorsHash{ 0 }; ConfigGeneration generation{ ::invalidVersion, ::invalidVersion }; VectorRef mutations; ConfigCommitAnnotationRef annotation; ReplyPromise reply; ConfigTransactionCommitRequest() = default; - explicit ConfigTransactionCommitRequest(ConfigGeneration generation, + explicit ConfigTransactionCommitRequest(CoordinatorsHash coordinatorsHash, + ConfigGeneration generation, VectorRef mutations, ConfigCommitAnnotationRef annotation) - : generation(generation), mutations(arena, mutations), annotation(arena, annotation) {} + : coordinatorsHash(coordinatorsHash), generation(generation), mutations(arena, mutations), + annotation(arena, annotation) {} size_t expectedSize() const { return mutations.expectedSize() + annotation.expectedSize(); } template void serialize(Ar& ar) { - serializer(ar, arena, generation, mutations, annotation, reply); + serializer(ar, coordinatorsHash, generation, mutations, annotation, reply, arena); } }; @@ -144,15 +150,17 @@ struct ConfigTransactionGetConfigClassesReply { struct ConfigTransactionGetConfigClassesRequest { static constexpr FileIdentifier file_identifier = 7163400; + CoordinatorsHash coordinatorsHash{ 0 }; ConfigGeneration generation; ReplyPromise reply; ConfigTransactionGetConfigClassesRequest() = default; - explicit ConfigTransactionGetConfigClassesRequest(ConfigGeneration generation) : generation(generation) {} + explicit 
ConfigTransactionGetConfigClassesRequest(CoordinatorsHash coordinatorsHash, ConfigGeneration generation) + : coordinatorsHash(coordinatorsHash), generation(generation) {} template void serialize(Ar& ar) { - serializer(ar, generation); + serializer(ar, coordinatorsHash, generation); } }; @@ -171,17 +179,20 @@ struct ConfigTransactionGetKnobsReply { struct ConfigTransactionGetKnobsRequest { static constexpr FileIdentifier file_identifier = 987410; + CoordinatorsHash coordinatorsHash{ 0 }; ConfigGeneration generation; Optional configClass; ReplyPromise reply; ConfigTransactionGetKnobsRequest() = default; - explicit ConfigTransactionGetKnobsRequest(ConfigGeneration generation, Optional configClass) - : generation(generation), configClass(configClass) {} + explicit ConfigTransactionGetKnobsRequest(CoordinatorsHash coordinatorsHash, + ConfigGeneration generation, + Optional configClass) + : coordinatorsHash(coordinatorsHash), generation(generation), configClass(configClass) {} template void serialize(Ar& ar) { - serializer(ar, generation, configClass, reply); + serializer(ar, coordinatorsHash, generation, configClass, reply); } }; diff --git a/fdbclient/include/fdbclient/ConsistencyScanInterface.actor.h b/fdbclient/include/fdbclient/ConsistencyScanInterface.actor.h new file mode 100644 index 0000000000..7d6529ced0 --- /dev/null +++ b/fdbclient/include/fdbclient/ConsistencyScanInterface.actor.h @@ -0,0 +1,196 @@ +/* + * ConsistencyScanInterface.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_CONSISTENCYSCANINTERFACE_ACTOR_G_H) +#define FDBCLIENT_CONSISTENCYSCANINTERFACE_ACTOR_G_H +#include "fdbclient/ConsistencyScanInterface.actor.g.h" +#elif !defined(FDBCLIENT_CONSISTENCYSCANINTERFACE_ACTOR_H) +#define FDBCLIENT_CONSISTENCYSCANINTERFACE_ACTOR_H + +#include "fdbclient/CommitProxyInterface.h" +#include "fdbclient/DatabaseConfiguration.h" +#include "fdbclient/FDBTypes.h" +#include "fdbclient/RunTransaction.actor.h" +#include "fdbrpc/fdbrpc.h" +#include "fdbrpc/Locality.h" + +#include "flow/actorcompiler.h" // must be last include + +struct ConsistencyScanInterface { + constexpr static FileIdentifier file_identifier = 4983265; + RequestStream> waitFailure; + RequestStream haltConsistencyScan; + struct LocalityData locality; + UID myId; + + ConsistencyScanInterface() {} + explicit ConsistencyScanInterface(const struct LocalityData& l, UID id) : locality(l), myId(id) {} + + void initEndpoints() {} + UID id() const { return myId; } + NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); } + bool operator==(const ConsistencyScanInterface& r) const { return id() == r.id(); } + bool operator!=(const ConsistencyScanInterface& r) const { return !(*this == r); } + + template + void serialize(Archive& ar) { + serializer(ar, waitFailure, haltConsistencyScan, locality, myId); + } +}; + +struct HaltConsistencyScanRequest { + constexpr static FileIdentifier file_identifier = 2323417; + UID requesterID; + ReplyPromise reply; + + HaltConsistencyScanRequest() {} + explicit HaltConsistencyScanRequest(UID uid) : requesterID(uid) {} + + template + void serialize(Ar& ar) { + serializer(ar, requesterID, reply); + } +}; + +// consistency scan configuration and metrics +struct ConsistencyScanInfo { + constexpr static FileIdentifier file_identifier = 732125; + bool consistency_scan_enabled = false; + bool restart = false; + int64_t max_rate = 0; + int64_t target_interval = CLIENT_KNOBS->CONSISTENCY_CHECK_ONE_ROUND_TARGET_COMPLETION_TIME; + int64_t bytes_read_prev_round = 0; + KeyRef progress_key = KeyRef(); + + // Round Metrics - one round of complete validation across all SSs + // Start and finish are in epoch seconds + double last_round_start = 0; + double last_round_finish = 0; + TimerSmoother smoothed_round_duration; + int finished_rounds = 0; + + ConsistencyScanInfo() : smoothed_round_duration(20.0 * 60) {} + ConsistencyScanInfo(bool enabled, bool r, uint64_t rate, uint64_t interval) + : consistency_scan_enabled(enabled), restart(r), max_rate(rate), target_interval(interval), + smoothed_round_duration(20.0 * 60) {} + + template + void serialize(Ar& ar) { + double round_total; + if (!ar.isDeserializing) { + round_total = smoothed_round_duration.getTotal(); + } + serializer(ar, + consistency_scan_enabled, + restart, + max_rate, + target_interval, + bytes_read_prev_round, + last_round_start, + last_round_finish, + round_total, + finished_rounds); + if (ar.isDeserializing) { + smoothed_round_duration.reset(round_total); + } + } + + static Future setInfo(Reference tr, ConsistencyScanInfo info) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + tr->set(consistencyScanInfoKey, ObjectWriter::toValue(info, IncludeVersion())); + return Void(); + } + + static Future setInfo(Database cx, ConsistencyScanInfo info) { + return runRYWTransaction( + cx, [=](Reference tr) -> Future { return setInfo(tr, info); }); + } + + static Future> getInfo(Reference tr) { + 
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + return tr->get(consistencyScanInfoKey); + } + + static Future> getInfo(Database cx) { + return runRYWTransaction( + cx, [=](Reference tr) -> Future> { return getInfo(tr); }); + } + + StatusObject toJSON() const { + StatusObject result; + result["consistency_scan_enabled"] = consistency_scan_enabled; + result["restart"] = restart; + result["max_rate"] = max_rate; + result["target_interval"] = target_interval; + result["bytes_read_prev_round"] = bytes_read_prev_round; + result["last_round_start_datetime"] = epochsToGMTString(last_round_start); + result["last_round_finish_datetime"] = epochsToGMTString(last_round_finish); + result["last_round_start_timestamp"] = last_round_start; + result["last_round_finish_timestamp"] = last_round_finish; + result["smoothed_round_seconds"] = smoothed_round_duration.smoothTotal(); + result["finished_rounds"] = finished_rounds; + return result; + } + + std::string toString() const { + return format("consistency_scan_enabled = %d, restart = %d, max_rate = %ld, target_interval = %ld", + consistency_scan_enabled, + restart, + max_rate, + target_interval); + } +}; + +ACTOR Future getVersion(Database cx); +ACTOR Future getKeyServers( + Database cx, + Promise>>> keyServersPromise, + KeyRangeRef kr, + bool performQuiescentChecks); +ACTOR Future getKeyLocations(Database cx, + std::vector>> shards, + Promise>> keyLocationPromise, + bool performQuiescentChecks); +ACTOR Future checkDataConsistency(Database cx, + VectorRef keyLocations, + DatabaseConfiguration configuration, + std::map tssMapping, + bool performQuiescentChecks, + bool performTSSCheck, + bool firstClient, + bool failureIsError, + int clientId, + int clientCount, + bool distributed, + bool shuffleShards, + int shardSampleFactor, + int64_t sharedRandomNumber, + int64_t repetitions, + int64_t* bytesReadInPreviousRound, + int restart, + int64_t maxRate, + int64_t targetInterval, + KeyRef progressKey); + +#include "flow/unactorcompiler.h" + +#endif // FDBCLIENT_CONSISTENCYSCANINTERFACE_H \ No newline at end of file diff --git a/fdbclient/include/fdbclient/CoordinationInterface.h b/fdbclient/include/fdbclient/CoordinationInterface.h index cc8042976d..c5d53d9ad7 100644 --- a/fdbclient/include/fdbclient/CoordinationInterface.h +++ b/fdbclient/include/fdbclient/CoordinationInterface.h @@ -86,6 +86,8 @@ public: std::vector coords; std::vector hostnames; + size_t getNumberOfCoordinators() const { return coords.size() + hostnames.size(); } + bool operator==(const ClusterConnectionString& other) const noexcept { return key == other.key && keyDesc == other.keyDesc && coords == other.coords && hostnames == other.hostnames; } diff --git a/fdbclient/include/fdbclient/DatabaseConfiguration.h b/fdbclient/include/fdbclient/DatabaseConfiguration.h index 363b48e4b6..f73cd99c9d 100644 --- a/fdbclient/include/fdbclient/DatabaseConfiguration.h +++ b/fdbclient/include/fdbclient/DatabaseConfiguration.h @@ -256,6 +256,8 @@ struct DatabaseConfiguration { bool blobGranulesEnabled; TenantMode tenantMode; + EncryptionAtRestMode encryptionAtRestMode; + // Excluded servers (no state should be here) bool isExcludedServer(NetworkAddressList) const; bool isExcludedLocality(const LocalityData& locality) const; diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index c3734e2889..6b8db6953b 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ 
b/fdbclient/include/fdbclient/DatabaseContext.h @@ -21,6 +21,7 @@ #ifndef DatabaseContext_h #define DatabaseContext_h #include "fdbclient/Notified.h" +#include "flow/ApiVersion.h" #include "flow/FastAlloc.h" #include "flow/FastRef.h" #include "fdbclient/GlobalConfig.actor.h" @@ -168,10 +169,11 @@ struct ChangeFeedStorageData : ReferenceCounted { Future updater; NotifiedVersion version; NotifiedVersion desired; - Promise destroyed; UID interfToken; + DatabaseContext* context; + double created; - ~ChangeFeedStorageData() { destroyed.send(Void()); } + ~ChangeFeedStorageData(); }; struct ChangeFeedData : ReferenceCounted { @@ -191,6 +193,7 @@ struct ChangeFeedData : ReferenceCounted { Version endVersion = invalidVersion; Version popVersion = invalidVersion; // like TLog pop version, set by SS and client can check it to see if they missed data + double created = 0; explicit ChangeFeedData(DatabaseContext* context = nullptr); ~ChangeFeedData(); @@ -235,7 +238,7 @@ public: EnableLocalityLoadBalance, TaskPriority taskID = TaskPriority::DefaultEndpoint, LockAware = LockAware::False, - int apiVersion = Database::API_VERSION_LATEST, + int _apiVersion = ApiVersion::LATEST_VERSION, IsSwitchable = IsSwitchable::False); ~DatabaseContext(); @@ -251,7 +254,7 @@ public: enableLocalityLoadBalance, lockAware, internal, - apiVersion, + apiVersion.version(), switchable, defaultTenant)); cx->globalConfig->init(Reference const>(cx->clientInfo), @@ -348,7 +351,7 @@ public: } } - int apiVersionAtLeast(int minVersion) const { return apiVersion < 0 || apiVersion >= minVersion; } + int apiVersionAtLeast(int minVersion) const { return apiVersion.version() >= minVersion; } Future onConnected(); // Returns after a majority of coordination servers are available and have reported a // leader. The cluster file therefore is valid, but the database might be unavailable. @@ -406,7 +409,7 @@ public: EnableLocalityLoadBalance, LockAware, IsInternal = IsInternal::True, - int apiVersion = Database::API_VERSION_LATEST, + int _apiVersion = ApiVersion::LATEST_VERSION, IsSwitchable = IsSwitchable::False, Optional defaultTenant = Optional()); @@ -483,7 +486,7 @@ public: std::unordered_map> tssMetrics; // map from changeFeedId -> changeFeedRange std::unordered_map changeFeedCache; - std::unordered_map> changeFeedUpdaters; + std::unordered_map changeFeedUpdaters; std::map notAtLatestChangeFeeds; Reference getStorageData(StorageServerInterface interf); @@ -547,6 +550,18 @@ public: Counter transactionGrvFullBatches; Counter transactionGrvTimedOutBatches; Counter transactionCommitVersionNotFoundForSS; + Counter bgReadInputBytes; + Counter bgReadOutputBytes; + + // Change Feed metrics. Omit change feed metrics from logging if not used + bool usedAnyChangeFeeds; + CounterCollection ccFeed; + Counter feedStreamStarts; + Counter feedMergeStreamStarts; + Counter feedErrors; + Counter feedNonRetriableErrors; + Counter feedPops; + Counter feedPopsFallback; ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit, bgLatencies, bgGranulesPerRequest; @@ -599,7 +614,7 @@ public: Future statusLeaderMon; double lastStatusFetch; - int apiVersion; + ApiVersion apiVersion; int mvCacheInsertLocation; std::vector>> metadataVersionCache; @@ -646,6 +661,12 @@ public: // Adds or updates the specified (UID, Tag) pair in the tag mapping. void addSSIdTagMapping(const UID& uid, const Tag& tag); + // Returns the latest commit version that mutated the specified storage server. 
+ // @in ssid id of the storage server interface + // @out tag storage server's tag, if an entry exists for "ssid" in "ssidTagMapping" + // @out commitVersion latest commit version that mutated the storage server + void getLatestCommitVersionForSSID(const UID& ssid, Tag& tag, Version& commitVersion); + // Returns the latest commit versions that mutated the specified storage servers /// @note returns the latest commit version for a storage server only if the latest // commit version of that storage server is below the specified "readVersion". @@ -654,6 +675,14 @@ public: Reference info, VersionVector& latestCommitVersions); + // Returns the latest commit version that mutated the specified storage server. + // @note this is a lightweight version of "getLatestCommitVersions()", to be used + // when the state ("TransactionState") of the transaction that fetched the read + // version is not available. + void getLatestCommitVersion(const StorageServerInterface& ssi, + Version readVersion, + VersionVector& latestCommitVersion); + // used in template functions to create a transaction using TransactionT = ReadYourWritesTransaction; Reference createTransaction(); diff --git a/fdbserver/include/fdbserver/EncryptKeyProxyInterface.h b/fdbclient/include/fdbclient/EncryptKeyProxyInterface.h similarity index 95% rename from fdbserver/include/fdbserver/EncryptKeyProxyInterface.h rename to fdbclient/include/fdbclient/EncryptKeyProxyInterface.h index 12178b11ab..5f4d56eb96 100644 --- a/fdbserver/include/fdbserver/EncryptKeyProxyInterface.h +++ b/fdbclient/include/fdbclient/EncryptKeyProxyInterface.h @@ -132,7 +132,7 @@ struct EKPGetBaseCipherKeysByIdsReply { template void serialize(Ar& ar) { - serializer(ar, arena, baseCipherDetails, numHits, error); + serializer(ar, baseCipherDetails, numHits, error, arena); } }; @@ -144,10 +144,10 @@ struct EKPGetBaseCipherKeysRequestInfo { EncryptCipherBaseKeyId baseCipherId; // Encryption domain name - ancillairy metadata information, an encryption key should be uniquely identified by // {domainId, cipherBaseId} tuple - EncryptCipherDomainName domainName; + EncryptCipherDomainNameRef domainName; EKPGetBaseCipherKeysRequestInfo() - : domainId(ENCRYPT_INVALID_DOMAIN_ID), baseCipherId(ENCRYPT_INVALID_CIPHER_KEY_ID) {} + : domainId(INVALID_ENCRYPT_DOMAIN_ID), baseCipherId(INVALID_ENCRYPT_CIPHER_KEY_ID) {} EKPGetBaseCipherKeysRequestInfo(const EncryptCipherDomainId dId, const EncryptCipherBaseKeyId bCId, StringRef name, @@ -176,7 +176,7 @@ struct EKPGetBaseCipherKeysByIdsRequest { template void serialize(Ar& ar) { - serializer(ar, arena, baseCipherInfos, debugId, reply); + serializer(ar, baseCipherInfos, debugId, reply, arena); } }; @@ -193,7 +193,7 @@ struct EKPGetLatestBaseCipherKeysReply { template void serialize(Ar& ar) { - serializer(ar, arena, baseCipherDetails, numHits, error); + serializer(ar, baseCipherDetails, numHits, error, arena); } }; @@ -203,9 +203,9 @@ struct EKPGetLatestCipherKeysRequestInfo { EncryptCipherDomainId domainId; // Encryption domain name - ancillairy metadata information, an encryption key should be uniquely identified by // {domainId, cipherBaseId} tuple - EncryptCipherDomainName domainName; + EncryptCipherDomainNameRef domainName; - EKPGetLatestCipherKeysRequestInfo() : domainId(ENCRYPT_INVALID_DOMAIN_ID) {} + EKPGetLatestCipherKeysRequestInfo() : domainId(INVALID_ENCRYPT_DOMAIN_ID) {} EKPGetLatestCipherKeysRequestInfo(const EncryptCipherDomainId dId, StringRef name, Arena& arena) : domainId(dId), domainName(StringRef(arena, name)) {} @@ -239,7 
+239,7 @@ struct EKPGetLatestBaseCipherKeysRequest { template void serialize(Ar& ar) { - serializer(ar, arena, encryptDomainInfos, debugId, reply); + serializer(ar, encryptDomainInfos, debugId, reply, arena); } }; diff --git a/fdbclient/include/fdbclient/EventTypes.actor.h b/fdbclient/include/fdbclient/EventTypes.actor.h index 39a75e09dc..dc946ce42e 100644 --- a/fdbclient/include/fdbclient/EventTypes.actor.h +++ b/fdbclient/include/fdbclient/EventTypes.actor.h @@ -26,7 +26,7 @@ #define FDBCLIENT_EVENTTYPES_ACTOR_G_H #include "fdbclient/EventTypes.actor.g.h" #elif !defined(FDBCLIENT_EVENTTYPES_ACTOR_H) -#define FDBCLIENT_EVENTTYPESS_ACTOR_H +#define FDBCLIENT_EVENTTYPES_ACTOR_H #include "flow/flow.h" #include "flow/TDMetric.actor.h" diff --git a/fdbclient/include/fdbclient/FDBAWSCredentialsProvider.h b/fdbclient/include/fdbclient/FDBAWSCredentialsProvider.h index 7831f5d792..52ea84fbf3 100644 --- a/fdbclient/include/fdbclient/FDBAWSCredentialsProvider.h +++ b/fdbclient/include/fdbclient/FDBAWSCredentialsProvider.h @@ -18,11 +18,11 @@ * limitations under the License. */ -#if (!defined FDB_AWS_CREDENTIALS_PROVIDER_H) && (defined BUILD_AWS_BACKUP) +#if (!defined FDB_AWS_CREDENTIALS_PROVIDER_H) && (defined WITH_AWS_BACKUP) #define FDB_AWS_CREDENTIALS_PROVIDER_H #pragma once -#ifdef BUILD_AWS_BACKUP +#ifdef WITH_AWS_BACKUP #include "aws/core/Aws.h" #include "aws/core/auth/AWSCredentialsProviderChain.h" diff --git a/fdbclient/include/fdbclient/FDBTypes.h b/fdbclient/include/fdbclient/FDBTypes.h index e1186b1549..596f6be1e2 100644 --- a/fdbclient/include/fdbclient/FDBTypes.h +++ b/fdbclient/include/fdbclient/FDBTypes.h @@ -41,6 +41,7 @@ typedef StringRef KeyRef; typedef StringRef ValueRef; typedef int64_t Generation; typedef UID SpanID; +typedef uint64_t CoordinatorsHash; enum { tagLocalitySpecial = -1, // tag with this locality means it is invalidTag (id=0), txsTag (id=1), or cacheTag (id=2) @@ -331,6 +332,22 @@ struct KeyRangeRef { bool empty() const { return begin == end; } bool singleKeyRange() const { return equalsKeyAfter(begin, end); } + // Return true if it's fully covered by given range list. Note that ranges should be sorted + bool isCovered(std::vector& ranges) { + ASSERT(std::is_sorted(ranges.begin(), ranges.end(), KeyRangeRef::ArbitraryOrder())); + KeyRangeRef clone(begin, end); + for (auto r : ranges) { + if (begin < r.begin) + return false; // uncovered gap between clone.begin and r.begin + if (end <= r.end) + return true; // range is fully covered + if (end > r.begin) + // {clone.begin, r.end} is covered. 
need to check coverage for {r.end, clone.end} + clone = KeyRangeRef(r.end, clone.end); + } + return false; + } + Standalone withPrefix(const StringRef& prefix) const { return KeyRangeRef(begin.withPrefix(prefix), end.withPrefix(prefix)); } @@ -500,10 +517,36 @@ using KeySelector = Standalone; using RangeResult = Standalone; using MappedRangeResult = Standalone; +namespace std { +template <> +struct hash { + static constexpr std::hash hashFunc{}; + std::size_t operator()(KeyRangeRef const& range) const { + std::size_t seed = 0; + boost::hash_combine(seed, hashFunc(range.begin)); + boost::hash_combine(seed, hashFunc(range.end)); + return seed; + } +}; +} // namespace std + +namespace std { +template <> +struct hash { + static constexpr std::hash hashFunc{}; + std::size_t operator()(KeyRangeRef const& range) const { + std::size_t seed = 0; + boost::hash_combine(seed, hashFunc(range.begin)); + boost::hash_combine(seed, hashFunc(range.end)); + return seed; + } +}; +} // namespace std + enum { invalidVersion = -1, latestVersion = -2, MAX_VERSION = std::numeric_limits::max() }; inline Key keyAfter(const KeyRef& key) { - if (key == LiteralStringRef("\xff\xff")) + if (key == "\xff\xff"_sr) return key; Standalone r; @@ -516,7 +559,7 @@ inline Key keyAfter(const KeyRef& key) { return r; } inline KeyRef keyAfter(const KeyRef& key, Arena& arena) { - if (key == LiteralStringRef("\xff\xff")) + if (key == "\xff\xff"_sr) return key; uint8_t* t = new (arena) uint8_t[key.size() + 1]; memcpy(t, key.begin(), key.size()); @@ -931,17 +974,17 @@ struct TLogVersion { } static ErrorOr FromStringRef(StringRef s) { - if (s == LiteralStringRef("2")) + if (s == "2"_sr) return V2; - if (s == LiteralStringRef("3")) + if (s == "3"_sr) return V3; - if (s == LiteralStringRef("4")) + if (s == "4"_sr) return V4; - if (s == LiteralStringRef("5")) + if (s == "5"_sr) return V5; - if (s == LiteralStringRef("6")) + if (s == "6"_sr) return V6; - if (s == LiteralStringRef("7")) + if (s == "7"_sr) return V7; return default_error_or(); } @@ -991,9 +1034,9 @@ struct TLogSpillType { } static ErrorOr FromStringRef(StringRef s) { - if (s == LiteralStringRef("1")) + if (s == "1"_sr) return VALUE; - if (s == LiteralStringRef("2")) + if (s == "2"_sr) return REFERENCE; return default_error_or(); } @@ -1392,6 +1435,60 @@ struct TenantMode { uint32_t mode; }; +struct EncryptionAtRestMode { + // These enumerated values are stored in the database configuration, so can NEVER be changed. Only add new ones + // just before END. 
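The new KeyRangeRef::isCovered() helper added earlier in this hunk walks a sorted list of ranges and reports whether they jointly cover the target range. Below is a minimal standalone sketch of the same sorted-interval coverage walk, using plain std::string keys instead of Arena-backed KeyRef; the names and the explicit "covered frontier" bookkeeping are illustrative, not a verbatim copy of the member function.

#include <cassert>
#include <string>
#include <utility>
#include <vector>

using Range = std::pair<std::string, std::string>; // half-open [first, second)

// Returns true if the sorted ranges jointly cover the target range.
bool isCoveredExample(const Range& target, const std::vector<Range>& sortedRanges) {
    std::string frontier = target.first; // [target.first, frontier) is known to be covered
    for (const auto& r : sortedRanges) {
        if (frontier >= target.second)
            return true; // target already fully covered
        if (r.second <= frontier)
            continue; // range lies entirely behind the covered prefix
        if (r.first > frontier)
            return false; // gap at [frontier, r.first) that later (sorted) ranges cannot fill
        frontier = r.second; // extend the covered prefix
    }
    return frontier >= target.second;
}

int main() {
    assert(isCoveredExample({ "b", "f" }, { { "a", "c" }, { "c", "g" } }));
    assert(!isCoveredExample({ "b", "f" }, { { "a", "c" }, { "d", "g" } })); // hole at [c, d)
}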
+ enum Mode { DISABLED = 0, AES_256_CTR = 1, END = 2 }; + + EncryptionAtRestMode() : mode(DISABLED) {} + EncryptionAtRestMode(Mode mode) : mode(mode) { + if ((uint32_t)mode >= END) { + this->mode = DISABLED; + } + } + operator Mode() const { return Mode(mode); } + + template + void serialize(Ar& ar) { + serializer(ar, mode); + } + + std::string toString() const { + switch (mode) { + case DISABLED: + return "disabled"; + case AES_256_CTR: + return "aes_256_ctr"; + default: + ASSERT(false); + } + return ""; + } + + Value toValue() const { return ValueRef(format("%d", (int)mode)); } + + bool isEquals(const EncryptionAtRestMode& e) const { return this->mode == e.mode; } + + bool operator==(const EncryptionAtRestMode& e) const { return isEquals(e); } + bool operator!=(const EncryptionAtRestMode& e) const { return !isEquals(e); } + + static EncryptionAtRestMode fromValue(Optional val) { + if (!val.present()) { + return DISABLED; + } + + // A failed parsing returns 0 (DISABLED) + int num = atoi(val.get().toString().c_str()); + if (num < 0 || num >= END) { + return DISABLED; + } + + return static_cast(num); + } + + uint32_t mode; +}; + typedef StringRef ClusterNameRef; typedef Standalone ClusterName; @@ -1454,7 +1551,8 @@ struct ReadBlobGranuleContext { int granuleParallelism = 1; }; -// Store metadata associated with each storage server. Now it only contains data be used in perpetual storage wiggle. +// Store metadata associated with each storage server. Now it only contains data be used in perpetual storage +// wiggle. struct StorageMetadataType { constexpr static FileIdentifier file_identifier = 732123; // when the SS is initialized, in epoch seconds, comes from currentTime() @@ -1515,6 +1613,42 @@ struct StorageWiggleValue { } }; +enum class ReadType { + EAGER, + FETCH, + LOW, + NORMAL, + HIGH, +}; + +FDB_DECLARE_BOOLEAN_PARAM(CacheResult); + +// store options for storage engine read +// ReadType describes the usage and priority of the read +// cacheResult determines whether the storage engine cache for this read +// consistencyCheckStartVersion indicates the consistency check which began at this version +// debugID helps to trace the path of the read +struct ReadOptions { + ReadType type; + // Once CacheResult is serializable, change type from bool to CacheResult + bool cacheResult; + Optional debugID; + Optional consistencyCheckStartVersion; + + ReadOptions() : type(ReadType::NORMAL), cacheResult(CacheResult::True){}; + + ReadOptions(Optional debugID, + ReadType type = ReadType::NORMAL, + CacheResult cache = CacheResult::False, + Optional version = Optional()) + : type(type), cacheResult(cache), debugID(debugID), consistencyCheckStartVersion(version){}; + + template + void serialize(Ar& ar) { + serializer(ar, type, cacheResult, debugID, consistencyCheckStartVersion); + } +}; + // Can be used to identify types (e.g. 
IDatabase) that can be used to create transactions with a `createTransaction` // function template @@ -1526,4 +1660,36 @@ struct transaction_creator_traits> : st template constexpr bool is_transaction_creator = transaction_creator_traits::value; +struct Versionstamp { + Version version = invalidVersion; + uint16_t batchNumber = 0; + + bool operator==(const Versionstamp& r) const { return version == r.version && batchNumber == r.batchNumber; } + bool operator!=(const Versionstamp& r) const { return !(*this == r); } + bool operator<(const Versionstamp& r) const { + return version < r.version || (version == r.version && batchNumber < r.batchNumber); + } + bool operator>(const Versionstamp& r) const { return r < *this; } + bool operator<=(const Versionstamp& r) const { return !(*this > r); } + bool operator>=(const Versionstamp& r) const { return !(*this < r); } + + template + void serialize(Ar& ar) { + int64_t beVersion; + int16_t beBatch; + + if constexpr (!Ar::isDeserializing) { + beVersion = bigEndian64(version); + beBatch = bigEndian16(batchNumber); + } + + serializer(ar, beVersion, beBatch); + + if constexpr (Ar::isDeserializing) { + version = bigEndian64(version); + batchNumber = bigEndian16(beBatch); + } + } +}; + #endif diff --git a/fdbclient/include/fdbclient/GenericManagementAPI.actor.h b/fdbclient/include/fdbclient/GenericManagementAPI.actor.h index 4c920f5da6..21ebbd3a3c 100644 --- a/fdbclient/include/fdbclient/GenericManagementAPI.actor.h +++ b/fdbclient/include/fdbclient/GenericManagementAPI.actor.h @@ -70,7 +70,8 @@ enum class ConfigurationResult { SUCCESS_WARN_SHARDED_ROCKSDB_EXPERIMENTAL, DATABASE_CREATED_WARN_ROCKSDB_EXPERIMENTAL, DATABASE_CREATED_WARN_SHARDED_ROCKSDB_EXPERIMENTAL, - DATABASE_IS_REGISTERED + DATABASE_IS_REGISTERED, + ENCRYPTION_AT_REST_MODE_ALREADY_SET }; enum class CoordinatorsResult { @@ -248,7 +249,7 @@ Future> getWorkers(Reference tr, // Accepts a full configuration in key/value format (from buildConfiguration) ACTOR template Future changeConfig(Reference db, std::map m, bool force) { - state StringRef initIdKey = LiteralStringRef("\xff/init_id"); + state StringRef initIdKey = "\xff/init_id"_sr; state Reference tr = db->createTransaction(); if (!m.size()) { @@ -274,6 +275,9 @@ Future changeConfig(Reference db, std::map tooLong = delay(60); @@ -501,8 +505,8 @@ Future changeConfig(Reference db, std::mapatomicOp(databaseLockedKey, BinaryWriter::toValue(locked.get(), Unversioned()) - .withPrefix(LiteralStringRef("0123456789")) - .withSuffix(LiteralStringRef("\x00\x00\x00\x00")), + .withPrefix("0123456789"_sr) + .withSuffix("\x00\x00\x00\x00"_sr), MutationRef::SetVersionstampedValue); } @@ -646,7 +650,7 @@ Future changeConfig(Reference db, std::vector const& modes, Optional const& conf, bool force) { - if (modes.size() && modes[0] == LiteralStringRef("auto") && conf.present()) { + if (modes.size() && modes[0] == "auto"_sr && conf.present()) { return autoConfig(db, conf.get()); } diff --git a/fdbserver/GetEncryptCipherKeys.actor.cpp b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h similarity index 63% rename from fdbserver/GetEncryptCipherKeys.actor.cpp rename to fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h index 03bfe02ecd..0f93675a6a 100644 --- a/fdbserver/GetEncryptCipherKeys.actor.cpp +++ b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h @@ -1,5 +1,5 @@ /* - * GetEncryptCipherKeys.actor.cpp + * GetEncryptCipherKeys.actor.h * * This source file is part of the FoundationDB open source project * @@ -17,20 +17,31 @@ * See the 
License for the specific language governing permissions and * limitations under the License. */ +#pragma once +#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_GETCIPHERKEYS_ACTOR_G_H) +#define FDBCLIENT_GETCIPHERKEYS_ACTOR_G_H +#include "fdbclient/GetEncryptCipherKeys.actor.g.h" +#elif !defined(FDBCLIENT_GETCIPHERKEYS_ACTOR_H) +#define FDBCLIENT_GETCIPHERKEYS_ACTOR_H -#include "fdbserver/EncryptKeyProxyInterface.h" -#include "fdbserver/GetEncryptCipherKeys.h" +#include "fdbclient/BlobCipher.h" +#include "fdbclient/EncryptKeyProxyInterface.h" +#include "fdbrpc/Stats.h" +#include "flow/Knobs.h" #include "flow/IRandom.h" -#include +#include +#include -namespace { +#include "flow/actorcompiler.h" // This must be the last #include. -Optional getEncryptKeyProxyId(const Reference const>& db) { - return db->get().encryptKeyProxy.map([](EncryptKeyProxyInterface proxy) { return proxy.id(); }); +template +Optional getEncryptKeyProxyId(const Reference const>& db) { + return db->get().encryptKeyProxy.template map([](EncryptKeyProxyInterface proxy) { return proxy.id(); }); } -ACTOR Future onEncryptKeyProxyChange(Reference const> db) { +ACTOR template +Future onEncryptKeyProxyChange(Reference const> db) { state Optional previousProxyId = getEncryptKeyProxyId(db); state Optional currentProxyId; loop { @@ -46,9 +57,9 @@ ACTOR Future onEncryptKeyProxyChange(Reference cons return Void(); } -ACTOR Future getUncachedLatestEncryptCipherKeys( - Reference const> db, - EKPGetLatestBaseCipherKeysRequest request) { +ACTOR template +Future getUncachedLatestEncryptCipherKeys(Reference const> db, + EKPGetLatestBaseCipherKeysRequest request) { Optional proxy = db->get().encryptKeyProxy; if (!proxy.present()) { // Wait for onEncryptKeyProxyChange. @@ -73,11 +84,14 @@ ACTOR Future getUncachedLatestEncryptCipherKeys } } -} // anonymous namespace - -ACTOR Future>> getLatestEncryptCipherKeys( - Reference const> db, - std::unordered_map domains) { +// Get latest cipher keys for given encryption domains. It tries to get the cipher keys from local cache. +// In case of cache miss, it fetches the cipher keys from EncryptKeyProxy and put the result in the local cache +// before return. +ACTOR template +Future>> getLatestEncryptCipherKeys( + Reference const> db, + std::unordered_map domains, + BlobCipherMetrics::UsageType usageType) { state Reference cipherKeyCache = BlobCipherKeyCache::getInstance(); state std::unordered_map> cipherKeys; state EKPGetLatestBaseCipherKeysRequest request; @@ -101,6 +115,7 @@ ACTOR Future> return cipherKeys; } // Fetch any uncached cipher keys. + state double startTime = now(); loop choose { when(EKPGetLatestBaseCipherKeysReply reply = wait(getUncachedLatestEncryptCipherKeys(db, request))) { // Insert base cipher keys into cache and construct result. @@ -129,13 +144,30 @@ ACTOR Future> // In case encryptKeyProxy has changed, retry the request. when(wait(onEncryptKeyProxyChange(db))) {} } + double elapsed = now() - startTime; + BlobCipherMetrics::getInstance()->getLatestCipherKeysLatency.addMeasurement(elapsed); + BlobCipherMetrics::counters(usageType).getLatestCipherKeysLatency.addMeasurement(elapsed); return cipherKeys; } -namespace { +// Get latest cipher key for given a encryption domain. It tries to get the cipher key from the local cache. +// In case of cache miss, it fetches the cipher key from EncryptKeyProxy and put the result in the local cache +// before return. 
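getLatestEncryptCipherKeys() above is the cache-first path for obtaining the current cipher keys of a set of encryption domains: hits are served from the BlobCipherKeyCache singleton, misses round-trip to the EncryptKeyProxy, and the elapsed time is recorded in BlobCipherMetrics. A hedged usage sketch follows; it assumes a ServerDBInfo-backed db reference such as server roles hold, and leaves the concrete UsageType up to the calling component.

ACTOR Future<Void> fetchSystemCipherKeysExample(Reference<AsyncVar<ServerDBInfo> const> db,
                                                BlobCipherMetrics::UsageType usageType) {
	// Request the latest key for the system keyspace domain (constants from the encryption headers).
	std::unordered_map<EncryptCipherDomainId, EncryptCipherDomainName> domains = {
		{ SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME }
	};
	// Served from BlobCipherKeyCache when possible; otherwise fetched from the EncryptKeyProxy.
	std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> keys =
	    wait(getLatestEncryptCipherKeys(db, domains, usageType));
	ASSERT(keys.count(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) == 1);
	return Void();
}

The single-domain getLatestEncryptCipherKey() convenience wrapper, declared next, follows the same pattern for one {domainId, domainName} pair.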
+ACTOR template +Future> getLatestEncryptCipherKey(Reference const> db, + EncryptCipherDomainId domainId, + EncryptCipherDomainName domainName, + BlobCipherMetrics::UsageType usageType) { + std::unordered_map domains({ { domainId, domainName } }); + std::unordered_map> cipherKey = + wait(getLatestEncryptCipherKeys(db, domains, usageType)); -ACTOR Future getUncachedEncryptCipherKeys(Reference const> db, - EKPGetBaseCipherKeysByIdsRequest request) { + return cipherKey.at(domainId); +} + +ACTOR template +Future getUncachedEncryptCipherKeys(Reference const> db, + EKPGetBaseCipherKeysByIdsRequest request) { Optional proxy = db->get().encryptKeyProxy; if (!proxy.present()) { // Wait for onEncryptKeyProxyChange. @@ -162,11 +194,14 @@ ACTOR Future getUncachedEncryptCipherKeys(Refere using BaseCipherIndex = std::pair; -} // anonymous namespace - -ACTOR Future>> getEncryptCipherKeys( - Reference const> db, - std::unordered_set cipherDetails) { +// Get cipher keys specified by the list of cipher details. It tries to get the cipher keys from local cache. +// In case of cache miss, it fetches the cipher keys from EncryptKeyProxy and put the result in the local cache +// before return. +ACTOR template +Future>> getEncryptCipherKeys( + Reference const> db, + std::unordered_set cipherDetails, + BlobCipherMetrics::UsageType usageType) { state Reference cipherKeyCache = BlobCipherKeyCache::getInstance(); state std::unordered_map> cipherKeys; state std::unordered_set> uncachedBaseCipherIds; @@ -195,6 +230,7 @@ ACTOR Future>> ge id.first /*domainId*/, id.second /*baseCipherId*/, StringRef() /*domainName*/, request.arena); } // Fetch any uncached cipher keys. + state double startTime = now(); loop choose { when(EKPGetBaseCipherKeysByIdsReply reply = wait(getUncachedEncryptCipherKeys(db, request))) { std::unordered_map> baseCipherKeys; @@ -230,30 +266,49 @@ ACTOR Future>> ge // In case encryptKeyProxy has changed, retry the request. 
when(wait(onEncryptKeyProxyChange(db))) {} } + double elapsed = now() - startTime; + BlobCipherMetrics::getInstance()->getCipherKeysLatency.addMeasurement(elapsed); + BlobCipherMetrics::counters(usageType).getCipherKeysLatency.addMeasurement(elapsed); return cipherKeys; } -ACTOR Future getLatestSystemEncryptCipherKeys(Reference const> db) { - static std::unordered_map domains = { - { SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME }, - { ENCRYPT_HEADER_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME } - }; +struct TextAndHeaderCipherKeys { + Reference cipherTextKey; + Reference cipherHeaderKey; +}; + +ACTOR template +Future getLatestEncryptCipherKeysForDomain(Reference const> db, + EncryptCipherDomainId domainId, + EncryptCipherDomainName domainName, + BlobCipherMetrics::UsageType usageType) { + std::unordered_map domains; + domains[domainId] = domainName; + domains[ENCRYPT_HEADER_DOMAIN_ID] = FDB_ENCRYPT_HEADER_DOMAIN_NAME; std::unordered_map> cipherKeys = - wait(getLatestEncryptCipherKeys(db, domains)); - ASSERT(cipherKeys.count(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) > 0); + wait(getLatestEncryptCipherKeys(db, domains, usageType)); + ASSERT(cipherKeys.count(domainId) > 0); ASSERT(cipherKeys.count(ENCRYPT_HEADER_DOMAIN_ID) > 0); - TextAndHeaderCipherKeys result{ cipherKeys.at(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID), - cipherKeys.at(ENCRYPT_HEADER_DOMAIN_ID) }; + TextAndHeaderCipherKeys result{ cipherKeys.at(domainId), cipherKeys.at(ENCRYPT_HEADER_DOMAIN_ID) }; ASSERT(result.cipherTextKey.isValid()); ASSERT(result.cipherHeaderKey.isValid()); return result; } -ACTOR Future getEncryptCipherKeys(Reference const> db, - BlobCipherEncryptHeader header) { +template +Future getLatestSystemEncryptCipherKeys(const Reference const>& db, + BlobCipherMetrics::UsageType usageType) { + return getLatestEncryptCipherKeysForDomain( + db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME, usageType); +} + +ACTOR template +Future getEncryptCipherKeys(Reference const> db, + BlobCipherEncryptHeader header, + BlobCipherMetrics::UsageType usageType) { std::unordered_set cipherDetails{ header.cipherTextDetails, header.cipherHeaderDetails }; std::unordered_map> cipherKeys = - wait(getEncryptCipherKeys(db, cipherDetails)); + wait(getEncryptCipherKeys(db, cipherDetails, usageType)); ASSERT(cipherKeys.count(header.cipherTextDetails) > 0); ASSERT(cipherKeys.count(header.cipherHeaderDetails) > 0); TextAndHeaderCipherKeys result{ cipherKeys.at(header.cipherTextDetails), @@ -262,3 +317,6 @@ ACTOR Future getEncryptCipherKeys(Reference readVersion, ReadBlobGranuleContext granuleContext) = 0; + virtual ThreadFuture>> readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) = 0; + + virtual ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) = 0; + + virtual ThreadFuture>> + summarizeBlobGranules(const KeyRangeRef& keyRange, Optional summaryVersion, int rangeLimit) = 0; + virtual void atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) = 0; virtual void set(const KeyRef& key, const ValueRef& value) = 0; virtual void clear(const KeyRef& begin, const KeyRef& end) = 0; diff --git a/fdbclient/include/fdbclient/IConfigTransaction.h b/fdbclient/include/fdbclient/IConfigTransaction.h index 9246e4016e..8451e86f21 100644 --- a/fdbclient/include/fdbclient/IConfigTransaction.h +++ 
b/fdbclient/include/fdbclient/IConfigTransaction.h @@ -64,7 +64,13 @@ public: Version* readVersionOut) override { throw client_invalid_operation(); } + Future>> summarizeBlobGranules(KeyRange const& range, + Optional readVersion, + int rangeLimit) override { + throw client_invalid_operation(); + } Future getEstimatedRangeSizeBytes(KeyRange const& keys) override { throw client_invalid_operation(); } + void addGranuleMaterializeStats(const GranuleMaterializeStats& stats) override { throw client_invalid_operation(); } void addReadConflictRange(KeyRangeRef const& keys) override { throw client_invalid_operation(); } void makeSelfConflicting() override { throw client_invalid_operation(); } void atomicOp(KeyRef const& key, ValueRef const& operand, uint32_t operationType) override { diff --git a/fdbclient/include/fdbclient/ISingleThreadTransaction.h b/fdbclient/include/fdbclient/ISingleThreadTransaction.h index 6143ec8605..71a0897693 100644 --- a/fdbclient/include/fdbclient/ISingleThreadTransaction.h +++ b/fdbclient/include/fdbclient/ISingleThreadTransaction.h @@ -85,6 +85,10 @@ public: Version begin, Optional readVersion, Version* readVersionOut = nullptr) = 0; + virtual Future>> summarizeBlobGranules(KeyRange const& range, + Optional summaryVersion, + int rangeLimit) = 0; + virtual void addGranuleMaterializeStats(const GranuleMaterializeStats& stats) = 0; virtual void addReadConflictRange(KeyRangeRef const& keys) = 0; virtual void makeSelfConflicting() = 0; virtual void atomicOp(KeyRef const& key, ValueRef const& operand, uint32_t operationType) = 0; diff --git a/fdbclient/include/fdbclient/KeyBackedTypes.h b/fdbclient/include/fdbclient/KeyBackedTypes.h index a3fee57644..293a90ba07 100644 --- a/fdbclient/include/fdbclient/KeyBackedTypes.h +++ b/fdbclient/include/fdbclient/KeyBackedTypes.h @@ -29,6 +29,7 @@ #include "fdbclient/GenericTransactionHelper.h" #include "fdbclient/Subspace.h" #include "flow/ObjectSerializer.h" +#include "flow/Platform.h" #include "flow/genericactors.actor.h" #include "flow/serialize.h" @@ -305,6 +306,14 @@ public: tr->atomicOp(key, BinaryWriter::toValue(val, Unversioned()), type); } + template + void setVersionstamp(Transaction tr, T const& val, int offset) { + tr->atomicOp( + key, + BinaryWriter::toValue(val, Unversioned()).withSuffix(StringRef(reinterpret_cast(&offset), 4)), + MutationRef::SetVersionstampedValue); + } + template void clear(Transaction tr) { tr->clear(key); diff --git a/fdbclient/include/fdbclient/KeyRangeMap.h b/fdbclient/include/fdbclient/KeyRangeMap.h index 88cce027a8..f88dc72dda 100644 --- a/fdbclient/include/fdbclient/KeyRangeMap.h +++ b/fdbclient/include/fdbclient/KeyRangeMap.h @@ -136,6 +136,16 @@ Future krmGetRanges(Reference const& tr, KeyRange const& keys, int const& limit = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT, int const& limitBytes = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES); +Future krmGetRangesUnaligned(Transaction* const& tr, + Key const& mapPrefix, + KeyRange const& keys, + int const& limit = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT, + int const& limitBytes = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES); +Future krmGetRangesUnaligned(Reference const& tr, + Key const& mapPrefix, + KeyRange const& keys, + int const& limit = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT, + int const& limitBytes = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES); void krmSetPreviouslyEmptyRange(Transaction* tr, const KeyRef& mapPrefix, const KeyRangeRef& keys, @@ -162,7 +172,7 @@ Future krmSetRangeCoalescing(Reference const& t KeyRange const& range, KeyRange const& maxRange, Value const& 
value); -RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv); +RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv, bool align = true); template std::vector> KeyRangeMap::getAffectedRangesAfterInsertion( diff --git a/fdbclient/include/fdbclient/ManagementAPI.actor.h b/fdbclient/include/fdbclient/ManagementAPI.actor.h index 1268185b1b..c0725324c8 100644 --- a/fdbclient/include/fdbclient/ManagementAPI.actor.h +++ b/fdbclient/include/fdbclient/ManagementAPI.actor.h @@ -57,7 +57,8 @@ struct IQuorumChange : ReferenceCounted { // Change to use the given set of coordination servers ACTOR Future> changeQuorumChecker(Transaction* tr, ClusterConnectionString* conn, - std::string newName); + std::string newName, + bool disableConfigDB); ACTOR Future changeQuorum(Database cx, Reference change); Reference autoQuorumChange(int desired = -1); Reference nameQuorumChange(std::string const& name, Reference const& other); diff --git a/fdbclient/include/fdbclient/Metacluster.h b/fdbclient/include/fdbclient/Metacluster.h index 99abed564b..39b876b7ae 100644 --- a/fdbclient/include/fdbclient/Metacluster.h +++ b/fdbclient/include/fdbclient/Metacluster.h @@ -20,7 +20,7 @@ #ifndef FDBCLIENT_METACLUSTER_H #define FDBCLIENT_METACLUSTER_H -#include "CoordinationInterface.h" +#include "fdbclient/CoordinationInterface.h" #include "json_spirit/json_spirit_value.h" #pragma once @@ -53,6 +53,8 @@ struct Traceable : std::true_type { } }; +std::string clusterTypeToString(const ClusterType& clusterType); + // Represents the various states that a data cluster could be in. // // READY - the data cluster is active @@ -98,6 +100,15 @@ struct DataClusterEntry { } }; +struct MetaclusterMetrics { + int numTenants = 0; + int numDataClusters = 0; + int tenantGroupCapacity = 0; + int tenantGroupsAllocated = 0; + + MetaclusterMetrics() = default; +}; + struct MetaclusterRegistrationEntry { constexpr static FileIdentifier file_identifier = 13448589; @@ -180,4 +191,4 @@ struct MetaclusterMetadata { static KeyBackedObjectProperty& metaclusterRegistration(); }; -#endif \ No newline at end of file +#endif diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 71b839e47b..7fce05e810 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -115,6 +115,9 @@ struct ManagementClusterMetadata { static KeyBackedSet clusterTenantGroupIndex; }; +// Helper function to compute metacluster capacity by passing the result of MetaclusterAPI::listClusters +std::pair metaclusterCapacity(std::map const& clusters); + ACTOR Future> openDatabase(ClusterConnectionString connectionString); ACTOR template @@ -1025,8 +1028,9 @@ ACTOR template Future managementClusterRemoveTenantFromGroup(Transaction tr, TenantName tenantName, TenantMapEntry tenantEntry, - DataClusterMetadata* clusterMetadata) { - state bool updateClusterCapacity = !tenantEntry.tenantGroup.present(); + DataClusterMetadata* clusterMetadata, + bool isRenamePair = false) { + state bool updateClusterCapacity = !tenantEntry.tenantGroup.present() && !isRenamePair; if (tenantEntry.tenantGroup.present()) { ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.erase( tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), tenantName)); @@ -1048,7 +1052,7 @@ Future managementClusterRemoveTenantFromGroup(Transaction tr, } // Update the tenant group count information for the assigned cluster if 
this tenant group was erased so we - // can use the freed capacity + // can use the freed capacity. if (updateClusterCapacity) { DataClusterEntry updatedEntry = clusterMetadata->entry; --updatedEntry.allocated.numTenantGroups; @@ -1064,6 +1068,7 @@ Future managementClusterRemoveTenantFromGroup(Transaction tr, template struct CreateTenantImpl { MetaclusterOperationContext ctx; + bool preferAssignedCluster; // Initialization parameters TenantName tenantName; @@ -1072,8 +1077,12 @@ struct CreateTenantImpl { // Parameter set if tenant creation permanently fails on the data cluster Optional replaceExistingTenantId; - CreateTenantImpl(Reference managementDb, TenantName tenantName, TenantMapEntry tenantEntry) - : ctx(managementDb), tenantName(tenantName), tenantEntry(tenantEntry) {} + CreateTenantImpl(Reference managementDb, + bool preferAssignedCluster, + TenantName tenantName, + TenantMapEntry tenantEntry) + : ctx(managementDb), preferAssignedCluster(preferAssignedCluster), tenantName(tenantName), + tenantEntry(tenantEntry) {} ACTOR static Future checkClusterAvailability(Reference dataClusterDb, ClusterName clusterName) { @@ -1106,9 +1115,15 @@ struct CreateTenantImpl { } else if (!self->replaceExistingTenantId.present() || self->replaceExistingTenantId.get() != existingEntry.get().id) { // The tenant creation has already started, so resume where we left off - self->tenantEntry = existingEntry.get(); ASSERT(existingEntry.get().assignedCluster.present()); - + if (self->preferAssignedCluster && + existingEntry.get().assignedCluster.get() != self->tenantEntry.assignedCluster.get()) { + TraceEvent("MetaclusterCreateTenantClusterMismatch") + .detail("Preferred", self->tenantEntry.assignedCluster.get()) + .detail("Actual", existingEntry.get().assignedCluster.get()); + throw invalid_tenant_configuration(); + } + self->tenantEntry = existingEntry.get(); wait(self->ctx.setCluster(tr, existingEntry.get().assignedCluster.get())); return true; } else { @@ -1151,31 +1166,52 @@ struct CreateTenantImpl { if (groupEntry.present()) { ASSERT(groupEntry.get().assignedCluster.present()); + if (self->preferAssignedCluster && + groupEntry.get().assignedCluster.get() != self->tenantEntry.assignedCluster.get()) { + TraceEvent("MetaclusterCreateTenantGroupClusterMismatch") + .detail("TenantGroupCluster", groupEntry.get().assignedCluster.get()) + .detail("SpecifiedCluster", self->tenantEntry.assignedCluster.get()); + throw invalid_tenant_configuration(); + } return std::make_pair(groupEntry.get().assignedCluster.get(), true); } } - // Get a set of the most full clusters that still have capacity - state KeyBackedSet::RangeResultType availableClusters = - wait(ManagementClusterMetadata::clusterCapacityIndex.getRange( - tr, {}, {}, CLIENT_KNOBS->METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK, Snapshot::False, Reverse::True)); - - if (availableClusters.results.empty()) { - throw metacluster_no_capacity(); - } - state std::vector>> dataClusterDbs; - for (auto clusterTuple : availableClusters.results) { - dataClusterDbs.push_back(getAndOpenDatabase(tr, clusterTuple.getString(1))); - } - - wait(waitForAll(dataClusterDbs)); - - // Check the availability of our set of clusters + state std::vector dataClusterNames; state std::vector> clusterAvailabilityChecks; - for (int i = 0; i < availableClusters.results.size(); ++i) { - clusterAvailabilityChecks.push_back( - checkClusterAvailability(dataClusterDbs[i].get(), availableClusters.results[i].getString(1))); + // Get a set of the most full clusters that still have capacity + // If 
preferred cluster is specified, look for that one. + if (self->preferAssignedCluster) { + DataClusterMetadata dataClusterMetadata = + wait(getClusterTransaction(tr, self->tenantEntry.assignedCluster.get())); + if (!dataClusterMetadata.entry.hasCapacity()) { + throw cluster_no_capacity(); + } + dataClusterNames.push_back(self->tenantEntry.assignedCluster.get()); + } else { + state KeyBackedSet::RangeResultType availableClusters = + wait(ManagementClusterMetadata::clusterCapacityIndex.getRange( + tr, + {}, + {}, + CLIENT_KNOBS->METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK, + Snapshot::False, + Reverse::True)); + if (availableClusters.results.empty()) { + throw metacluster_no_capacity(); + } + for (auto clusterTuple : availableClusters.results) { + dataClusterNames.push_back(clusterTuple.getString(1)); + } + } + for (auto dataClusterName : dataClusterNames) { + dataClusterDbs.push_back(getAndOpenDatabase(tr, dataClusterName)); + } + wait(waitForAll(dataClusterDbs)); + // Check the availability of our set of clusters + for (int i = 0; i < dataClusterDbs.size(); ++i) { + clusterAvailabilityChecks.push_back(checkClusterAvailability(dataClusterDbs[i].get(), dataClusterNames[i])); } // Wait for a successful availability check from some cluster. We prefer the most full cluster, but if it @@ -1322,7 +1358,7 @@ struct CreateTenantImpl { ACTOR template Future createTenant(Reference db, TenantName name, TenantMapEntry tenantEntry) { - state CreateTenantImpl impl(db, name, tenantEntry); + state CreateTenantImpl impl(db, tenantEntry.assignedCluster.present(), name, tenantEntry); wait(impl.run()); return Void(); } @@ -1337,6 +1373,9 @@ struct DeleteTenantImpl { // Parameters set in getAssignedLocation int64_t tenantId; + // Parameters set in markTenantInRemovingState + Optional pairName; + DeleteTenantImpl(Reference managementDb, TenantName tenantName) : ctx(managementDb), tenantName(tenantName) {} // Loads the cluster details for the cluster where the tenant is assigned. 
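For tenant creation, the change above means createTenant() derives preferAssignedCluster from whether the caller populated tenantEntry.assignedCluster, so a new tenant can be pinned to a specific data cluster simply by filling in that field; CreateTenantImpl then checks only that cluster for capacity and throws invalid_tenant_configuration if an existing tenant or tenant group is already bound elsewhere. A hedged sketch of such a call, with illustrative database handle, cluster, and tenant names:

ACTOR Future<Void> createPinnedTenantExample(Reference<IDatabase> db) {
	TenantMapEntry entry;
	entry.tenantGroup = TenantGroupName("analytics"_sr); // optional group; must live on the same cluster
	entry.assignedCluster = ClusterName("cluster_west"_sr); // presence of this field pins the assignment
	wait(MetaclusterAPI::createTenant(db, TenantName("orders"_sr), entry));
	return Void();
}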
@@ -1348,8 +1387,18 @@ struct DeleteTenantImpl { throw tenant_not_found(); } - self->tenantId = tenantEntry.get().id; + // Disallow removing the "new" name of a renamed tenant before it completes + if (tenantEntry.get().tenantState == TenantState::RENAMING_TO) { + throw tenant_not_found(); + } + if (tenantEntry.get().tenantState == TenantState::REMOVING) { + if (tenantEntry.get().renamePair.present()) { + self->pairName = tenantEntry.get().renamePair.get(); + } + } + + self->tenantId = tenantEntry.get().id; wait(self->ctx.setCluster(tr, tenantEntry.get().assignedCluster.get())); return tenantEntry.get().tenantState == TenantState::REMOVING; } @@ -1381,14 +1430,34 @@ struct DeleteTenantImpl { state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); if (!tenantEntry.present() || tenantEntry.get().id != self->tenantId) { - // The tenant must have been removed simultaneously - return Void(); + throw tenant_not_found(); } if (tenantEntry.get().tenantState != TenantState::REMOVING) { - TenantMapEntry updatedEntry = tenantEntry.get(); + // Disallow removing the "new" name of a renamed tenant before it completes + if (tenantEntry.get().tenantState == TenantState::RENAMING_TO) { + throw tenant_not_found(); + } + state TenantMapEntry updatedEntry = tenantEntry.get(); + // Check if we are deleting a tenant in the middle of a rename + if (updatedEntry.renamePair.present()) { + ASSERT(updatedEntry.tenantState == TenantState::RENAMING_FROM); + self->pairName = updatedEntry.renamePair.get(); + } updatedEntry.tenantState = TenantState::REMOVING; ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry); + // If this has a rename pair, also mark the other entry for deletion + if (self->pairName.present()) { + state Optional pairEntry = wait(tryGetTenantTransaction(tr, self->pairName.get())); + TenantMapEntry updatedPairEntry = pairEntry.get(); + // Sanity check that our pair has us named as their partner + ASSERT(updatedPairEntry.renamePair.present()); + ASSERT(updatedPairEntry.renamePair.get() == self->tenantName); + ASSERT(updatedPairEntry.id == self->tenantId); + CODE_PROBE(true, "marking pair tenant in removing state"); + updatedPairEntry.tenantState = TenantState::REMOVING; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->pairName.get(), updatedPairEntry); + } } return Void(); @@ -1396,8 +1465,16 @@ struct DeleteTenantImpl { // Delete the tenant and related metadata on the management cluster ACTOR static Future deleteTenantFromManagementCluster(DeleteTenantImpl* self, - Reference tr) { - state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + Reference tr, + bool pairDelete = false) { + // If pair is present, and this is not already a pair delete, call this function recursively + state Future pairFuture = Void(); + if (!pairDelete && self->pairName.present()) { + CODE_PROBE(true, "deleting pair tenant from management cluster"); + pairFuture = deleteTenantFromManagementCluster(self, tr, true); + } + state TenantName tenantName = pairDelete ? 
self->pairName.get() : self->tenantName; + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, tenantName)); if (!tenantEntry.present() || tenantEntry.get().id != self->tenantId) { return Void(); @@ -1406,7 +1483,7 @@ struct DeleteTenantImpl { ASSERT(tenantEntry.get().tenantState == TenantState::REMOVING); // Erase the tenant entry itself - ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, self->tenantName); + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, tenantName); ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, tenantEntry.get().id); // This is idempotent because this function is only called if the tenant is in the map @@ -1416,12 +1493,13 @@ struct DeleteTenantImpl { // Remove the tenant from the cluster -> tenant index ManagementClusterMetadata::clusterTenantIndex.erase( - tr, Tuple::makeTuple(tenantEntry.get().assignedCluster.get(), self->tenantName, tenantEntry.get().id)); + tr, Tuple::makeTuple(tenantEntry.get().assignedCluster.get(), tenantName, self->tenantId)); // Remove the tenant from its tenant group wait(managementClusterRemoveTenantFromGroup( - tr, self->tenantName, tenantEntry.get(), &self->ctx.dataClusterMetadata.get())); + tr, tenantName, tenantEntry.get(), &self->ctx.dataClusterMetadata.get(), pairDelete)); + wait(pairFuture); return Void(); } @@ -1441,10 +1519,17 @@ struct DeleteTenantImpl { // Delete tenant on the data cluster wait(self->ctx.runDataClusterTransaction([self = self](Reference tr) { - return TenantAPI::deleteTenantTransaction( - tr, self->tenantName, self->tenantId, ClusterType::METACLUSTER_DATA); + // If the removed tenant is being renamed, attempt to delete both the old and new names. + // At most one should be present with the given ID, and the other will be a no-op. 
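The data-cluster delete just below uses the flow idiom of composing two Future<Void>s with operator&&: the optional pair delete is started first (or left as an already-ready Void()), then joined with the primary delete so the transaction body completes only after both have been applied. A generic sketch of that idiom, with illustrative function and variable names:

template <class Transaction>
Future<Void> deleteBothNamesExample(Transaction tr,
                                    TenantName primaryName,
                                    Optional<TenantName> renamePair,
                                    int64_t tenantId) {
	Future<Void> pairDelete = Void(); // already-ready future when there is no rename pair
	if (renamePair.present()) {
		pairDelete =
		    TenantAPI::deleteTenantTransaction(tr, renamePair.get(), tenantId, ClusterType::METACLUSTER_DATA);
	}
	// operator&& yields a future that becomes ready once both deletes have completed
	return pairDelete && TenantAPI::deleteTenantTransaction(tr, primaryName, tenantId, ClusterType::METACLUSTER_DATA);
}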
+ Future pairDelete = Void(); + if (self->pairName.present()) { + CODE_PROBE(true, "deleting pair tenant from data cluster"); + pairDelete = TenantAPI::deleteTenantTransaction( + tr, self->pairName.get(), self->tenantId, ClusterType::METACLUSTER_DATA); + } + return pairDelete && TenantAPI::deleteTenantTransaction( + tr, self->tenantName, self->tenantId, ClusterType::METACLUSTER_DATA); })); - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { return deleteTenantFromManagementCluster(self, tr); })); @@ -1527,7 +1612,7 @@ struct ConfigureTenantImpl { // Removing a tenant group is only possible if we have capacity for more groups on the current cluster else if (!desiredGroup.present()) { if (!self->ctx.dataClusterMetadata.get().entry.hasCapacity()) { - throw metacluster_no_capacity(); + throw cluster_no_capacity(); } wait(managementClusterRemoveTenantFromGroup( @@ -1543,7 +1628,7 @@ struct ConfigureTenantImpl { // If we are creating a new tenant group, we need to have capacity on the current cluster if (!tenantGroupEntry.present()) { if (!self->ctx.dataClusterMetadata.get().entry.hasCapacity()) { - throw metacluster_no_capacity(); + throw cluster_no_capacity(); } wait(managementClusterRemoveTenantFromGroup( tr, self->tenantName, tenantEntry, &self->ctx.dataClusterMetadata.get())); @@ -1563,7 +1648,14 @@ struct ConfigureTenantImpl { // We don't currently support movement between groups on different clusters else { - throw cluster_no_capacity(); + TraceEvent("TenantGroupChangeToDifferentCluster") + .detail("Tenant", self->tenantName) + .detail("OriginalGroup", tenantEntry.tenantGroup) + .detail("DesiredGroup", desiredGroup) + .detail("TenantAssignedCluster", tenantEntry.assignedCluster) + .detail("DesiredGroupAssignedCluster", tenantGroupEntry.get().assignedCluster); + + throw invalid_tenant_configuration(); } } @@ -1656,6 +1748,275 @@ Future configureTenant(Reference db, wait(impl.run()); return Void(); } + +template +struct RenameTenantImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + TenantName oldName; + TenantName newName; + + // Parameters set in markTenantsInRenamingState + int64_t tenantId = -1; + int64_t configurationSequenceNum = -1; + + RenameTenantImpl(Reference managementDb, TenantName oldName, TenantName newName) + : ctx(managementDb), oldName(oldName), newName(newName) {} + + // Delete the tenant and related metadata on the management cluster + ACTOR static Future deleteTenantFromManagementCluster(RenameTenantImpl* self, + Reference tr, + TenantMapEntry tenantEntry) { + // Erase the tenant entry itself + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, self->oldName); + + // Remove old tenant from tenant count + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, tenantEntry.assignedCluster.get(), -1, MutationRef::AddValue); + + // Clean up cluster based tenant indices and remove the old entry from its tenant group + // Remove the tenant from the cluster -> tenant index + ManagementClusterMetadata::clusterTenantIndex.erase( + tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), self->oldName, self->tenantId)); + + // Remove the tenant from its tenant group + wait(managementClusterRemoveTenantFromGroup( + tr, self->oldName, tenantEntry, &self->ctx.dataClusterMetadata.get(), true)); + + return Void(); + } + + ACTOR static Future markTenantsInRenamingState(RenameTenantImpl* self, + Reference tr) { + state 
TenantMapEntry oldTenantEntry; + state Optional newTenantEntry; + wait(store(oldTenantEntry, getTenantTransaction(tr, self->oldName)) && + store(newTenantEntry, tryGetTenantTransaction(tr, self->newName))); + + if (self->tenantId != -1 && oldTenantEntry.id != self->tenantId) { + // The tenant must have been removed simultaneously + CODE_PROBE(true, "Metacluster rename old tenant ID mismatch"); + throw tenant_removed(); + } + + // If marked for deletion, abort the rename + if (oldTenantEntry.tenantState == TenantState::REMOVING) { + CODE_PROBE(true, "Metacluster rename candidates marked for deletion"); + throw tenant_removed(); + } + + // If the new entry is present, we can only continue if this is a retry of the same rename + // To check this, verify both entries are in the correct state + // and have each other as pairs + if (newTenantEntry.present()) { + if (newTenantEntry.get().tenantState == TenantState::RENAMING_TO && + oldTenantEntry.tenantState == TenantState::RENAMING_FROM && newTenantEntry.get().renamePair.present() && + newTenantEntry.get().renamePair.get() == self->oldName && oldTenantEntry.renamePair.present() && + oldTenantEntry.renamePair.get() == self->newName) { + wait(self->ctx.setCluster(tr, oldTenantEntry.assignedCluster.get())); + self->tenantId = newTenantEntry.get().id; + self->configurationSequenceNum = newTenantEntry.get().configurationSequenceNum; + CODE_PROBE(true, "Metacluster rename retry in progress"); + return Void(); + } else { + CODE_PROBE(true, "Metacluster rename new name already exists"); + throw tenant_already_exists(); + }; + } else { + if (self->tenantId == -1) { + self->tenantId = oldTenantEntry.id; + } + ++oldTenantEntry.configurationSequenceNum; + self->configurationSequenceNum = oldTenantEntry.configurationSequenceNum; + wait(self->ctx.setCluster(tr, oldTenantEntry.assignedCluster.get())); + if (oldTenantEntry.tenantState != TenantState::READY) { + CODE_PROBE(true, "Metacluster unable to proceed with rename operation"); + throw invalid_tenant_state(); + } + } + + // Check cluster capacity. If we would exceed the amount due to temporary extra tenants + // then we deny the rename request altogether. 
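A minimal sketch of the guard described above: while a rename is in flight both the old and the new entry are counted against the cluster, so the request is refused if that transient extra tenant would push the cluster past the per-cluster limit (knob name as used just below; the helper name is illustrative).

bool renameFitsInClusterExample(int64_t currentClusterTenantCount) {
	// +1 for the transient second entry (the new name) that exists until the rename finishes
	return currentClusterTenantCount + 1 <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER;
}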
+ int64_t clusterTenantCount = wait(ManagementClusterMetadata::clusterTenantCount.getD( + tr, oldTenantEntry.assignedCluster.get(), Snapshot::False, 0)); + + if (clusterTenantCount + 1 > CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER) { + throw cluster_no_capacity(); + } + + TenantMapEntry updatedOldEntry = oldTenantEntry; + TenantMapEntry updatedNewEntry(updatedOldEntry); + ASSERT(updatedOldEntry.configurationSequenceNum == self->configurationSequenceNum); + ASSERT(updatedNewEntry.configurationSequenceNum == self->configurationSequenceNum); + updatedOldEntry.tenantState = TenantState::RENAMING_FROM; + updatedNewEntry.tenantState = TenantState::RENAMING_TO; + updatedOldEntry.renamePair = self->newName; + updatedNewEntry.renamePair = self->oldName; + + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->oldName, updatedOldEntry); + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry); + + // Add temporary tenant to tenantCount to prevent exceeding capacity during a rename + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, updatedNewEntry.assignedCluster.get(), 1, MutationRef::AddValue); + + // Updated indexes to include the new tenant + ManagementClusterMetadata::clusterTenantIndex.insert( + tr, Tuple::makeTuple(updatedNewEntry.assignedCluster.get(), self->newName, self->tenantId)); + + // Add new name to tenant group. It should already exist since the old name was part of it. + managementClusterAddTenantToGroup( + tr, self->newName, updatedNewEntry, &self->ctx.dataClusterMetadata.get(), true); + return Void(); + } + + ACTOR static Future updateDataCluster(RenameTenantImpl* self, Reference tr) { + ASSERT(self->tenantId != -1); + ASSERT(self->configurationSequenceNum != -1); + wait(TenantAPI::renameTenantTransaction(tr, + self->oldName, + self->newName, + self->tenantId, + ClusterType::METACLUSTER_DATA, + self->configurationSequenceNum)); + return Void(); + } + + ACTOR static Future finishRenameFromManagementCluster(RenameTenantImpl* self, + Reference tr) { + state Optional oldTenantEntry; + state Optional newTenantEntry; + wait(store(oldTenantEntry, tryGetTenantTransaction(tr, self->oldName)) && + store(newTenantEntry, tryGetTenantTransaction(tr, self->newName))); + + // Another (or several other) operations have already removed/changed the old entry + // Possible for the new entry to also have been tampered with, + // so it may or may not be present with or without the same id, which are all + // legal states. 
Assume the rename completed properly in this case + if (!oldTenantEntry.present() || oldTenantEntry.get().id != self->tenantId || + oldTenantEntry.get().configurationSequenceNum > self->configurationSequenceNum) { + CODE_PROBE(true, + "Metacluster finished rename with missing entries, mismatched id, and/or mismatched " + "configuration sequence."); + return Void(); + } + if (oldTenantEntry.get().tenantState == TenantState::REMOVING) { + ASSERT(newTenantEntry.get().tenantState == TenantState::REMOVING); + throw tenant_removed(); + } + ASSERT(newTenantEntry.present()); + ASSERT(newTenantEntry.get().id == self->tenantId); + + TenantMapEntry updatedOldEntry = oldTenantEntry.get(); + TenantMapEntry updatedNewEntry = newTenantEntry.get(); + + // Only update if in the expected state + if (updatedNewEntry.tenantState == TenantState::RENAMING_TO) { + updatedNewEntry.tenantState = TenantState::READY; + updatedNewEntry.renamePair.reset(); + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry); + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantId, self->newName); + } + + // We will remove the old entry from the management cluster + // This should still be the same old entry since the tenantId matches from the check above. + wait(deleteTenantFromManagementCluster(self, tr, updatedOldEntry)); + return Void(); + } + + ACTOR static Future run(RenameTenantImpl* self) { + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markTenantsInRenamingState(self, tr); })); + + // Rename tenant on the data cluster + try { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return updateDataCluster(self, tr); })); + } catch (Error& e) { + // Since we track the tenant entries on the management cluster, these error codes should only appear + // on a retry of the transaction, typically caused by commit_unknown_result. + // Operating on the assumption that the first transaction completed successfully, we keep going + // so we can finish the rename on the management cluster. 
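The public entry point for this flow is the renameTenant() wrapper defined a little further below; it drives the three phases in order: mark both entries as renaming on the management cluster, rename on the data cluster, then finish and clean up on the management cluster. A hedged usage sketch (database handle and tenant names are illustrative), relying on the retry tolerance described above so that a commit_unknown_result on the data cluster does not surface to the caller:

ACTOR Future<Void> renameTenantExample(Reference<IDatabase> db) {
	wait(MetaclusterAPI::renameTenant(db, TenantName("orders"_sr), TenantName("orders_v2"_sr)));
	return Void();
}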
+ if (e.code() == error_code_tenant_not_found || e.code() == error_code_tenant_already_exists) { + CODE_PROBE(true, "Metacluster rename ran into commit_unknown_result"); + } else { + throw e; + } + } + + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return finishRenameFromManagementCluster(self, tr); + })); + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future renameTenant(Reference db, TenantName oldName, TenantName newName) { + state RenameTenantImpl impl(db, oldName, newName); + wait(impl.run()); + return Void(); +} + +template +Future> tryGetTenantGroupTransaction(Transaction tr, TenantGroupName name) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + return ManagementClusterMetadata::tenantMetadata().tenantGroupMap.get(tr, name); +} + +ACTOR template +Future> tryGetTenantGroup(Reference db, TenantGroupName name) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + Optional entry = wait(tryGetTenantGroupTransaction(tr, name)); + return entry; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +ACTOR template +Future>> listTenantGroupsTransaction(Transaction tr, + TenantGroupName begin, + TenantGroupName end, + int limit) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + KeyBackedRangeResult> results = + wait(ManagementClusterMetadata::tenantMetadata().tenantGroupMap.getRange(tr, begin, end, limit)); + + return results.results; +} + +ACTOR template +Future>> listTenantGroups(Reference db, + TenantGroupName begin, + TenantGroupName end, + int limit) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + std::vector> tenantGroups = + wait(listTenantGroupsTransaction(tr, begin, end, limit)); + return tenantGroups; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + } // namespace MetaclusterAPI #include "flow/unactorcompiler.h" diff --git a/fdbclient/include/fdbclient/MultiVersionAssignmentVars.h b/fdbclient/include/fdbclient/MultiVersionAssignmentVars.h index 265ce0b3fe..888c8ec01a 100644 --- a/fdbclient/include/fdbclient/MultiVersionAssignmentVars.h +++ b/fdbclient/include/fdbclient/MultiVersionAssignmentVars.h @@ -28,16 +28,24 @@ template class AbortableSingleAssignmentVar final : public ThreadSingleAssignmentVar, public ThreadCallback { public: AbortableSingleAssignmentVar(ThreadFuture future, ThreadFuture abortSignal) - : future(future), abortSignal(abortSignal), hasBeenSet(false), callbacksCleared(false) { + : future(future), abortSignal(abortSignal), hasBeenSet(false), callbacksCleared(true) { int userParam; ThreadSingleAssignmentVar::addref(); ThreadSingleAssignmentVar::addref(); - // abortSignal comes first, because otherwise future could immediately call fire/error and attempt to remove - // this callback from abortSignal prematurely abortSignal.callOrSetAsCallback(this, userParam, 0); future.callOrSetAsCallback(this, userParam, 0); + + // One of the signals could be already fired + // Make sure that the other is cancelled, and the references removed + lock.enter(); + callbacksCleared = false; + bool hasBeenSet_ = hasBeenSet; + lock.leave(); + if (hasBeenSet_) { + cancelCallbacks(); + } } void cancel() override { @@ -104,12 +112,28 @@ private: callbacksCleared = true; lock.leave(); 
- future.getPtr()->addref(); // Cancel will delref our future, but we don't want to destroy it until this - // callback gets destroyed - future.getPtr()->cancel(); + bool notificationRequired = true; + + if (future.clearCallback(this)) { + ThreadSingleAssignmentVar::delref(); + } else { + notificationRequired = false; + future.getPtr()->addref(); // Cancel will delref our future, but we don't want to destroy it until this + // callback gets destroyed + future.getPtr()->cancel(); + } if (abortSignal.clearCallback(this)) { ThreadSingleAssignmentVar::delref(); + } else { + notificationRequired = false; + } + + if (notificationRequired) { + // The future has been cancelled before any of the signals were + // fired. Notify the futures about the cancellation + ASSERT(!hasBeenSet); + ThreadSingleAssignmentVar::sendError(operation_cancelled()); } } else { lock.leave(); diff --git a/fdbclient/include/fdbclient/MultiVersionTransaction.h b/fdbclient/include/fdbclient/MultiVersionTransaction.h index dc4088a57b..f74026d0ac 100644 --- a/fdbclient/include/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/include/fdbclient/MultiVersionTransaction.h @@ -26,6 +26,7 @@ #include "fdbclient/FDBOptions.g.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/IClientApi.h" +#include "flow/ApiVersion.h" #include "flow/ProtocolVersion.h" #include "flow/ThreadHelper.actor.h" @@ -89,6 +90,14 @@ struct FdbCApi : public ThreadSafeReferenceCounted { const void* endKey; int endKeyLength; } FDBKeyRange; + + typedef struct granulesummary { + FDBKeyRange key_range; + int64_t snapshot_version; + int64_t snapshot_size; + int64_t delta_version; + int64_t delta_size; + } FDBGranuleSummary; #pragma pack(pop) typedef struct readgranulecontext { @@ -195,7 +204,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted { int begin_key_name_length, uint8_t const* end_key_name, int end_key_name_length, - Optional version); + int64_t version); // Tenant fdb_error_t (*tenantCreateTransaction)(FDBTenant* tenant, FDBTransaction** outTransaction); @@ -298,21 +307,47 @@ struct FdbCApi : public ThreadSafeReferenceCounted { int end_key_name_length, int64_t chunkSize); - FDBFuture* (*transactionGetBlobGranuleRanges)(FDBTransaction* db, + FDBFuture* (*transactionGetBlobGranuleRanges)(FDBTransaction* tr, uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, int end_key_name_length, int rangeLimit); - FDBResult* (*transactionReadBlobGranules)(FDBTransaction* db, + FDBResult* (*transactionReadBlobGranules)(FDBTransaction* tr, uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, int end_key_name_length, int64_t beginVersion, - int64_t readVersion, - FDBReadBlobGranuleContext granule_context); + int64_t readVersion); + + FDBFuture* (*transactionReadBlobGranulesStart)(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + int64_t* readVersionOut); + + FDBResult* (*transactionReadBlobGranulesFinish)(FDBTransaction* tr, + FDBFuture* startFuture, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + FDBReadBlobGranuleContext* granule_context); + + FDBFuture* (*transactionSummarizeBlobGranules)(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t 
summaryVersion, + int rangeLimit); FDBFuture* (*transactionCommit)(FDBTransaction* tr); fdb_error_t (*transactionGetCommittedVersion)(FDBTransaction* tr, int64_t* outVersion); @@ -333,7 +368,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted { fdb_error_t (*futureGetDatabase)(FDBFuture* f, FDBDatabase** outDb); fdb_error_t (*futureGetInt64)(FDBFuture* f, int64_t* outValue); fdb_error_t (*futureGetUInt64)(FDBFuture* f, uint64_t* outValue); - fdb_error_t (*futureGetBool)(FDBFuture* f, bool* outValue); + fdb_error_t (*futureGetBool)(FDBFuture* f, fdb_bool_t* outValue); fdb_error_t (*futureGetError)(FDBFuture* f); fdb_error_t (*futureGetKey)(FDBFuture* f, uint8_t const** outKey, int* outKeyLength); fdb_error_t (*futureGetValue)(FDBFuture* f, fdb_bool_t* outPresent, uint8_t const** outValue, int* outValueLength); @@ -345,6 +380,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted { FDBMappedKeyValue const** outKVM, int* outCount, fdb_bool_t* outMore); + fdb_error_t (*futureGetGranuleSummaryArray)(FDBFuture* f, const FDBGranuleSummary** out_summaries, int* outCount); fdb_error_t (*futureGetSharedState)(FDBFuture* f, DatabaseSharedState** outPtr); fdb_error_t (*futureSetCallback)(FDBFuture* f, FDBCallback callback, void* callback_parameter); void (*futureCancel)(FDBFuture* f); @@ -411,6 +447,22 @@ public: Optional readVersion, ReadBlobGranuleContext granule_context) override; + ThreadFuture>> readBlobGranulesStart(const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) override; + + ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) override; + + ThreadFuture>> summarizeBlobGranules(const KeyRangeRef& keyRange, + Optional summaryVersion, + int rangeLimit) override; + void addReadConflictRange(const KeyRangeRef& keys) override; void atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) override; @@ -616,6 +668,22 @@ public: Optional readVersion, ReadBlobGranuleContext granule_context) override; + ThreadFuture>> readBlobGranulesStart(const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) override; + + ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) override; + + ThreadFuture>> summarizeBlobGranules(const KeyRangeRef& keyRange, + Optional summaryVersion, + int rangeLimit) override; + void atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) override; void set(const KeyRef& key, const ValueRef& value) override; void clear(const KeyRef& begin, const KeyRef& end) override; @@ -681,6 +749,9 @@ private: template ThreadResult abortableTimeoutResult(ThreadFuture abortSignal); + template + ThreadResult abortableResult(ThreadResult result, ThreadFuture abortSignal); + TransactionInfo transaction; TransactionInfo getTransaction(); @@ -705,17 +776,22 @@ struct ClientInfo : ClientDesc, ThreadSafeReferenceCounted { IClientApi* api; bool failed; std::atomic_bool initialized; + int threadIndex; std::vector> threadCompletionHooks; ClientInfo() - : ClientDesc(std::string(), false, false), protocolVersion(0), api(nullptr), failed(true), initialized(false) {} + : ClientDesc(std::string(), false, false), protocolVersion(0), api(nullptr), failed(true), initialized(false), + threadIndex(0) {} 
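// Reviewer note: threadIndex appears to record which local client thread this external-client copy is
// bound to; together with the new getTraceFileIdentifier() below and the traceShareBaseNameAmongThreads
// flag in MultiVersionApi, it presumably keeps per-thread trace files distinguishable when
// client-threads-per-version is greater than one.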
ClientInfo(IClientApi* api) - : ClientDesc("internal", false, false), protocolVersion(0), api(api), failed(false), initialized(false) {} - ClientInfo(IClientApi* api, std::string libPath, bool useFutureVersion) - : ClientDesc(libPath, true, useFutureVersion), protocolVersion(0), api(api), failed(false), initialized(false) {} + : ClientDesc("internal", false, false), protocolVersion(0), api(api), failed(false), initialized(false), + threadIndex(0) {} + ClientInfo(IClientApi* api, std::string libPath, bool useFutureVersion, int threadIndex) + : ClientDesc(libPath, true, useFutureVersion), protocolVersion(0), api(api), failed(false), initialized(false), + threadIndex(threadIndex) {} void loadVersion(); bool canReplace(Reference other) const; + std::string getTraceFileIdentifier(const std::string& baseIdentifier); }; class MultiVersionApi; @@ -1003,7 +1079,7 @@ public: }; std::map clusterSharedStateMap; - static bool apiVersionAtLeast(int minVersion); + ApiVersion getApiVersion() { return apiVersion; } private: MultiVersionApi(); @@ -1028,13 +1104,16 @@ private: bool networkStartSetup; volatile bool networkSetup; + bool disableBypass; volatile bool bypassMultiClientApi; volatile bool externalClient; - int apiVersion; + ApiVersion apiVersion; int nextThread = 0; int threadCount; std::string tmpDir; + bool traceShareBaseNameAmongThreads; + std::string traceFileIdentifier; Mutex lock; std::vector>>> options; diff --git a/fdbclient/include/fdbclient/NativeAPI.actor.h b/fdbclient/include/fdbclient/NativeAPI.actor.h index 79c1aa09fb..3931182ab0 100644 --- a/fdbclient/include/fdbclient/NativeAPI.actor.h +++ b/fdbclient/include/fdbclient/NativeAPI.actor.h @@ -82,8 +82,6 @@ struct NetworkOptions { class Database { public: - enum { API_VERSION_LATEST = -1 }; - // Creates a database object that represents a connection to a cluster // This constructor uses a preallocated DatabaseContext that may have been created // on another thread @@ -98,6 +96,9 @@ public: IsInternal internal = IsInternal::True, LocalityData const& clientLocality = LocalityData()); + static Database createSimulatedExtraDatabase(std::string connectionString, + Optional defaultTenant = Optional()); + Database() {} // an uninitialized database can be destructed or reassigned safely; that's it void operator=(Database const& rhs) { db = rhs.db; } Database(Database const& rhs) : db(rhs.db) {} @@ -242,8 +243,8 @@ struct TransactionState : ReferenceCounted { Optional> authToken; Reference trLogInfo; TransactionOptions options; + Optional readOptions; - Optional debugID; TaskPriority taskID; SpanContext spanContext; UseProvisionalProxies useProvisionalProxies = UseProvisionalProxies::False; @@ -365,19 +366,19 @@ private: public: // A method for streaming data from the storage server that is more efficient than getRange when reading large // amounts of data - [[nodiscard]] Future getRangeStream(const PromiseStream>& results, + [[nodiscard]] Future getRangeStream(PromiseStream>& results, const KeySelector& begin, const KeySelector& end, int limit, Snapshot = Snapshot::False, Reverse = Reverse::False); - [[nodiscard]] Future getRangeStream(const PromiseStream>& results, + [[nodiscard]] Future getRangeStream(PromiseStream>& results, const KeySelector& begin, const KeySelector& end, GetRangeLimits limits, Snapshot = Snapshot::False, Reverse = Reverse::False); - [[nodiscard]] Future getRangeStream(const PromiseStream>& results, + [[nodiscard]] Future getRangeStream(PromiseStream>& results, const KeyRange& keys, int limit, Snapshot snapshot = 
Snapshot::False, @@ -389,7 +390,7 @@ public: snapshot, reverse); } - [[nodiscard]] Future getRangeStream(const PromiseStream>& results, + [[nodiscard]] Future getRangeStream(PromiseStream>& results, const KeyRange& keys, GetRangeLimits limits, Snapshot snapshot = Snapshot::False, @@ -421,6 +422,12 @@ public: Optional readVersion, Version* readVersionOut = nullptr); + Future>> summarizeBlobGranules(const KeyRange& range, + Optional summaryVersion, + int rangeLimit); + + void addGranuleMaterializeStats(const GranuleMaterializeStats& stats); + // If checkWriteConflictRanges is true, existing write conflict ranges will be searched for this key void set(const KeyRef& key, const ValueRef& value, AddConflictRange = AddConflictRange::True); void atomicOp(const KeyRef& key, @@ -455,7 +462,13 @@ public: void fullReset(); double getBackoff(int errCode); - void debugTransaction(UID dID) { trState->debugID = dID; } + void debugTransaction(UID dID) { + if (trState->readOptions.present()) { + trState->readOptions.get().debugID = dID; + } else { + trState->readOptions = ReadOptions(dID); + } + } VersionVector getVersionVector() const; SpanContext getSpanContext() const { return trState->spanContext; } @@ -550,8 +563,9 @@ ACTOR Future> getCheckpointMetaData(Database cx, // Checks with Data Distributor that it is safe to mark all servers in exclusions as failed ACTOR Future checkSafeExclusions(Database cx, std::vector exclusions); +// Round up to the nearest page size inline uint64_t getWriteOperationCost(uint64_t bytes) { - return bytes / std::max(1, CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR) + 1; + return (bytes - 1) / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR + 1; } // Create a transaction to set the value of system key \xff/conf/perpetual_storage_wiggle. If enable == true, the value diff --git a/fdbclient/include/fdbclient/ReadYourWrites.h b/fdbclient/include/fdbclient/ReadYourWrites.h index 46650be3d3..659f7768ae 100644 --- a/fdbclient/include/fdbclient/ReadYourWrites.h +++ b/fdbclient/include/fdbclient/ReadYourWrites.h @@ -20,7 +20,7 @@ #ifndef FDBCLIENT_READYOURWRITES_H #define FDBCLIENT_READYOURWRITES_H -#include "Status.h" +#include "fdbclient/Status.h" #pragma once #include "fdbclient/NativeAPI.actor.h" @@ -127,6 +127,11 @@ public: Optional readVersion, Version* readVersionOut) override; + Future>> summarizeBlobGranules(const KeyRange& range, + Optional summaryVersion, + int rangeLimit) override; + void addGranuleMaterializeStats(const GranuleMaterializeStats& stats) override; + void addReadConflictRange(KeyRangeRef const& keys) override; void makeSelfConflicting() override { tr.makeSelfConflicting(); } diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index bba13eb7bc..4be933d833 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -156,9 +156,14 @@ public: int PRIORITY_TEAM_FAILED; // Priority when a server in the team is excluded as failed int PRIORITY_TEAM_0_LEFT; int PRIORITY_SPLIT_SHARD; + int PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD; // Priority when a physical shard is oversize or anonymous // Data distribution bool SHARD_ENCODE_LOCATION_METADATA; // If true, location metadata will contain shard ID. + bool ENABLE_DD_PHYSICAL_SHARD; // EXPERIMENTAL; If true, SHARD_ENCODE_LOCATION_METADATA must be true. 
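// Reviewer note on getWriteOperationCost() above: the new expression is integer ceiling division,
// assuming bytes >= 1 and WRITE_COST_BYTE_FACTOR >= 1. With a hypothetical factor of 16384:
//   bytes = 1     -> (1 - 1) / 16384 + 1     = 1 unit
//   bytes = 16384 -> (16384 - 1) / 16384 + 1 = 1 unit   (the old formula charged 2 for an exact multiple)
//   bytes = 16385 -> (16385 - 1) / 16384 + 1 = 2 units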
+ int64_t MAX_PHYSICAL_SHARD_BYTES; + double PHYSICAL_SHARD_METRICS_DELAY; + double ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME; double READ_REBALANCE_CPU_THRESHOLD; // read rebalance only happens if the source servers' CPU > threshold int READ_REBALANCE_SRC_PARALLELISM; // the max count a server become a source server within a certain interval @@ -251,9 +256,8 @@ public: // Run storage enginee on a child process on the same machine with storage process bool REMOTE_KV_STORE; - // A delay to avoid race on file resources if the new kv store process started immediately after the previous kv - // store process died - double REMOTE_KV_STORE_INIT_DELAY; + // A delay to avoid race on file resources after seeing lock_file_failure + double REBOOT_KV_STORE_DELAY; // max waiting time for the remote kv store to initialize double REMOTE_KV_STORE_MAX_INIT_DURATION; @@ -298,9 +302,15 @@ public: int64_t REPLACE_CONTENTS_BYTES; // KeyValueStoreRocksDB + int ROCKSDB_READER_THREAD_PRIORITY; + int ROCKSDB_WRITER_THREAD_PRIORITY; + bool ROCKSDB_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES; + int ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE; + int ROCKSDB_READ_RANGE_ROW_LIMIT; int ROCKSDB_BACKGROUND_PARALLELISM; int ROCKSDB_READ_PARALLELISM; int64_t ROCKSDB_MEMTABLE_BYTES; + bool ROCKSDB_LEVEL_STYLE_COMPACTION; bool ROCKSDB_UNSAFE_AUTO_FSYNC; int64_t ROCKSDB_PERIODIC_COMPACTION_SECONDS; int ROCKSDB_PREFIX_LEN; @@ -322,6 +332,7 @@ public: int64_t ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC; bool ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE; std::string DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY; + bool ROCKSDB_DISABLE_AUTO_COMPACTIONS; bool ROCKSDB_PERFCONTEXT_ENABLE; // Enable rocks perf context metrics. May cause performance overhead double ROCKSDB_PERFCONTEXT_SAMPLE_RATE; double ROCKSDB_METRICS_SAMPLE_INTERVAL; @@ -331,13 +342,16 @@ public: int64_t ROCKSDB_CAN_COMMIT_COMPACT_BYTES_LIMIT; int ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD; int ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD; + bool ROCKSDB_DISABLE_WAL_EXPERIMENTAL; int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE; int64_t ROCKSDB_BLOCK_SIZE; bool ENABLE_SHARDED_ROCKSDB; int64_t ROCKSDB_WRITE_BUFFER_SIZE; + int64_t ROCKSDB_CF_WRITE_BUFFER_SIZE; int64_t ROCKSDB_MAX_TOTAL_WAL_SIZE; int64_t ROCKSDB_MAX_BACKGROUND_JOBS; int64_t ROCKSDB_DELETE_OBSOLETE_FILE_PERIOD; + double ROCKSDB_PHYSICAL_SHARD_CLEAN_UP_DELAY; // Leader election int MAX_NOTIFICATIONS; @@ -450,6 +464,7 @@ public: double ATTEMPT_RECRUITMENT_DELAY; double WAIT_FOR_DISTRIBUTOR_JOIN_DELAY; double WAIT_FOR_RATEKEEPER_JOIN_DELAY; + double WAIT_FOR_CONSISTENCYSCAN_JOIN_DELAY; double WAIT_FOR_BLOB_MANAGER_JOIN_DELAY; double WAIT_FOR_ENCRYPT_KEY_PROXY_JOIN_DELAY; double WORKER_FAILURE_TIME; @@ -463,6 +478,7 @@ public: double CHECK_REMOTE_HEALTH_INTERVAL; // Remote DC health refresh interval. 
double FORCE_RECOVERY_CHECK_DELAY; double RATEKEEPER_FAILURE_TIME; + double CONSISTENCYSCAN_FAILURE_TIME; double BLOB_MANAGER_FAILURE_TIME; double REPLACE_INTERFACE_DELAY; double REPLACE_INTERFACE_CHECK_DELAY; @@ -551,6 +567,8 @@ public: bool RATEKEEPER_PRINT_LIMIT_REASON; double RATEKEEPER_MIN_RATE; double RATEKEEPER_MAX_RATE; + double RATEKEEPER_BATCH_MIN_RATE; + double RATEKEEPER_BATCH_MAX_RATE; int64_t TARGET_BYTES_PER_STORAGE_SERVER; int64_t SPRING_BYTES_STORAGE_SERVER; @@ -559,9 +577,12 @@ public: int64_t SPRING_BYTES_STORAGE_SERVER_BATCH; int64_t STORAGE_HARD_LIMIT_BYTES; int64_t STORAGE_HARD_LIMIT_BYTES_OVERAGE; + int64_t STORAGE_HARD_LIMIT_BYTES_SPEED_UP_SIM; + int64_t STORAGE_HARD_LIMIT_BYTES_OVERAGE_SPEED_UP_SIM; int64_t STORAGE_HARD_LIMIT_VERSION_OVERAGE; int64_t STORAGE_DURABILITY_LAG_HARD_MAX; int64_t STORAGE_DURABILITY_LAG_SOFT_MAX; + bool STORAGE_INCLUDE_FEED_STORAGE_QUEUE; int64_t LOW_PRIORITY_STORAGE_QUEUE_BYTES; int64_t LOW_PRIORITY_DURABILITY_LAG; @@ -600,7 +621,8 @@ public: double GLOBAL_TAG_THROTTLING_MIN_RATE; // Used by global tag throttling counters double GLOBAL_TAG_THROTTLING_FOLDING_TIME; - double GLOBAL_TAG_THROTTLING_TRACE_INTERVAL; + // Cost multiplier for writes (because write operations are more expensive than reads) + double GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO; double MAX_TRANSACTIONS_PER_BYTE; @@ -634,6 +656,8 @@ public: double BW_LAG_DECREASE_AMOUNT; double BW_FETCH_WORKERS_INTERVAL; double BW_RW_LOGGING_INTERVAL; + double BW_MAX_BLOCKED_INTERVAL; + double BW_RK_SIM_QUIESCE_DELAY; // disk snapshot int64_t MAX_FORKED_PROCESS_OUTPUT; @@ -672,12 +696,14 @@ public: int STORAGE_LIMIT_BYTES; int BUGGIFY_LIMIT_BYTES; bool FETCH_USING_STREAMING; + bool FETCH_USING_BLOB; int FETCH_BLOCK_BYTES; int FETCH_KEYS_PARALLELISM_BYTES; int FETCH_KEYS_PARALLELISM; int FETCH_KEYS_PARALLELISM_FULL; int FETCH_KEYS_LOWER_PRIORITY; int SERVE_FETCH_CHECKPOINT_PARALLELISM; + int CHANGE_FEED_DISK_READS_PARALLELISM; int BUGGIFY_BLOCK_BYTES; int64_t STORAGE_RECOVERY_VERSION_LAG_LIMIT; double STORAGE_DURABILITY_LAG_REJECT_THRESHOLD; @@ -716,7 +742,7 @@ public: int CHECKPOINT_TRANSFER_BLOCK_BYTES; int QUICK_GET_KEY_VALUES_LIMIT; int QUICK_GET_KEY_VALUES_LIMIT_BYTES; - bool STORAGE_SERVER_SHARD_AWARE; + int STORAGE_FEED_QUERY_HARD_LIMIT; // Wait Failure int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS; @@ -750,6 +776,12 @@ public: bool WORKER_HEALTH_REPORT_RECENT_DESTROYED_PEER; // When enabled, the worker's health monitor also report any recent // destroyed peers who are part of the transaction system to // cluster controller. + bool STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT; // When enabled, storage server's worker will crash on io_timeout error; + // this allows fdbmonitor to restart the worker and recreate the same SS. + // When SS can be temporarily throttled by infrastructure, e.g, k8s, + // Enabling this can reduce toil of manually restarting the SS. + // Enable with caution: If io_timeout is caused by disk failure, we won't + // want to restart the SS, which increases risk of data corruption. // Test harness double WORKER_POLL_DELAY; @@ -763,6 +795,7 @@ public: // Dynamic Knobs (implementation) double COMPACTION_INTERVAL; + double BROADCASTER_SELF_UPDATE_DELAY; double GET_COMMITTED_VERSION_TIMEOUT; double GET_SNAPSHOT_AND_CHANGES_TIMEOUT; double FETCH_CHANGES_TIMEOUT; @@ -857,6 +890,7 @@ public: double REDWOOD_HISTOGRAM_INTERVAL; bool REDWOOD_EVICT_UPDATED_PAGES; // Whether to prioritize eviction of updated pages from cache. 
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches + bool REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT; // Whether to split pages by tenant if encryption is enabled // Server request latency measurement int LATENCY_SAMPLE_SIZE; @@ -871,6 +905,7 @@ public: int SIM_KMS_MAX_KEYS; int ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH; bool ENABLE_TLOG_ENCRYPTION; + bool ENABLE_STORAGE_SERVER_ENCRYPTION; // Currently only Redwood engine supports encryption bool ENABLE_BLOB_GRANULE_ENCRYPTION; // Compression @@ -912,6 +947,7 @@ public: double BLOB_WORKER_BATCH_GRV_INTERVAL; bool BLOB_WORKER_DO_REJECT_WHEN_FULL; double BLOB_WORKER_REJECT_WHEN_FULL_THRESHOLD; + double BLOB_WORKER_FORCE_FLUSH_CLEANUP_DELAY; double BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN; double BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX; @@ -919,6 +955,9 @@ public: int BLOB_MANAGER_CONCURRENT_MERGE_CHECKS; double BGCC_TIMEOUT; double BGCC_MIN_INTERVAL; + bool BLOB_MANIFEST_BACKUP; + double BLOB_MANIFEST_BACKUP_INTERVAL; + bool BLOB_FULL_RESTORE_MODE; // Blob metadata int64_t BLOB_METADATA_CACHE_TTL; diff --git a/fdbclient/include/fdbclient/SpecialKeySpace.actor.h b/fdbclient/include/fdbclient/SpecialKeySpace.actor.h index 75cae1fc47..d2ce7f5cf9 100644 --- a/fdbclient/include/fdbclient/SpecialKeySpace.actor.h +++ b/fdbclient/include/fdbclient/SpecialKeySpace.actor.h @@ -60,6 +60,8 @@ public: // TODO : give this function a more descriptive name virtual bool isAsync() const { return false; } + virtual bool supportsTenants() const { return false; } + virtual ~SpecialKeyRangeReadImpl() {} protected: @@ -301,6 +303,7 @@ public: Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const override; + bool supportsTenants() const override { return true; }; }; class ReadConflictRangeImpl : public SpecialKeyRangeReadImpl { @@ -309,6 +312,7 @@ public: Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const override; + bool supportsTenants() const override { return true; }; }; class WriteConflictRangeImpl : public SpecialKeyRangeReadImpl { @@ -317,6 +321,7 @@ public: Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const override; + bool supportsTenants() const override { return true; }; }; class DDStatsRangeImpl : public SpecialKeyRangeAsyncImpl { diff --git a/fdbclient/include/fdbclient/StorageServerInterface.h b/fdbclient/include/fdbclient/StorageServerInterface.h index 2d56db7643..ce24d909a9 100644 --- a/fdbclient/include/fdbclient/StorageServerInterface.h +++ b/fdbclient/include/fdbclient/StorageServerInterface.h @@ -296,28 +296,28 @@ struct GetValueRequest : TimedRequest { Key key; Version version; Optional tags; - Optional debugID; ReplyPromise reply; + Optional options; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key + GetValueRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } - GetValueRequest() {} GetValueRequest(SpanContext spanContext, const TenantInfo& tenantInfo, const Key& key, Version ver, Optional tags, - Optional debugID, + Optional options, VersionVector latestCommitVersions) - : spanContext(spanContext), tenantInfo(tenantInfo), key(key), version(ver), tags(tags), debugID(debugID), + : spanContext(spanContext), tenantInfo(tenantInfo), key(key), version(ver), tags(tags), options(options), ssLatestCommitVersions(latestCommitVersions) {} template void 
serialize(Ar& ar) { - serializer(ar, key, version, tags, debugID, reply, spanContext, tenantInfo, ssLatestCommitVersions); + serializer(ar, key, version, tags, reply, spanContext, tenantInfo, options, ssLatestCommitVersions); } }; @@ -393,15 +393,14 @@ struct GetKeyValuesRequest : TimedRequest { KeyRef mapper = KeyRef(); Version version; // or latestVersion int limit, limitBytes; - bool isFetchKeys; Optional tags; - Optional debugID; + Optional options; ReplyPromise reply; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key - GetKeyValuesRequest() : isFetchKeys(false) {} + GetKeyValuesRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } @@ -413,14 +412,13 @@ struct GetKeyValuesRequest : TimedRequest { version, limit, limitBytes, - isFetchKeys, tags, - debugID, reply, spanContext, tenantInfo, - arena, - ssLatestCommitVersions); + options, + ssLatestCommitVersions, + arena); } }; @@ -452,15 +450,14 @@ struct GetMappedKeyValuesRequest : TimedRequest { Version version; // or latestVersion int limit, limitBytes; int matchIndex; - bool isFetchKeys; Optional tags; - Optional debugID; + Optional options; ReplyPromise reply; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key range - GetMappedKeyValuesRequest() : isFetchKeys(false) {} + GetMappedKeyValuesRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } @@ -473,15 +470,14 @@ struct GetMappedKeyValuesRequest : TimedRequest { version, limit, limitBytes, - isFetchKeys, tags, - debugID, reply, spanContext, tenantInfo, - arena, + options, ssLatestCommitVersions, - matchIndex); + matchIndex, + arena); } }; @@ -520,15 +516,14 @@ struct GetKeyValuesStreamRequest { KeySelectorRef begin, end; Version version; // or latestVersion int limit, limitBytes; - bool isFetchKeys; Optional tags; - Optional debugID; + Optional options; ReplyPromiseStream reply; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key range - GetKeyValuesStreamRequest() : isFetchKeys(false) {} + GetKeyValuesStreamRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } @@ -540,14 +535,13 @@ struct GetKeyValuesStreamRequest { version, limit, limitBytes, - isFetchKeys, tags, - debugID, reply, spanContext, tenantInfo, - arena, - ssLatestCommitVersions); + options, + ssLatestCommitVersions, + arena); } }; @@ -573,29 +567,29 @@ struct GetKeyRequest : TimedRequest { KeySelectorRef sel; Version version; // or latestVersion Optional tags; - Optional debugID; ReplyPromise reply; + Optional options; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key - bool verify() const { return tenantInfo.isAuthorized(); } - GetKeyRequest() {} + bool verify() const { return tenantInfo.isAuthorized(); } + GetKeyRequest(SpanContext spanContext, TenantInfo tenantInfo, KeySelectorRef const& sel, Version version, Optional tags, - Optional debugID, + Optional options, VersionVector latestCommitVersions) - : spanContext(spanContext), tenantInfo(tenantInfo), sel(sel), version(version), debugID(debugID), + : spanContext(spanContext), tenantInfo(tenantInfo), sel(sel), version(version), tags(tags), options(options), 
ssLatestCommitVersions(latestCommitVersions) {} template void serialize(Ar& ar) { - serializer(ar, sel, version, tags, debugID, reply, spanContext, tenantInfo, arena, ssLatestCommitVersions); + serializer(ar, sel, version, tags, reply, spanContext, tenantInfo, options, ssLatestCommitVersions, arena); } }; @@ -771,7 +765,7 @@ struct SplitMetricsRequest { template void serialize(Ar& ar) { - serializer(ar, keys, limits, used, estimated, isLastShard, reply, arena, minSplitBytes); + serializer(ar, keys, limits, used, estimated, isLastShard, reply, minSplitBytes, arena); } }; @@ -1051,7 +1045,7 @@ struct OverlappingChangeFeedsReply { template void serialize(Ar& ar) { - serializer(ar, feeds, arena, feedMetadataVersion); + serializer(ar, feeds, feedMetadataVersion, arena); } }; diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index b41809691e..068d1d9d37 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -28,7 +28,7 @@ #include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/BlobWorkerInterface.h" // TODO move the functions that depend on this out of here and into BlobWorkerInterface.h to remove this depdendency #include "fdbclient/StorageServerInterface.h" -#include "Tenant.h" +#include "fdbclient/Tenant.h" // Don't warn on constants being defined in this file. #pragma clang diagnostic push @@ -163,6 +163,9 @@ extern const KeyRef cacheChangePrefix; const Key cacheChangeKeyFor(uint16_t idx); uint16_t cacheChangeKeyDecodeIndex(const KeyRef& key); +// For persisting the consistency scan configuration and metrics +extern const KeyRef consistencyScanInfoKey; + // "\xff/tss/[[serverId]]" := "[[tssId]]" extern const KeyRangeRef tssMappingKeys; @@ -273,6 +276,9 @@ extern const KeyRef perpetualStorageWiggleStatsPrefix; // Change the value of this key to anything and that will trigger detailed data distribution team info log. extern const KeyRef triggerDDTeamInfoPrintKey; +// Encryption data at-rest config key +extern const KeyRef encryptionAtRestModeConfKey; + // The differences between excluded and failed can be found in "command-line-interface.rst" // and in the help message of the fdbcli command "exclude". @@ -374,6 +380,12 @@ std::vector> decodeBackupStartedValue(const ValueRef& va // 1 = Send a signal to pause/already paused. extern const KeyRef backupPausedKey; +// "\xff/previousCoordinators" = "[[ClusterConnectionString]]" +// Set to the encoded structure of the cluster's previous set of coordinators. +// Changed when performing quorumChange. +// See "CoordinationInterface.h" struct ClusterConnectionString for more details +extern const KeyRef previousCoordinatorsKey; + // "\xff/coordinators" = "[[ClusterConnectionString]]" // Set to the encoded structure of the cluster's current set of coordinators. // Changed when performing quorumChange. 
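A minimal sketch of how the reshaped storage-server read requests above would be populated, assuming an Optional<UID> debugID in caller scope; the ReadOptions(debugID) constructor and the GetValueRequest parameter order come from the hunks above, everything else is illustrative:

// Illustrative only: the per-request Optional<UID> debugID is now carried inside ReadOptions.
Optional<ReadOptions> options;
if (debugID.present()) {
    options = ReadOptions(debugID.get()); // same pattern as Transaction::debugTransaction above
}
GetValueRequest req(spanContext, tenantInfo, key, version, tags, options, ssLatestCommitVersions);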
diff --git a/fdbclient/include/fdbclient/TagThrottle.actor.h b/fdbclient/include/fdbclient/TagThrottle.actor.h index 020fcea568..cf4ef8fd16 100644 --- a/fdbclient/include/fdbclient/TagThrottle.actor.h +++ b/fdbclient/include/fdbclient/TagThrottle.actor.h @@ -264,9 +264,9 @@ Future getValidAutoEnabled(Reference tr) { tr->reset(); wait(delay(CLIENT_KNOBS->DEFAULT_BACKOFF)); continue; - } else if (value.get() == LiteralStringRef("1")) { + } else if (value.get() == "1"_sr) { result = true; - } else if (value.get() == LiteralStringRef("0")) { + } else if (value.get() == "0"_sr) { result = false; } else { TraceEvent(SevWarnAlways, "InvalidAutoTagThrottlingValue").detail("Value", value.get()); @@ -331,8 +331,7 @@ getThrottledTags(Reference db, int limit, ContainsRecommended containsRecomm template void signalThrottleChange(Reference tr) { - tr->atomicOp( - tagThrottleSignalKey, LiteralStringRef("XXXXXXXXXX\x00\x00\x00\x00"), MutationRef::SetVersionstampedValue); + tr->atomicOp(tagThrottleSignalKey, "XXXXXXXXXX\x00\x00\x00\x00"_sr, MutationRef::SetVersionstampedValue); } ACTOR template @@ -583,9 +582,8 @@ Future enableAuto(Reference db, bool enabled) { state typename DB::TransactionT::template FutureT> valueF = tr->get(tagThrottleAutoEnabledKey); Optional value = wait(safeThreadFutureToFuture(valueF)); - if (!value.present() || (enabled && value.get() != LiteralStringRef("1")) || - (!enabled && value.get() != LiteralStringRef("0"))) { - tr->set(tagThrottleAutoEnabledKey, LiteralStringRef(enabled ? "1" : "0")); + if (!value.present() || (enabled && value.get() != "1"_sr) || (!enabled && value.get() != "0"_sr)) { + tr->set(tagThrottleAutoEnabledKey, enabled ? "1"_sr : "0"_sr); signalThrottleChange(tr); wait(safeThreadFutureToFuture(tr->commit())); @@ -599,10 +597,8 @@ Future enableAuto(Reference db, bool enabled) { class TagQuotaValue { public: - double reservedReadQuota{ 0.0 }; - double totalReadQuota{ 0.0 }; - double reservedWriteQuota{ 0.0 }; - double totalWriteQuota{ 0.0 }; + double reservedQuota{ 0.0 }; + double totalQuota{ 0.0 }; bool isValid() const; Value toValue() const; static TagQuotaValue fromValue(ValueRef); @@ -611,17 +607,10 @@ public: Key getTagQuotaKey(TransactionTagRef); template -void setTagQuota(Reference tr, - TransactionTagRef tag, - double reservedReadQuota, - double totalReadQuota, - double reservedWriteQuota, - double totalWriteQuota) { +void setTagQuota(Reference tr, TransactionTagRef tag, double reservedQuota, double totalQuota) { TagQuotaValue tagQuotaValue; - tagQuotaValue.reservedReadQuota = reservedReadQuota; - tagQuotaValue.totalReadQuota = totalReadQuota; - tagQuotaValue.reservedWriteQuota = reservedWriteQuota; - tagQuotaValue.totalWriteQuota = totalWriteQuota; + tagQuotaValue.reservedQuota = reservedQuota; + tagQuotaValue.totalQuota = totalQuota; if (!tagQuotaValue.isValid()) { throw invalid_throttle_quota_value(); } diff --git a/fdbclient/include/fdbclient/TaskBucket.h b/fdbclient/include/fdbclient/TaskBucket.h index b7e6091d0f..b0b7a2bc51 100644 --- a/fdbclient/include/fdbclient/TaskBucket.h +++ b/fdbclient/include/fdbclient/TaskBucket.h @@ -115,7 +115,7 @@ public: }; struct ReservedTaskParams { - static TaskParam scheduledVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam scheduledVersion() { return __FUNCTION__sr; } }; class FutureBucket; @@ -480,7 +480,8 @@ struct TaskFuncBase : IDispatched, std::func }; #define REGISTER_TASKFUNC(TaskFunc) REGISTER_FACTORY(TaskFuncBase, TaskFunc, name) #define REGISTER_TASKFUNC_ALIAS(TaskFunc, Alias) \ - 
REGISTER_DISPATCHED_ALIAS(TaskFunc, Alias, TaskFunc::name, LiteralStringRef(#Alias)) + REGISTER_DISPATCHED_ALIAS( \ + TaskFunc, Alias, TaskFunc::name, StringRef(reinterpret_cast(#Alias), sizeof(#Alias) - 1)) struct TaskCompletionKey { Future get(Reference tr, Reference taskBucket); diff --git a/fdbclient/include/fdbclient/Tenant.h b/fdbclient/include/fdbclient/Tenant.h index 04280cafed..47d27a0f72 100644 --- a/fdbclient/include/fdbclient/Tenant.h +++ b/fdbclient/include/fdbclient/Tenant.h @@ -27,6 +27,7 @@ #include "fdbclient/VersionedMap.h" #include "fdbclient/KeyBackedTypes.h" #include "fdbrpc/TenantInfo.h" +#include "flow/BooleanParam.h" #include "flow/flat_buffers.h" typedef StringRef TenantNameRef; @@ -62,15 +63,22 @@ enum class TenantState { REGISTERING, READY, REMOVING, UPDATING_CONFIGURATION, R // Can be used in conjunction with the other tenant states above. enum class TenantLockState { UNLOCKED, READ_ONLY, LOCKED }; +constexpr int TENANT_PREFIX_SIZE = sizeof(int64_t); + +FDB_DECLARE_BOOLEAN_PARAM(EnforceValidTenantId); + struct TenantMapEntry { constexpr static FileIdentifier file_identifier = 12247338; static Key idToPrefix(int64_t id); - static int64_t prefixToId(KeyRef prefix); + static int64_t prefixToId(KeyRef prefix, EnforceValidTenantId enforceTenantId = EnforceValidTenantId::True); static std::string tenantStateToString(TenantState tenantState); static TenantState stringToTenantState(std::string stateStr); + static std::string tenantLockStateToString(TenantLockState tenantState); + static TenantLockState stringToTenantLockState(std::string stateStr); + int64_t id = -1; Key prefix; TenantState tenantState = TenantState::READY; @@ -91,7 +99,7 @@ struct TenantMapEntry { TenantMapEntry(int64_t id, TenantState tenantState, Optional tenantGroup, bool encrypted); void setId(int64_t id); - std::string toJson(int apiVersion) const; + std::string toJson() const; bool matchesConfiguration(TenantMapEntry const& other) const; void configure(Standalone parameter, Optional value); @@ -130,6 +138,8 @@ struct TenantGroupEntry { TenantGroupEntry() = default; TenantGroupEntry(Optional assignedCluster) : assignedCluster(assignedCluster) {} + json_spirit::mObject toJson() const; + Value encode() { return ObjectWriter::toValue(*this, IncludeVersion()); } static TenantGroupEntry decode(ValueRef const& value) { return ObjectReader::fromStringRef(value, IncludeVersion()); @@ -198,6 +208,6 @@ struct TenantMetadata { }; typedef VersionedMap TenantMap; -typedef VersionedMap TenantPrefixIndex; +class TenantPrefixIndex : public VersionedMap, public ReferenceCounted {}; #endif diff --git a/fdbclient/include/fdbclient/TenantEntryCache.actor.h b/fdbclient/include/fdbclient/TenantEntryCache.actor.h new file mode 100644 index 0000000000..cd35c5a985 --- /dev/null +++ b/fdbclient/include/fdbclient/TenantEntryCache.actor.h @@ -0,0 +1,390 @@ +/* + * TenantEntryCache.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_TENANTENTRYCACHE_ACTOR_G_H) +#define FDBCLIENT_TENANTENTRYCACHE_ACTOR_G_H +#include "fdbclient/TenantEntryCache.actor.g.h" +#elif !defined(FDBCLIENT_TENANTENTRYCACHE_ACTOR_H) +#define FDBCLIENT_TENANTENTRYCACHE_ACTOR_H + +#pragma once + +#include "fdbclient/DatabaseContext.h" +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/FDBTypes.h" +#include "fdbclient/RunTransaction.actor.h" +#include "fdbclient/Tenant.h" +#include "fdbclient/TenantManagement.actor.h" +#include "fdbclient/Knobs.h" +#include "fdbrpc/TenantName.h" +#include "flow/IndexedSet.h" + +#include +#include + +#include "flow/actorcompiler.h" // has to be last include + +using TenantNameEntryPair = std::pair; +using TenantNameEntryPairVec = std::vector; + +enum class TenantEntryCacheRefreshReason { INIT = 1, PERIODIC_TASK = 2, CACHE_MISS = 3, REMOVE_ENTRY = 4 }; +enum class TenantEntryCacheRefreshMode { PERIODIC_TASK = 1, NONE = 2 }; + +template +struct TenantEntryCachePayload { + TenantName name; + TenantMapEntry entry; + // Custom client payload + T payload; +}; + +template +using TenantEntryCachePayloadFunc = std::function(const TenantName&, const TenantMapEntry&)>; + +// In-memory cache for TenantEntryMap objects. It supports three indices: +// 1. Lookup by 'TenantId' +// 2. Lookup by 'TenantPrefix' +// 3. Lookup by 'TenantName' +// +// TODO: +// ---- +// The cache allows user to construct the 'cached object' by supplying a callback. The cache implements a periodic +// refresh mechanism, polling underlying database for updates (add/remove tenants), in future we might want to implement +// database range-watch to monitor such updates + +template +class TenantEntryCache : public ReferenceCounted>, NonCopyable { +private: + UID uid; + Database db; + TenantEntryCachePayloadFunc createPayloadFunc; + TenantEntryCacheRefreshMode refreshMode; + + Future refresher; + Map> mapByTenantId; + Map> mapByTenantName; + + CounterCollection metrics; + Counter hits; + Counter misses; + Counter refreshByCacheInit; + Counter refreshByCacheMiss; + Counter numRefreshes; + + ACTOR static Future getTenantList(Reference tr) { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + + KeyBackedRangeResult> tenantList = + wait(TenantMetadata::tenantMap().getRange( + tr, Optional(), Optional(), CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)); + ASSERT(tenantList.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER && !tenantList.more); + + TraceEvent(SevDebug, "TenantEntryCacheGetTenantList").detail("Count", tenantList.results.size()); + + return tenantList.results; + } + + static void updateCacheRefreshMetrics(TenantEntryCache* cache, TenantEntryCacheRefreshReason reason) { + if (reason == TenantEntryCacheRefreshReason::INIT) { + cache->refreshByCacheInit += 1; + } else if (reason == TenantEntryCacheRefreshReason::CACHE_MISS) { + cache->refreshByCacheMiss += 1; + } + + cache->numRefreshes += 1; + } + + ACTOR static Future refreshImpl(TenantEntryCache* cache, TenantEntryCacheRefreshReason reason) { + TraceEvent(SevDebug, "TenantEntryCacheRefreshStart", cache->id()).detail("Reason", static_cast(reason)); + + state Reference tr = cache->getDatabase()->createTransaction(); + loop { + try { + state TenantNameEntryPairVec tenantList = wait(getTenantList(tr)); + + // Refresh cache entries reflecting the latest database state + 
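// Reviewer note: the refresh below is a full rebuild rather than an incremental update -- the entire
// tenant map is re-read in one transaction (bounded by MAX_TENANTS_PER_CLUSTER in getTenantList above),
// both indexes are cleared, and every (name, entry) pair is re-inserted via put(). This keeps the by-id
// and by-name maps trivially consistent, at the cost of a full rescan on each periodic tick or cache miss.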
cache->clear(); + for (auto& tenant : tenantList) { + cache->put(tenant); + } + + updateCacheRefreshMetrics(cache, reason); + break; + } catch (Error& e) { + if (e.code() != error_code_actor_cancelled) { + TraceEvent(SevInfo, "TenantEntryCacheRefreshError", cache->id()) + .errorUnsuppressed(e) + .suppressFor(1.0); + } + wait(tr->onError(e)); + } + } + + TraceEvent(SevDebug, "TenantEntryCacheRefreshEnd", cache->id()).detail("Reason", static_cast(reason)); + + return Void(); + } + + ACTOR static Future>> getByIdImpl(TenantEntryCache* cache, + int64_t tenantId) { + Optional> ret = cache->lookupById(tenantId); + if (ret.present()) { + cache->hits += 1; + return ret; + } + + TraceEvent(SevInfo, "TenantEntryCacheGetByIdRefresh").detail("TenantId", tenantId); + + // Entry not found. Refresh cacheEntries by scanning underlying KeyRange. + // TODO: Cache will implement a "KeyRange" watch, monitoring notification when a new entry gets added or any + // existing entry gets updated within the KeyRange of interest. Hence, misses would be very rare + wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS)); + + cache->misses += 1; + return cache->lookupById(tenantId); + } + + ACTOR static Future>> getByNameImpl(TenantEntryCache* cache, + TenantName name) { + Optional> ret = cache->lookupByName(name); + if (ret.present()) { + cache->hits += 1; + return ret; + } + + TraceEvent("TenantEntryCacheGetByNameRefresh").detail("TenantName", name); + + // Entry not found. Refresh cacheEntries by scanning underlying KeyRange. + // TODO: Cache will implement a "KeyRange" watch, monitoring notification when a new entry gets added or any + // existing entry gets updated within the KeyRange of interest. Hence, misses would be very rare + wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS)); + + cache->misses += 1; + return cache->lookupByName(name); + } + + Optional> lookupById(int64_t tenantId) { + Optional> ret; + auto itr = mapByTenantId.find(tenantId); + if (itr == mapByTenantId.end()) { + return ret; + } + + return itr->value; + } + + Optional> lookupByName(TenantName name) { + Optional> ret; + auto itr = mapByTenantName.find(name); + if (itr == mapByTenantName.end()) { + return ret; + } + + return itr->value; + } + + Future refresh(TenantEntryCacheRefreshReason reason) { return refreshImpl(this, reason); } + + static TenantEntryCachePayload defaultCreatePayload(const TenantName& name, const TenantMapEntry& entry) { + TenantEntryCachePayload payload; + payload.name = name; + payload.entry = entry; + + return payload; + } + + Future removeEntryInt(Optional tenantId, + Optional tenantPrefix, + Optional tenantName, + bool refreshCache) { + typename Map>::iterator itrId; + typename Map>::iterator itrName; + + if (tenantId.present() || tenantPrefix.present()) { + // Ensure either tenantId OR tenantPrefix is valid (but not both) + ASSERT(tenantId.present() != tenantPrefix.present()); + ASSERT(!tenantName.present()); + + int64_t tId = tenantId.present() ? 
tenantId.get() : TenantMapEntry::prefixToId(tenantPrefix.get()); + TraceEvent("TenantEntryCacheRemoveEntry").detail("Id", tId); + itrId = mapByTenantId.find(tId); + if (itrId == mapByTenantId.end()) { + return Void(); + } + // Ensure byId and byName cache are in-sync + itrName = mapByTenantName.find(itrId->value.name); + ASSERT(itrName != mapByTenantName.end()); + } else if (tenantName.present()) { + ASSERT(!tenantId.present() && !tenantPrefix.present()); + + TraceEvent("TenantEntryCacheRemoveEntry").detail("Name", tenantName.get()); + itrName = mapByTenantName.find(tenantName.get()); + if (itrName == mapByTenantName.end()) { + return Void(); + } + // Ensure byId and byName cache are in-sync + itrId = mapByTenantId.find(itrName->value.entry.id); + ASSERT(itrId != mapByTenantId.end()); + } else { + // Invalid input, one of: tenantId, tenantPrefix or tenantName needs to be valid. + throw operation_failed(); + } + + ASSERT(itrId != mapByTenantId.end() && itrName != mapByTenantName.end()); + + TraceEvent("TenantEntryCacheRemoveEntry") + .detail("Id", itrId->key) + .detail("Prefix", itrId->value.entry.prefix) + .detail("Name", itrName->key); + + mapByTenantId.erase(itrId); + mapByTenantName.erase(itrName); + + if (refreshCache) { + return refreshImpl(this, TenantEntryCacheRefreshReason::REMOVE_ENTRY); + } + + return Void(); + } + +public: + TenantEntryCache(Database db) + : uid(deterministicRandom()->randomUniqueID()), db(db), createPayloadFunc(defaultCreatePayload), + refreshMode(TenantEntryCacheRefreshMode::PERIODIC_TASK), metrics("TenantEntryCacheMetrics", uid.toString()), + hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics), + refreshByCacheInit("TenantEntryCacheRefreshInit", metrics), + refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics), + numRefreshes("TenantEntryCacheNumRefreshes", metrics) { + TraceEvent("TenantEntryCacheCreatedDefaultFunc", uid); + } + + TenantEntryCache(Database db, TenantEntryCachePayloadFunc fn) + : uid(deterministicRandom()->randomUniqueID()), db(db), createPayloadFunc(fn), + refreshMode(TenantEntryCacheRefreshMode::PERIODIC_TASK), metrics("TenantEntryCacheMetrics", uid.toString()), + hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics), + refreshByCacheInit("TenantEntryCacheRefreshInit", metrics), + refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics), + numRefreshes("TenantEntryCacheNumRefreshes", metrics) { + TraceEvent("TenantEntryCacheCreated", uid); + } + + TenantEntryCache(Database db, UID id, TenantEntryCachePayloadFunc fn) + : uid(id), db(db), createPayloadFunc(fn), refreshMode(TenantEntryCacheRefreshMode::PERIODIC_TASK), + metrics("TenantEntryCacheMetrics", uid.toString()), hits("TenantEntryCacheHits", metrics), + misses("TenantEntryCacheMisses", metrics), refreshByCacheInit("TenantEntryCacheRefreshInit", metrics), + refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics), + numRefreshes("TenantEntryCacheNumRefreshes", metrics) { + TraceEvent("TenantEntryCacheCreated", uid); + } + + TenantEntryCache(Database db, UID id, TenantEntryCachePayloadFunc fn, TenantEntryCacheRefreshMode mode) + : uid(id), db(db), createPayloadFunc(fn), refreshMode(mode), metrics("TenantEntryCacheMetrics", uid.toString()), + hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics), + refreshByCacheInit("TenantEntryCacheRefreshInit", metrics), + refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics), + numRefreshes("TenantEntryCacheNumRefreshes", metrics) { + 
TraceEvent("TenantEntryCacheCreated", uid); + } + + Future init() { + TraceEvent("TenantEntryCacheInit", uid); + + Future f = refreshImpl(this, TenantEntryCacheRefreshReason::INIT); + + // Launch reaper task to periodically refresh cache by scanning database KeyRange + TenantEntryCacheRefreshReason reason = TenantEntryCacheRefreshReason::PERIODIC_TASK; + if (refreshMode == TenantEntryCacheRefreshMode::PERIODIC_TASK) { + refresher = recurringAsync([&, reason]() { return refresh(reason); }, + CLIENT_KNOBS->TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, /* interval */ + true, /* absoluteIntervalDelay */ + CLIENT_KNOBS->TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, /* intialDelay */ + TaskPriority::Worker); + } + + return f; + } + + Database getDatabase() const { return db; } + UID id() const { return uid; } + + void clear() { + mapByTenantId.clear(); + mapByTenantName.clear(); + } + + Future removeEntryById(int64_t tenantId, bool refreshCache = false) { + return removeEntryInt(tenantId, Optional(), Optional(), refreshCache); + } + Future removeEntryByPrefix(KeyRef tenantPrefix, bool refreshCache = false) { + return removeEntryInt(Optional(), tenantPrefix, Optional(), refreshCache); + } + Future removeEntryByName(TenantName tenantName, bool refreshCache = false) { + return removeEntryInt(Optional(), Optional(), tenantName, refreshCache); + } + + void put(const TenantNameEntryPair& pair) { + TenantEntryCachePayload payload = createPayloadFunc(pair.first, pair.second); + auto idItr = mapByTenantId.find(pair.second.id); + auto nameItr = mapByTenantName.find(pair.first); + + Optional existingName; + Optional existingId; + if (nameItr != mapByTenantName.end()) { + existingId = nameItr->value.entry.id; + mapByTenantId.erase(nameItr->value.entry.id); + } + if (idItr != mapByTenantId.end()) { + existingName = idItr->value.name; + mapByTenantName.erase(idItr->value.name); + } + + mapByTenantId[pair.second.id] = payload; + mapByTenantName[pair.first] = payload; + + TraceEvent("TenantEntryCachePut") + .detail("TenantName", pair.first) + .detail("TenantNameExisting", existingName) + .detail("TenantID", pair.second.id) + .detail("TenantIDExisting", existingId) + .detail("TenantPrefix", pair.second.prefix); + + CODE_PROBE(idItr == mapByTenantId.end() && nameItr == mapByTenantName.end(), "TenantCache new entry"); + CODE_PROBE(idItr != mapByTenantId.end() && nameItr == mapByTenantName.end(), "TenantCache entry name updated"); + CODE_PROBE(idItr == mapByTenantId.end() && nameItr != mapByTenantName.end(), "TenantCache entry id updated"); + CODE_PROBE(idItr != mapByTenantId.end() && nameItr != mapByTenantName.end(), + "TenantCache entry id and name updated"); + } + + Future>> getById(int64_t tenantId) { return getByIdImpl(this, tenantId); } + Future>> getByPrefix(KeyRef prefix) { + int64_t id = TenantMapEntry::prefixToId(prefix); + return getByIdImpl(this, id); + } + Future>> getByName(TenantName name) { return getByNameImpl(this, name); } + + // Counter access APIs + Counter::Value numCacheRefreshes() const { return numRefreshes.getValue(); } + Counter::Value numRefreshByMisses() const { return refreshByCacheMiss.getValue(); } + Counter::Value numRefreshByInit() const { return refreshByCacheInit.getValue(); } +}; + +#include "flow/unactorcompiler.h" +#endif // FDBCLIENT_TENANTENTRYCACHE_ACTOR_H \ No newline at end of file diff --git a/fdbclient/include/fdbclient/TenantManagement.actor.h b/fdbclient/include/fdbclient/TenantManagement.actor.h index b6a33e3d1f..6e91c8fb90 100644 --- 
a/fdbclient/include/fdbclient/TenantManagement.actor.h +++ b/fdbclient/include/fdbclient/TenantManagement.actor.h @@ -271,6 +271,50 @@ Future> createTenant(Reference db, } } +ACTOR template +Future markTenantTombstones(Transaction tr, int64_t tenantId) { + // In data clusters, we store a tombstone + state Future> latestTombstoneFuture = + TenantMetadata::tenantTombstones().getRange(tr, {}, {}, 1, Snapshot::False, Reverse::True); + state Optional cleanupData = wait(TenantMetadata::tombstoneCleanupData().get(tr)); + state Version transactionReadVersion = wait(safeThreadFutureToFuture(tr->getReadVersion())); + + // If it has been long enough since we last cleaned up the tenant tombstones, we do that first + if (!cleanupData.present() || cleanupData.get().nextTombstoneEraseVersion <= transactionReadVersion) { + state int64_t deleteThroughId = cleanupData.present() ? cleanupData.get().nextTombstoneEraseId : -1; + // Delete all tombstones up through the one currently marked in the cleanup data + if (deleteThroughId >= 0) { + TenantMetadata::tenantTombstones().erase(tr, 0, deleteThroughId + 1); + } + + KeyBackedRangeResult latestTombstone = wait(latestTombstoneFuture); + int64_t nextDeleteThroughId = std::max(deleteThroughId, tenantId); + if (!latestTombstone.results.empty()) { + nextDeleteThroughId = std::max(nextDeleteThroughId, latestTombstone.results[0]); + } + + // The next cleanup will happen at or after TENANT_TOMBSTONE_CLEANUP_INTERVAL seconds have elapsed and + // will clean up tombstones through the most recently allocated ID. + TenantTombstoneCleanupData updatedCleanupData; + updatedCleanupData.tombstonesErasedThrough = deleteThroughId; + updatedCleanupData.nextTombstoneEraseId = nextDeleteThroughId; + updatedCleanupData.nextTombstoneEraseVersion = + transactionReadVersion + + CLIENT_KNOBS->TENANT_TOMBSTONE_CLEANUP_INTERVAL * CLIENT_KNOBS->VERSIONS_PER_SECOND; + + TenantMetadata::tombstoneCleanupData().set(tr, updatedCleanupData); + + // If the tenant being deleted is within the tombstone window, record the tombstone + if (tenantId > updatedCleanupData.tombstonesErasedThrough) { + TenantMetadata::tenantTombstones().insert(tr, tenantId); + } + } else if (tenantId > cleanupData.get().tombstonesErasedThrough) { + // If the tenant being deleted is within the tombstone window, record the tombstone + TenantMetadata::tenantTombstones().insert(tr, tenantId); + } + return Void(); +} + // Deletes the tenant with the given name. If tenantId is specified, the tenant being deleted must also have the same // ID. If no matching tenant is found, this function returns without deleting anything. This behavior allows the // function to be used idempotently: if the transaction is retried after having succeeded, it will see that the tenant @@ -320,45 +364,7 @@ Future deleteTenantTransaction(Transaction tr, } if (clusterType == ClusterType::METACLUSTER_DATA) { - // In data clusters, we store a tombstone - state Future> latestTombstoneFuture = - TenantMetadata::tenantTombstones().getRange(tr, {}, {}, 1, Snapshot::False, Reverse::True); - state Optional cleanupData = wait(TenantMetadata::tombstoneCleanupData().get(tr)); - state Version transactionReadVersion = wait(safeThreadFutureToFuture(tr->getReadVersion())); - - // If it has been long enough since we last cleaned up the tenant tombstones, we do that first - if (!cleanupData.present() || cleanupData.get().nextTombstoneEraseVersion <= transactionReadVersion) { - state int64_t deleteThroughId = cleanupData.present() ? 
cleanupData.get().nextTombstoneEraseId : -1; - // Delete all tombstones up through the one currently marked in the cleanup data - if (deleteThroughId >= 0) { - TenantMetadata::tenantTombstones().erase(tr, 0, deleteThroughId + 1); - } - - KeyBackedRangeResult latestTombstone = wait(latestTombstoneFuture); - int64_t nextDeleteThroughId = std::max(deleteThroughId, tenantId.get()); - if (!latestTombstone.results.empty()) { - nextDeleteThroughId = std::max(nextDeleteThroughId, latestTombstone.results[0]); - } - - // The next cleanup will happen at or after TENANT_TOMBSTONE_CLEANUP_INTERVAL seconds have elapsed and - // will clean up tombstones through the most recently allocated ID. - TenantTombstoneCleanupData updatedCleanupData; - updatedCleanupData.tombstonesErasedThrough = deleteThroughId; - updatedCleanupData.nextTombstoneEraseId = nextDeleteThroughId; - updatedCleanupData.nextTombstoneEraseVersion = - transactionReadVersion + - CLIENT_KNOBS->TENANT_TOMBSTONE_CLEANUP_INTERVAL * CLIENT_KNOBS->VERSIONS_PER_SECOND; - - TenantMetadata::tombstoneCleanupData().set(tr, updatedCleanupData); - - // If the tenant being deleted is within the tombstone window, record the tombstone - if (tenantId.get() > updatedCleanupData.tombstonesErasedThrough) { - TenantMetadata::tenantTombstones().insert(tr, tenantId.get()); - } - } else if (tenantId.get() > cleanupData.get().tombstonesErasedThrough) { - // If the tenant being deleted is within the tombstone window, record the tombstone - TenantMetadata::tenantTombstones().insert(tr, tenantId.get()); - } + wait(markTenantTombstones(tr, tenantId.get())); } return Void(); @@ -456,8 +462,8 @@ Future configureTenantTransaction(Transaction tr, ACTOR template Future>> listTenantsTransaction(Transaction tr, - TenantNameRef begin, - TenantNameRef end, + TenantName begin, + TenantName end, int limit) { tr->setOption(FDBTransactionOptions::RAW_ACCESS); @@ -488,18 +494,32 @@ Future>> listTenants(Reference } ACTOR template -Future renameTenantTransaction(Transaction tr, TenantNameRef oldName, TenantNameRef newName) { +Future renameTenantTransaction(Transaction tr, + TenantName oldName, + TenantName newName, + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE, + Optional configureSequenceNum = Optional()) { + ASSERT(clusterType == ClusterType::STANDALONE || (tenantId.present() && configureSequenceNum.present())); + ASSERT(clusterType != ClusterType::METACLUSTER_MANAGEMENT); + wait(checkTenantMode(tr, clusterType)); tr->setOption(FDBTransactionOptions::RAW_ACCESS); state Optional oldEntry; state Optional newEntry; wait(store(oldEntry, tryGetTenantTransaction(tr, oldName)) && store(newEntry, tryGetTenantTransaction(tr, newName))); - if (!oldEntry.present()) { + if (!oldEntry.present() || (tenantId.present() && tenantId.get() != oldEntry.get().id)) { throw tenant_not_found(); } if (newEntry.present()) { throw tenant_already_exists(); } + if (configureSequenceNum.present()) { + if (oldEntry.get().configurationSequenceNum >= configureSequenceNum.get()) { + return Void(); + } + oldEntry.get().configurationSequenceNum = configureSequenceNum.get(); + } TenantMetadata::tenantMap().erase(tr, oldName); TenantMetadata::tenantMap().set(tr, newName, oldEntry.get()); TenantMetadata::tenantIdIndex().set(tr, oldEntry.get().id, newName); @@ -511,12 +531,21 @@ Future renameTenantTransaction(Transaction tr, TenantNameRef oldName, Tena Tuple::makeTuple(oldEntry.get().tenantGroup.get(), newName)); } + if (clusterType == ClusterType::METACLUSTER_DATA) { + 
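// Reviewer note: the rename path now records a tombstone on metacluster data clusters as well, reusing
// the markTenantTombstones() helper factored out of deleteTenantTransaction above; presumably this lets
// a retried or late-arriving operation against the old tenant id be recognized instead of re-applied.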
wait(markTenantTombstones(tr, tenantId.get())); + } + return Void(); } ACTOR template -Future renameTenant(Reference db, TenantName oldName, TenantName newName) { +Future renameTenant(Reference db, + TenantName oldName, + TenantName newName, + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE) { state Reference tr = db->createTransaction(); + ASSERT(clusterType == ClusterType::STANDALONE || tenantId.present()); state bool firstTry = true; state int64_t id; @@ -560,7 +589,7 @@ Future renameTenant(Reference db, TenantName oldName, TenantName newNa throw tenant_not_found(); } } - wait(renameTenantTransaction(tr, oldName, newName)); + wait(renameTenantTransaction(tr, oldName, newName, tenantId, clusterType)); wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); TraceEvent("RenameTenantSuccess").detail("OldName", oldName).detail("NewName", newName); return Void(); @@ -569,6 +598,62 @@ Future renameTenant(Reference db, TenantName oldName, TenantName newNa } } } + +template +Future> tryGetTenantGroupTransaction(Transaction tr, TenantGroupName name) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + return TenantMetadata::tenantGroupMap().get(tr, name); +} + +ACTOR template +Future> tryGetTenantGroup(Reference db, TenantGroupName name) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + Optional entry = wait(tryGetTenantGroupTransaction(tr, name)); + return entry; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +ACTOR template +Future>> listTenantGroupsTransaction(Transaction tr, + TenantGroupName begin, + TenantGroupName end, + int limit) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + KeyBackedRangeResult> results = + wait(TenantMetadata::tenantGroupMap().getRange(tr, begin, end, limit)); + + return results.results; +} + +ACTOR template +Future>> listTenantGroups(Reference db, + TenantGroupName begin, + TenantGroupName end, + int limit) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + std::vector> tenantGroups = + wait(listTenantGroupsTransaction(tr, begin, end, limit)); + return tenantGroups; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + } // namespace TenantAPI #include "flow/unactorcompiler.h" diff --git a/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h b/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h index af9d02c371..cf190fc77d 100644 --- a/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h +++ b/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h @@ -36,11 +36,8 @@ #include "flow/UnitTest.h" #include "flow/actorcompiler.h" // This must be the last #include. 
-template class TenantRangeImpl : public SpecialKeyRangeRWImpl { private: - static bool subRangeIntersects(KeyRangeRef subRange, KeyRangeRef range); - static KeyRangeRef removePrefix(KeyRangeRef range, KeyRef prefix, KeyRef defaultEnd) { KeyRef begin = range.begin.removePrefix(prefix); KeyRef end; @@ -76,7 +73,7 @@ private: wait(TenantAPI::listTenantsTransaction(&ryw->getTransaction(), kr.begin, kr.end, limitsHint.rows)); for (auto tenant : tenants) { - std::string jsonString = tenant.second.toJson(ryw->getDatabase()->apiVersion); + std::string jsonString = tenant.second.toJson(); ValueRef tenantEntryBytes(results->arena(), jsonString); results->push_back(results->arena(), KeyValueRef(withTenantMapPrefix(tenant.first, results->arena()), tenantEntryBytes)); @@ -85,21 +82,20 @@ private: return Void(); } - ACTOR template - static Future getTenantRange(ReadYourWritesTransaction* ryw, - KeyRangeRef kr, - GetRangeLimits limitsHint) { + ACTOR static Future getTenantRange(ReadYourWritesTransaction* ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) { state RangeResult results; kr = kr.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) - .removePrefix(TenantRangeImpl::submoduleRange.begin); + .removePrefix(TenantRangeImpl::submoduleRange.begin); - if (kr.intersects(TenantRangeImpl::mapSubRange)) { + if (kr.intersects(TenantRangeImpl::mapSubRange)) { GetRangeLimits limits = limitsHint; limits.decrement(results); wait(getTenantList( ryw, - removePrefix(kr & TenantRangeImpl::mapSubRange, TenantRangeImpl::mapSubRange.begin, "\xff"_sr), + removePrefix(kr & TenantRangeImpl::mapSubRange, TenantRangeImpl::mapSubRange.begin, "\xff"_sr), &results, limits)); } @@ -254,11 +250,8 @@ private: } public: - // These ranges vary based on the template parameter - const static KeyRangeRef submoduleRange; - const static KeyRangeRef mapSubRange; - - // These sub-ranges should only be used if HasSubRanges=true + const inline static KeyRangeRef submoduleRange = KeyRangeRef("tenant/"_sr, "tenant0"_sr); + const inline static KeyRangeRef mapSubRange = KeyRangeRef("map/"_sr, "map0"_sr); const inline static KeyRangeRef configureSubRange = KeyRangeRef("configure/"_sr, "configure0"_sr); const inline static KeyRangeRef renameSubRange = KeyRangeRef("rename/"_sr, "rename0"_sr); @@ -267,7 +260,7 @@ public: Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const override { - return getTenantRange(ryw, kr, limitsHint); + return getTenantRange(ryw, kr, limitsHint); } ACTOR static Future> commitImpl(TenantRangeImpl* self, ReadYourWritesTransaction* ryw) { @@ -301,11 +294,11 @@ public: .removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) .removePrefix(submoduleRange.begin); - if (subRangeIntersects(mapSubRange, adjustedRange)) { + if (mapSubRange.intersects(adjustedRange)) { adjustedRange = mapSubRange & adjustedRange; adjustedRange = removePrefix(adjustedRange, mapSubRange.begin, "\xff"_sr); mapMutations.push_back(std::make_pair(adjustedRange, range.value().second)); - } else if (subRangeIntersects(configureSubRange, adjustedRange) && adjustedRange.singleKeyRange()) { + } else if (configureSubRange.intersects(adjustedRange) && adjustedRange.singleKeyRange()) { StringRef configTupleStr = adjustedRange.begin.removePrefix(configureSubRange.begin); try { Tuple tuple = Tuple::unpack(configTupleStr); @@ -320,7 +313,7 @@ public: false, "configure tenant", "invalid tenant configuration key")); throw special_keys_api_failure(); } 
- } else if (subRangeIntersects(renameSubRange, adjustedRange)) { + } else if (renameSubRange.intersects(adjustedRange)) { StringRef oldName = adjustedRange.begin.removePrefix(renameSubRange.begin); StringRef newName = range.value().second.get(); // Do not allow overlapping renames in the same commit diff --git a/fdbclient/include/fdbclient/ThreadSafeTransaction.h b/fdbclient/include/fdbclient/ThreadSafeTransaction.h index a069c648be..0d7c6f608d 100644 --- a/fdbclient/include/fdbclient/ThreadSafeTransaction.h +++ b/fdbclient/include/fdbclient/ThreadSafeTransaction.h @@ -20,6 +20,7 @@ #ifndef FDBCLIENT_THREADSAFETRANSACTION_H #define FDBCLIENT_THREADSAFETRANSACTION_H +#include "flow/ApiVersion.h" #include "flow/ProtocolVersion.h" #pragma once @@ -164,6 +165,22 @@ public: Optional readVersion, ReadBlobGranuleContext granuleContext) override; + ThreadFuture>> readBlobGranulesStart(const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) override; + + ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) override; + + ThreadFuture>> summarizeBlobGranules(const KeyRangeRef& keyRange, + Optional summaryVersion, + int rangeLimit) override; + void addReadConflictRange(const KeyRangeRef& keys) override; void makeSelfConflicting(); @@ -230,7 +247,7 @@ private: friend IClientApi* getLocalClientAPI(); ThreadSafeApi(); - int apiVersion; + ApiVersion apiVersion; std::string clientVersion; uint64_t transportId; diff --git a/fdbclient/include/fdbclient/Tracing.h b/fdbclient/include/fdbclient/Tracing.h index 30fa210e3f..789b346dfd 100644 --- a/fdbclient/include/fdbclient/Tracing.h +++ b/fdbclient/include/fdbclient/Tracing.h @@ -50,8 +50,7 @@ struct SpanContext { SpanContext() : traceID(UID()), spanID(0), m_Flags(TraceFlags::unsampled) {} SpanContext(UID traceID, uint64_t spanID, TraceFlags flags) : traceID(traceID), spanID(spanID), m_Flags(flags) {} SpanContext(UID traceID, uint64_t spanID) : traceID(traceID), spanID(spanID), m_Flags(TraceFlags::unsampled) {} - SpanContext(Arena arena, const SpanContext& span) - : traceID(span.traceID), spanID(span.spanID), m_Flags(span.m_Flags) {} + SpanContext(const SpanContext& span) = default; bool isSampled() const { return (m_Flags & TraceFlags::sampled) == TraceFlags::sampled; } std::string toString() const { return format("%016llx%016llx%016llx", traceID.first(), traceID.second(), spanID); }; bool isValid() const { return traceID.first() != 0 && traceID.second() != 0 && spanID != 0; } @@ -62,6 +61,9 @@ struct SpanContext { } }; +template <> +struct flow_ref : std::false_type {}; + // Span // // Span is a tracing implementation which, for the most part, complies with the W3C Trace Context specification @@ -155,7 +157,7 @@ public: // We've determined for initial tracing release, spans with only a location will not be traced. // Generally these are for background processes, some are called infrequently, while others may be high volume. // TODO: review and address in subsequent PRs. 
- Span(const Location& location) : location(location), begin(g_network->now()) {} + explicit Span(const Location& location) : Span(location, SpanContext()) {} Span(const Span&) = delete; Span(Span&& o) { diff --git a/fdbclient/include/fdbclient/Tuple.h b/fdbclient/include/fdbclient/Tuple.h index 9d5594ccba..a1363b6d34 100644 --- a/fdbclient/include/fdbclient/Tuple.h +++ b/fdbclient/include/fdbclient/Tuple.h @@ -25,7 +25,7 @@ #include "flow/flow.h" #include "fdbclient/FDBTypes.h" -#include "fdbclient/Versionstamp.h" +#include "fdbclient/TupleVersionstamp.h" struct Tuple { struct UnicodeStr { @@ -33,6 +33,14 @@ struct Tuple { explicit UnicodeStr(StringRef str) : str(str) {} }; + struct UserTypeStr { + uint8_t code; + Standalone str; + UserTypeStr(uint8_t code, StringRef str) : code(code), str(str) {} + + bool operator==(const UserTypeStr& other) const { return (code == other.code && str == other.str); } + }; + Tuple() {} // Tuple parsing normally does not care of the final value is a numeric type and is incomplete. @@ -40,6 +48,7 @@ struct Tuple { // Note that strings can't be incomplete because they are parsed such that the end of the packed // byte string is considered the end of the string in lieu of a specific end. static Tuple unpack(StringRef const& str, bool exclude_incomplete = false); + static Tuple unpackUserType(StringRef const& str, bool exclude_incomplete = false); Tuple& append(Tuple const& tuple); @@ -54,7 +63,8 @@ struct Tuple { Tuple& append(double); Tuple& append(std::nullptr_t); Tuple& appendNull(); - Tuple& append(Versionstamp const&); + Tuple& append(TupleVersionstamp const&); + Tuple& append(UserTypeStr const&); Standalone pack() const { return Standalone(StringRef(data.begin(), data.size()), data.arena()); @@ -65,24 +75,29 @@ struct Tuple { return append(t); } - enum ElementType { NULL_TYPE, INT, BYTES, UTF8, BOOL, FLOAT, DOUBLE, VERSIONSTAMP }; + enum ElementType { NULL_TYPE, INT, BYTES, UTF8, BOOL, FLOAT, DOUBLE, VERSIONSTAMP, USER_TYPE }; + + bool isUserType(uint8_t code) const; // this is number of elements, not length of data size_t size() const { return offsets.size(); } void reserve(size_t cap) { offsets.reserve(cap); } void clear() { - data.clear(); + // Make a new Standalone to use different memory so that + // previously returned objects from pack() are valid. + data = Standalone>(); offsets.clear(); } // Return a Tuple encoded raw string. 
StringRef subTupleRawString(size_t index) const; ElementType getType(size_t index) const; Standalone getString(size_t index) const; - Versionstamp getVersionstamp(size_t index) const; + TupleVersionstamp getVersionstamp(size_t index) const; int64_t getInt(size_t index, bool allow_incomplete = false) const; bool getBool(size_t index) const; float getFloat(size_t index) const; double getDouble(size_t index) const; + Tuple::UserTypeStr getUserType(size_t index) const; KeyRange range(Tuple const& tuple = Tuple()) const; @@ -105,7 +120,7 @@ struct Tuple { } private: - Tuple(const StringRef& data, bool exclude_incomplete = false); + Tuple(const StringRef& data, bool exclude_incomplete = false, bool exclude_user_type = false); Standalone> data; std::vector offsets; }; diff --git a/fdbclient/include/fdbclient/Versionstamp.h b/fdbclient/include/fdbclient/TupleVersionstamp.h similarity index 88% rename from fdbclient/include/fdbclient/Versionstamp.h rename to fdbclient/include/fdbclient/TupleVersionstamp.h index b3fd6770c7..730e7647f6 100644 --- a/fdbclient/include/fdbclient/Versionstamp.h +++ b/fdbclient/include/fdbclient/TupleVersionstamp.h @@ -1,5 +1,5 @@ /* - * Versionstamp.h + * TupleVersionstamp.h * * This source file is part of the FoundationDB open source project * @@ -27,15 +27,15 @@ const size_t VERSIONSTAMP_TUPLE_SIZE = 12; -struct Versionstamp { - Versionstamp(StringRef); +struct TupleVersionstamp { + TupleVersionstamp(StringRef); int64_t getVersion() const; int16_t getBatchNumber() const; int16_t getUserVersion() const; size_t size() const; const uint8_t* begin() const; - bool operator==(const Versionstamp&) const; + bool operator==(const TupleVersionstamp&) const; private: Standalone data; diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index 82bdf40cfe..8df1bcc150 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -57,6 +57,8 @@ description is not currently required but encouraged.