diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000000..c72f249028 --- /dev/null +++ b/.flake8 @@ -0,0 +1,5 @@ +[flake8] +ignore = E203, E266, E501, W503, F403, F401, E711 +max-line-length = 79 +max-complexity = 18 +select = B,C,E,F,W,T4,B9 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 57f424d17b..0fe8a7c92a 100644 --- a/.gitignore +++ b/.gitignore @@ -64,6 +64,7 @@ packaging/msi/obj simfdb tests/oldBinaries trace.*.xml +trace.*.json .venv # Editor files diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..a843a71aac --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,9 @@ +repos: +- repo: https://github.com/psf/black + rev: 2018e667a6a36ee3fbfa8041cd36512f92f60d49 # frozen: 22.8.0 + hooks: + - id: black +- repo: https://github.com/pycqa/flake8 + rev: f8e1b317742036ff11ff86356fd2b68147e169f7 # frozen: 5.0.4 + hooks: + - id: flake8 \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 59161d79b6..4b631b5df4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,11 @@ else() cmake_minimum_required(VERSION 3.13) endif() +# silence deprecation warnings in newer versions of cmake +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + project(foundationdb VERSION 7.2.0 DESCRIPTION "FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions." @@ -206,7 +211,7 @@ endif() if (CMAKE_EXPORT_COMPILE_COMMANDS AND WITH_PYTHON) add_custom_command( OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/compile_commands.json - COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/contrib/gen_compile_db.py + COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/contrib/gen_compile_db.py ARGS -b ${CMAKE_CURRENT_BINARY_DIR} -s ${CMAKE_CURRENT_SOURCE_DIR} -o ${CMAKE_CURRENT_SOURCE_DIR}/compile_commands.json ${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/gen_compile_db.py ${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json COMMENT "Build compile commands for IDE" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9549297305..f142b3dfae 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,7 +20,7 @@ If you have questions, we encourage you to engage in discussion on the [communit ## Before you get started ### Community Guidelines -We want the FoundationDB community to be as welcoming and inclusive as possible, and have adopted a [Code of Conduct](CODE_OF_CONDUCT.md) that we ask all community members to read and observe. +We want the FoundationDB community to be as welcoming and inclusive as possible, and have adopted a [Code of Conduct](CODE_OF_CONDUCT.md) that we ask all community members to read and abide by. ### Project Licensing By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the Apache 2.0 license. @@ -34,10 +34,13 @@ Members of the Apple FoundationDB team are part of the core committers helping r ## Contributing ### Opening a Pull Request -We love pull requests! For minor changes, feel free to open up a PR directly. For larger feature development and any changes that may require community discussion, we ask that you discuss your ideas on the [community forums](https://forums.foundationdb.org) prior to opening a PR, and then reference that thread within your PR comment. 
Please refer to [FoundationDB Commit Process](https://github.com/apple/foundationdb/wiki/FoundationDB-Commit-Process) for more detailed guidelines. +We love pull requests! For minor changes, feel free to open up a PR directly. For larger feature development and any changes that may require community discussion, we ask that you discuss your ideas on the [community forums](https://forums.foundationdb.org) prior to opening a PR, and then reference that thread within your PR comment. Please refer to the [FoundationDB Commit Process](https://github.com/apple/foundationdb/wiki/FoundationDB-Commit-Process) for more detailed guidelines. CI will be run automatically for core committers, and for community PRs it will be initiated by the request of a core committer. Tests can also be run locally via `ctest`, and core committers can run additional validation on pull requests prior to merging them. +### Python pre-commit +We use a pre-commit pipeline with black and flake8 to enforce python best coding practices. Install pre-commit ```pip install pre-commit```. Install it in your FoundationDB directory ```pre-commit install```. + ### Reporting issues Please refer to the section below on [using GitHub issues and the community forums](#using-github-issues-and-community-forums) for more info. @@ -46,10 +49,10 @@ To report a security issue, please **DO NOT** start by filing a public issue or ## Project Communication ### Community Forums -We encourage your participation asking questions and helping improve the FoundationDB project. Check out the [FoundationDB community forums](https://forums.foundationdb.org), which serve a similar function as mailing lists in many open source projects. The forums are organized into three sections: +We encourage your participation asking questions and helping improve the FoundationDB project. Check out the [FoundationDB community forums](https://forums.foundationdb.org), which serve a similar function as mailing lists in many open source projects. The forums are organized into three categories: * [Development](https://forums.foundationdb.org/c/development): For discussing the internals and development of the FoundationDB core, as well as layers. -* [Using FoundationDB](https://forums.foundationdb.org/c/using-foundationdb): For discussing user-facing topics. Getting started and have a question? This is the place for you. +* [Using FoundationDB](https://forums.foundationdb.org/c/using-foundationdb): For discussing user-facing topics. Getting started and have a question? This is the category for you. * [Site Feedback](https://forums.foundationdb.org/c/site-feedback): A category for discussing the forums and the OSS project, its organization, how it works, and how we can improve it. ### Using GitHub Issues and Community Forums @@ -63,4 +66,4 @@ GitHub Issues should be used for tracking tasks. If you know the specific code t * Implementing an agreed upon feature: *GitHub Issues* ### Project and Development Updates -Stay connected to the project and the community! For project and community updates, follow the [FoundationDB project blog](https://www.foundationdb.org/blog/). Development announcements will be made via the community forums' [dev-announce](https://forums.foundationdb.org/c/development/dev-announce) section. +Stay connected to the project and the community! For project and community updates, follow the [FoundationDB project blog](https://www.foundationdb.org/blog/). 
Development announcements will be made via the community forums' [dev-announce](https://forums.foundationdb.org/c/development/dev-announce) category. diff --git a/FDBLibTLS/FDBLibTLSPolicy.cpp b/FDBLibTLS/FDBLibTLSPolicy.cpp index 6f81f91335..d97932659b 100644 --- a/FDBLibTLS/FDBLibTLSPolicy.cpp +++ b/FDBLibTLS/FDBLibTLSPolicy.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/FDBLibTLS/FDBLibTLSVerify.cpp b/FDBLibTLS/FDBLibTLSVerify.cpp index 216966f4c0..4aeea07c15 100644 --- a/FDBLibTLS/FDBLibTLSVerify.cpp +++ b/FDBLibTLS/FDBLibTLSVerify.cpp @@ -28,6 +28,7 @@ #include #include #include +#include static int hexValue(char c) { static char const digits[] = "0123456789ABCDEF"; diff --git a/bindings/bindingtester/bindingtester.py b/bindings/bindingtester/bindingtester.py index 508ede8998..a5de827fe9 100755 --- a/bindings/bindingtester/bindingtester.py +++ b/bindings/bindingtester/bindingtester.py @@ -49,6 +49,17 @@ from bindingtester.known_testers import Tester import fdb import fdb.tuple + +API_VERSIONS = [ + 13, 14, 16, 21, 22, 23, + 100, 200, 300, + 400, 410, 420, 430, 440, 450, 460, + 500, 510, 520, + 600, 610, 620, 630, + 700, 710, 720, +] + + fdb.api_version(FDB_API_VERSION) @@ -156,8 +167,7 @@ def choose_api_version(selected_api_version, tester_min_version, tester_max_vers elif random.random() < 0.7: api_version = min_version elif random.random() < 0.9: - api_version = random.choice([v for v in [13, 14, 16, 21, 22, 23, 100, 200, 300, 400, 410, 420, 430, - 440, 450, 460, 500, 510, 520, 600, 610, 620, 630, 700, 710, 720] if v >= min_version and v <= max_version]) + api_version = random.choice([v for v in API_VERSIONS if v >= min_version and v <= max_version]) else: api_version = random.randint(min_version, max_version) diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index 00803634e7..e0a1fc31bb 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -1,8 +1,8 @@ set(FDB_C_SRCS - fdb_c.cpp - foundationdb/fdb_c.h - foundationdb/fdb_c_internal.h - foundationdb/fdb_c_types.h) + fdb_c.cpp + foundationdb/fdb_c.h + foundationdb/fdb_c_internal.h + foundationdb/fdb_c_types.h) file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/foundationdb) @@ -10,50 +10,50 @@ set(asm_file ${CMAKE_CURRENT_BINARY_DIR}/fdb_c.g.S) set(os "linux") set(cpu "intel") -if (APPLE) +if(APPLE) set(os "osx") -elseif (WIN32) +elseif(WIN32) set(os "windows") set(asm_file ${CMAKE_CURRENT_BINARY_DIR}/fdb_c.g.asm) -endif () +endif() -if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") set(cpu "aarch64") -elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le|powerpc64le)") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le|powerpc64le)") set(cpu "ppc64le") -endif () +endif() set(IS_ARM_MAC NO) -if (APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") +if(APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") set(IS_ARM_MAC YES) -endif () +endif() add_custom_command(OUTPUT ${asm_file} ${CMAKE_CURRENT_BINARY_DIR}/fdb_c_function_pointers.g.h - COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/generate_asm.py ${os} ${cpu} - ${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.cpp - ${asm_file} - ${CMAKE_CURRENT_BINARY_DIR}/fdb_c_function_pointers.g.h - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/generate_asm.py ${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.cpp - COMMENT "Generate C bindings") + COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/generate_asm.py ${os} ${cpu} + ${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.cpp + 
${asm_file} + ${CMAKE_CURRENT_BINARY_DIR}/fdb_c_function_pointers.g.h + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/generate_asm.py ${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.cpp + COMMENT "Generate C bindings") add_custom_target(fdb_c_generated DEPENDS ${asm_file} - ${CMAKE_CURRENT_BINARY_DIR}/fdb_c_function_pointers.g.h) + ${CMAKE_CURRENT_BINARY_DIR}/fdb_c_function_pointers.g.h) vexillographer_compile(TARGET fdb_c_options LANG c OUT ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/fdb_c_options.g.h - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/fdb_c_options.g.h) + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/fdb_c_options.g.h) include(GenerateExportHeader) -if (OPEN_FOR_IDE) +if(OPEN_FOR_IDE) add_library(fdb_c OBJECT ${FDB_C_SRCS} ${asm_file}) -else () +else() add_library(fdb_c SHARED ${FDB_C_SRCS} ${asm_file}) strip_debug_symbols(fdb_c) -endif () +endif() add_dependencies(fdb_c fdb_c_generated fdb_c_options) add_dependencies(fdbclient fdb_c_options) add_dependencies(fdbclient_sampling fdb_c_options) target_link_libraries(fdb_c PRIVATE $) -if (USE_UBSAN) +if(USE_UBSAN) # The intent of this hack is to force c targets that depend on fdb_c to use # c++ as their linker language. Otherwise you see undefined references to c++ # specific ubsan symbols. @@ -61,104 +61,108 @@ if (USE_UBSAN) set_property(TARGET force_cxx_linker PROPERTY IMPORTED_LOCATION /dev/null) set_target_properties(force_cxx_linker PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES CXX) target_link_libraries(fdb_c PUBLIC $) -endif () -if (APPLE) +endif() +if(APPLE) set(symbols ${CMAKE_CURRENT_BINARY_DIR}/fdb_c.symbols) add_custom_command(OUTPUT ${symbols} - COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/symbolify.py - ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c.h - ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c_internal.h - ${symbols} - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/symbolify.py ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c.h ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c_internal.h - COMMENT "Generate exported_symbols_list") + COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/symbolify.py + ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c.h + ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c_internal.h + ${symbols} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/symbolify.py ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c.h ${CMAKE_CURRENT_SOURCE_DIR}/foundationdb/fdb_c_internal.h + COMMENT "Generate exported_symbols_list") add_custom_target(exported_symbols_list DEPENDS ${symbols}) add_dependencies(fdb_c exported_symbols_list) target_link_options(fdb_c PRIVATE "LINKER:-no_weak_exports,-exported_symbols_list,${symbols}") -elseif (WIN32) -else () - if (NOT USE_UBSAN) +elseif(WIN32) +else() + if(NOT USE_UBSAN) # For ubsan we need to export type information for the vptr check to work. # Otherwise we only want to export fdb symbols in the fdb c api. 
target_link_options(fdb_c PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.map") - endif () + endif() target_link_options(fdb_c PRIVATE "LINKER:-z,nodelete,-z,noexecstack") -endif () +endif() target_include_directories(fdb_c PUBLIC - $ - $ - $) -if (WIN32) + $ + $ + $) +if(WIN32) enable_language(ASM_MASM) set_property(SOURCE ${asm_file} PROPERTY LANGUAGE ASM_MASM) -endif () +endif() # The tests don't build on windows -if (NOT WIN32) +if(NOT WIN32) set(MAKO_SRCS - test/mako/async.hpp - test/mako/async.cpp - test/mako/blob_granules.hpp - test/mako/blob_granules.cpp - test/mako/future.hpp - test/mako/limit.hpp - test/mako/logger.hpp - test/mako/mako.cpp - test/mako/mako.hpp - test/mako/operations.hpp - test/mako/operations.cpp - test/mako/process.hpp - test/mako/shm.hpp - test/mako/stats.hpp - test/mako/time.hpp - test/mako/utils.cpp - test/mako/utils.hpp) + test/mako/async.hpp + test/mako/async.cpp + test/mako/blob_granules.hpp + test/mako/blob_granules.cpp + test/mako/future.hpp + test/mako/limit.hpp + test/mako/logger.hpp + test/mako/mako.cpp + test/mako/mako.hpp + test/mako/operations.hpp + test/mako/operations.cpp + test/mako/process.hpp + test/mako/shm.hpp + test/mako/stats.hpp + test/mako/time.hpp + test/mako/utils.cpp + test/mako/utils.hpp) add_subdirectory(test/unit/third_party) find_package(Threads REQUIRED) set(UNIT_TEST_SRCS - test/unit/unit_tests.cpp - test/unit/fdb_api.cpp - test/unit/fdb_api.hpp) + test/unit/unit_tests.cpp + test/unit/fdb_api.cpp + test/unit/fdb_api.hpp) set(UNIT_TEST_VERSION_510_SRCS test/unit/unit_tests_version_510.cpp) set(TRACE_PARTIAL_FILE_SUFFIX_TEST_SRCS test/unit/trace_partial_file_suffix_test.cpp) set(DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS - test/unit/disconnected_timeout_tests.cpp - test/unit/fdb_api.cpp - test/unit/fdb_api.hpp) + test/unit/disconnected_timeout_tests.cpp + test/unit/fdb_api.cpp + test/unit/fdb_api.hpp) add_library(fdb_cpp INTERFACE test/fdb_api.hpp) target_sources(fdb_cpp INTERFACE) target_include_directories(fdb_cpp INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/test) - target_link_libraries(fdb_cpp INTERFACE fmt::fmt) + target_link_libraries(fdb_cpp INTERFACE fdb_c fmt::fmt) set(API_TESTER_SRCS - test/apitester/fdb_c_api_tester.cpp - test/apitester/TesterAtomicOpsCorrectnessWorkload.cpp - test/apitester/TesterApiWorkload.cpp - test/apitester/TesterApiWorkload.h - test/apitester/TesterTestSpec.cpp - test/apitester/TesterTestSpec.h - test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp - test/apitester/TesterCancelTransactionWorkload.cpp - test/apitester/TesterCorrectnessWorkload.cpp - test/apitester/TesterKeyValueStore.cpp - test/apitester/TesterKeyValueStore.h - test/apitester/TesterOptions.h - test/apitester/TesterScheduler.cpp - test/apitester/TesterScheduler.h - test/apitester/TesterTransactionExecutor.cpp - test/apitester/TesterTransactionExecutor.h - test/apitester/TesterUtil.cpp - test/apitester/TesterUtil.h - test/apitester/TesterWatchAndWaitWorkload.cpp - test/apitester/TesterWorkload.cpp - test/apitester/TesterWorkload.h - ) + test/apitester/fdb_c_api_tester.cpp + test/apitester/TesterAtomicOpsCorrectnessWorkload.cpp + test/apitester/TesterApiWorkload.cpp + test/apitester/TesterApiWorkload.h + test/apitester/TesterTestSpec.cpp + test/apitester/TesterTestSpec.h + test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp + test/apitester/TesterBlobGranuleErrorsWorkload.cpp + test/apitester/TesterBlobGranuleUtil.cpp + test/apitester/TesterBlobGranuleUtil.h + test/apitester/TesterCancelTransactionWorkload.cpp + 
test/apitester/TesterCorrectnessWorkload.cpp + test/apitester/TesterExampleWorkload.cpp + test/apitester/TesterKeyValueStore.cpp + test/apitester/TesterKeyValueStore.h + test/apitester/TesterOptions.h + test/apitester/TesterScheduler.cpp + test/apitester/TesterScheduler.h + test/apitester/TesterTransactionExecutor.cpp + test/apitester/TesterTransactionExecutor.h + test/apitester/TesterUtil.cpp + test/apitester/TesterUtil.h + test/apitester/TesterWatchAndWaitWorkload.cpp + test/apitester/TesterWorkload.cpp + test/apitester/TesterWorkload.h + ) add_library(fdb_c_unit_tests_impl OBJECT ${UNIT_TEST_SRCS}) add_library(fdb_c_api_tester_impl OBJECT ${API_TESTER_SRCS}) - if (OPEN_FOR_IDE) + if(OPEN_FOR_IDE) add_library(fdb_c_performance_test OBJECT test/performance_test.c test/test.h) add_library(fdb_c_ryw_benchmark OBJECT test/ryw_benchmark.c test/test.h) add_library(fdb_c_txn_size_test OBJECT test/txn_size_test.c test/test.h) @@ -168,7 +172,7 @@ if (NOT WIN32) add_library(fdb_c_unit_tests_version_510 OBJECT ${UNIT_TEST_VERSION_510_SRCS}) add_library(trace_partial_file_suffix_test OBJECT ${TRACE_PARTIAL_FILE_SUFFIX_TEST_SRCS}) add_library(disconnected_timeout_unit_tests OBJECT ${DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS}) - else () + else() add_executable(fdb_c_performance_test test/performance_test.c test/test.h) add_executable(fdb_c_ryw_benchmark test/ryw_benchmark.c test/test.h) add_executable(fdb_c_txn_size_test test/txn_size_test.c test/test.h) @@ -186,7 +190,7 @@ if (NOT WIN32) strip_debug_symbols(fdb_c_ryw_benchmark) strip_debug_symbols(fdb_c_txn_size_test) strip_debug_symbols(fdb_c_client_memory_test) - endif () + endif() target_link_libraries(fdb_c_performance_test PRIVATE fdb_c Threads::Threads) target_link_libraries(fdb_c_ryw_benchmark PRIVATE fdb_c Threads::Threads) @@ -194,11 +198,10 @@ if (NOT WIN32) target_link_libraries(fdb_c_client_memory_test PRIVATE fdb_c Threads::Threads) target_include_directories(fdb_c_api_tester_impl PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include ${CMAKE_BINARY_DIR}/flow/include) - if (USE_SANITIZER) - target_link_libraries(fdb_c_api_tester_impl PRIVATE fdb_cpp toml11_target Threads::Threads fmt::fmt boost_asan) - else () - target_link_libraries(fdb_c_api_tester_impl PRIVATE fdb_cpp toml11_target Threads::Threads fmt::fmt boost_target) - endif () + target_link_libraries(fdb_c_api_tester_impl PRIVATE fdb_cpp toml11_target Threads::Threads fmt::fmt boost_target) + if(NOT APPLE) + target_link_libraries(fdb_c_api_tester_impl PRIVATE stdc++fs) + endif() target_link_libraries(fdb_c_api_tester_impl PRIVATE SimpleOpt) target_include_directories(fdb_c_unit_tests_impl PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/) @@ -211,253 +214,305 @@ if (NOT WIN32) # do not set RPATH for mako set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE) - if (USE_SANITIZER) - target_link_libraries(mako PRIVATE fdb_c fdbclient fmt::fmt Threads::Threads fdb_cpp boost_asan rapidjson) - else () - target_link_libraries(mako PRIVATE fdb_c fdbclient fmt::fmt Threads::Threads fdb_cpp boost_target rapidjson) - endif () + target_link_libraries(mako PRIVATE fdb_c fdbclient fmt::fmt Threads::Threads fdb_cpp boost_target rapidjson) - if (NOT OPEN_FOR_IDE) + if(NOT OPEN_FOR_IDE) # Make sure that fdb_c.h is compatible with c90 add_executable(fdb_c90_test test/fdb_c90_test.c) set_property(TARGET fdb_c90_test PROPERTY C_STANDARD 90) target_compile_options(fdb_c90_test PRIVATE -Wall -Wextra -Wpedantic -Werror) 
target_link_libraries(fdb_c90_test PRIVATE fdb_c) - endif () + endif() - if (OPEN_FOR_IDE) + if(OPEN_FOR_IDE) set(FDB_C_TARGET $) - else () + else() set(FDB_C_TARGET $) add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - COMMAND ${CMAKE_COMMAND} -E copy ${FDB_C_TARGET} ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - DEPENDS fdb_c - COMMENT "Copy libfdb_c to use as external client for test") + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + COMMAND ${CMAKE_COMMAND} -E copy ${FDB_C_TARGET} ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + DEPENDS fdb_c + COMMENT "Copy libfdb_c to use as external client for test") add_custom_target(external_client DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so) add_dependencies(fdb_c_unit_tests_impl external_client) add_dependencies(disconnected_timeout_unit_tests external_client) add_dependencies(fdb_c_api_tester_impl external_client) add_fdbclient_test( - NAME fdb_c_setup_tests - COMMAND $) + NAME fdb_c_setup_tests + COMMAND $) add_fdbclient_test( - NAME fdb_c_unit_tests - COMMAND $ - @CLUSTER_FILE@ - fdb) + NAME fdb_c_unit_tests + COMMAND $ + @CLUSTER_FILE@ + fdb) add_fdbclient_test( - NAME fdb_c_unit_tests_version_510 - COMMAND $ - @CLUSTER_FILE@ - fdb) + NAME fdb_c_unit_tests_version_510 + COMMAND $ + @CLUSTER_FILE@ + fdb) add_fdbclient_test( - NAME trace_partial_file_suffix_test - COMMAND $ - @CLUSTER_FILE@ - fdb) + NAME trace_partial_file_suffix_test + COMMAND $ + @CLUSTER_FILE@ + fdb) add_fdbclient_test( - NAME fdb_c_external_client_unit_tests - COMMAND $ - @CLUSTER_FILE@ - fdb - ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + NAME fdb_c_external_client_unit_tests + COMMAND $ + @CLUSTER_FILE@ + fdb + ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so ) add_unavailable_fdbclient_test( - NAME disconnected_timeout_unit_tests - COMMAND $ - @CLUSTER_FILE@ + NAME disconnected_timeout_unit_tests + COMMAND $ + @CLUSTER_FILE@ ) add_unavailable_fdbclient_test( - NAME disconnected_timeout_external_client_unit_tests - COMMAND $ - @CLUSTER_FILE@ - ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + NAME disconnected_timeout_external_client_unit_tests + COMMAND $ + @CLUSTER_FILE@ + ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so ) add_fdbclient_test( - NAME fdb_c_api_tests - DISABLE_LOG_DUMP - COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py - --cluster-file - @CLUSTER_FILE@ - --tester-binary - $ - --external-client-library - ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - --test-dir - ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests - --tmp-dir - @TMP_DIR@ - --log-dir - @LOG_DIR@ + NAME fdb_c_api_tests + DISABLE_LOG_DUMP + COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py + --cluster-file + @CLUSTER_FILE@ + --tester-binary + $ + --external-client-library + ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + --test-dir + ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests + --tmp-dir + @TMP_DIR@ + --log-dir + @LOG_DIR@ ) add_fdbclient_test( - NAME fdb_c_api_tests_blob_granule - DISABLE_LOG_DUMP - API_TEST_BLOB_GRANULES_ENABLED - COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py - --cluster-file - @CLUSTER_FILE@ - --tester-binary - $ - --external-client-library - ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - --test-dir - ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/blobgranuletests - --blob-granule-local-file-path - @DATA_DIR@/fdbblob/ - --tmp-dir - @TMP_DIR@ - --log-dir - @LOG_DIR@ + NAME fdb_c_api_tests_local_only + DISABLE_LOG_DUMP 
+ COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py + --cluster-file + @CLUSTER_FILE@ + --tester-binary + $ + --test-dir + ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/local_tests + --tmp-dir + @TMP_DIR@ + --log-dir + @LOG_DIR@ ) add_fdbclient_test( - NAME fdb_c_api_tests_with_tls - DISABLE_LOG_DUMP - TLS_ENABLED - COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py - --cluster-file - @CLUSTER_FILE@ - --tester-binary - $ - --external-client-library - ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so - --test-dir - ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests - --tmp-dir - @TMP_DIR@ - --log-dir - @LOG_DIR@ - --tls-cert-file - @CLIENT_CERT_FILE@ - --tls-key-file - @CLIENT_KEY_FILE@ - --tls-ca-file - @SERVER_CA_FILE@ + NAME fdb_c_api_tests_blob_granule + DISABLE_LOG_DUMP + API_TEST_BLOB_GRANULES_ENABLED + COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py + --cluster-file + @CLUSTER_FILE@ + --tester-binary + $ + --external-client-library + ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + --test-dir + ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/blobgranuletests + --blob-granule-local-file-path + @DATA_DIR@/fdbblob/ + --tmp-dir + @TMP_DIR@ + --log-dir + @LOG_DIR@ ) - if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT USE_SANITIZER) + add_fdbclient_test( + NAME fdb_c_api_tests_with_tls + DISABLE_LOG_DUMP + TLS_ENABLED + COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py + --cluster-file + @CLUSTER_FILE@ + --tester-binary + $ + --external-client-library + ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + --test-dir + ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests + --tmp-dir + @TMP_DIR@ + --log-dir + @LOG_DIR@ + --tls-cert-file + @CLIENT_CERT_FILE@ + --tls-key-file + @CLIENT_KEY_FILE@ + --tls-ca-file + @SERVER_CA_FILE@ + ) + + add_test(NAME fdb_c_upgrade_to_future_version + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.2.0" "7.3.0" "7.2.0" + --process-number 3 + ) + set_tests_properties("fdb_c_upgrade_to_future_version" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") + + add_test(NAME fdb_c_upgrade_to_future_version_blob_granules + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml + --upgrade-path "7.2.0" "7.3.0" "7.2.0" + --blob-granules-enabled + --process-number 3 + ) + + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT USE_SANITIZER) add_test(NAME fdb_c_upgrade_single_threaded_630api - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml - --upgrade-path "6.3.23" "7.0.0" "7.1.9" "7.2.0" - --process-number 1 - ) + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml + --upgrade-path "6.3.23" "7.0.0" "7.1.9" "7.2.0" + --process-number 1 + ) add_test(NAME fdb_c_upgrade_single_threaded_700api - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file 
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml - --upgrade-path "7.0.0" "7.1.9" "7.2.0" - --process-number 1 - ) + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml + --upgrade-path "7.0.0" "7.1.9" "7.2.0" + --process-number 1 + ) add_test(NAME fdb_c_upgrade_multi_threaded_630api - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "6.3.23" "7.0.0" "7.1.9" "7.2.0" "7.1.9" - --process-number 3 - ) + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "6.3.23" "7.0.0" "7.1.9" "7.2.0" "7.1.9" + --process-number 3 + ) add_test(NAME fdb_c_upgrade_multi_threaded_700api - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.0.0" "7.1.9" "7.2.0" "7.1.9" - --process-number 3 - ) + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.0.0" "7.1.9" "7.2.0" "7.1.9" + --process-number 3 + ) add_test(NAME fdb_c_upgrade_multi_threaded_710api - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.1.9" "7.2.0" "7.1.9" - --process-number 3 - ) + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.1.9" "7.2.0" "7.1.9" + --process-number 3 + ) add_test(NAME fdb_c_cluster_wiggle - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.2.0" "wiggle" - --disable-log-dump - --process-number 3 - --redundancy double - ) + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.2.0" "wiggle" + --disable-log-dump + --process-number 3 + --redundancy double + ) add_test(NAME fdb_c_wiggle_and_upgrade_latest - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "7.1.9" "wiggle" "7.2.0" - --disable-log-dump - --process-number 3 - --redundancy double - ) + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.1.9" "wiggle" "7.2.0" + --disable-log-dump + --process-number 3 + --redundancy double + ) add_test(NAME 
fdb_c_wiggle_and_upgrade_63 - COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py - --build-dir ${CMAKE_BINARY_DIR} - --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml - --upgrade-path "6.3.24" "wiggle" "7.0.0" - --disable-log-dump - --process-number 3 - --redundancy double - ) + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "6.3.24" "wiggle" "7.0.0" + --disable-log-dump + --process-number 3 + --redundancy double + ) - endif () - - endif () -endif () + endif() + endif() +endif() set(c_workloads_srcs - test/workloads/workloads.cpp - test/workloads/workloads.h - test/workloads/SimpleWorkload.cpp) + test/workloads/workloads.cpp + test/workloads/workloads.h + test/workloads/SimpleWorkload.cpp) -if (OPEN_FOR_IDE) +if(OPEN_FOR_IDE) add_library(c_workloads OBJECT ${c_workloads_srcs}) -else () +else() add_library(c_workloads SHARED ${c_workloads_srcs}) -endif () +endif() set_target_properties(c_workloads PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/share/foundationdb") + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/share/foundationdb") target_link_libraries(c_workloads PUBLIC fdb_c) -if (NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) +if(NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) target_link_options(c_workloads PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/external_workload.map,-z,nodelete") -endif () +endif() # Generate shim library in Linux builds -if (OPEN_FOR_IDE) +if(OPEN_FOR_IDE) - add_library(fdb_c_shim OBJECT fdb_c_shim.cpp) + add_library(fdb_c_shim OBJECT foundationdb/fdb_c_shim.h fdb_c_shim.cpp) target_link_libraries(fdb_c_shim PUBLIC dl) + target_include_directories(fdb_c_shim PUBLIC + $ + $ + $) -elseif (NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) # Linux Only + add_library(fdb_c_shim_lib_tester OBJECT test/shim_lib_tester.cpp) + target_link_libraries(fdb_c_shim_lib_tester PRIVATE fdb_c_shim SimpleOpt fdb_cpp Threads::Threads) + target_include_directories(fdb_c_shim_lib_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include) + +elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-santizer only set(SHIM_LIB_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(SHIM_LIB_GEN_SRC - ${SHIM_LIB_OUTPUT_DIR}/libfdb_c.so.init.c - ${SHIM_LIB_OUTPUT_DIR}/libfdb_c.so.tramp.S) + ${SHIM_LIB_OUTPUT_DIR}/libfdb_c.so.init.c + ${SHIM_LIB_OUTPUT_DIR}/libfdb_c.so.tramp.S) + + set(IMPLIBSO_SRC_DIR ${CMAKE_SOURCE_DIR}/contrib/Implib.so) + set(IMPLIBSO_SRC + ${IMPLIBSO_SRC_DIR}/implib-gen.py + ${IMPLIBSO_SRC_DIR}/arch/common/init.c.tpl + ${IMPLIBSO_SRC_DIR}/arch/${CMAKE_SYSTEM_PROCESSOR}/config.ini + ${IMPLIBSO_SRC_DIR}/arch/${CMAKE_SYSTEM_PROCESSOR}/table.S.tpl + ${IMPLIBSO_SRC_DIR}/arch/${CMAKE_SYSTEM_PROCESSOR}/trampoline.S.tpl + ) add_custom_command(OUTPUT ${SHIM_LIB_GEN_SRC} - COMMAND $ ${CMAKE_SOURCE_DIR}/contrib/Implib.so/implib-gen.py - --target ${CMAKE_SYSTEM_PROCESSOR} - --outdir ${SHIM_LIB_OUTPUT_DIR} - --dlopen-callback=fdb_shim_dlopen_callback - $) + COMMAND $ ${IMPLIBSO_SRC_DIR}/implib-gen.py + --target ${CMAKE_SYSTEM_PROCESSOR} + --outdir ${SHIM_LIB_OUTPUT_DIR} + --dlopen-callback=fdb_shim_dlopen_callback + $ + DEPENDS ${IMPLIBSO_SRC} fdb_c + COMMENT "Generating source code for C shim library") - add_library(fdb_c_shim SHARED ${SHIM_LIB_GEN_SRC} fdb_c_shim.cpp) + 
add_library(fdb_c_shim STATIC ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp) target_link_options(fdb_c_shim PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.map,-z,nodelete,-z,noexecstack") target_link_libraries(fdb_c_shim PUBLIC dl) + target_include_directories(fdb_c_shim PUBLIC + $ + $ + $) add_executable(fdb_c_shim_unit_tests) target_link_libraries(fdb_c_shim_unit_tests PRIVATE fdb_c_shim fdb_c_unit_tests_impl) @@ -465,15 +520,20 @@ elseif (NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE) # Linux Only add_executable(fdb_c_shim_api_tester) target_link_libraries(fdb_c_shim_api_tester PRIVATE fdb_c_shim fdb_c_api_tester_impl) - add_test(NAME fdb_c_shim_library_tests - COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/test/fdb_c_shim_tests.py - --build-dir ${CMAKE_BINARY_DIR} - --unit-tests-bin $ - --api-tester-bin $ - --api-test-dir ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests - ) + add_executable(fdb_c_shim_lib_tester test/shim_lib_tester.cpp) + target_link_libraries(fdb_c_shim_lib_tester PRIVATE fdb_c_shim SimpleOpt fdb_cpp Threads::Threads) + target_include_directories(fdb_c_shim_lib_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include) -endif () # End Linux only + add_test(NAME fdb_c_shim_library_tests + COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/test/fdb_c_shim_tests.py + --build-dir ${CMAKE_BINARY_DIR} + --unit-tests-bin $ + --api-tester-bin $ + --shim-lib-tester-bin $ + --api-test-dir ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests + ) + +endif() # End Linux only, non-sanitizer only # TODO: re-enable once the old vcxproj-based build system is removed. #generate_export_header(fdb_c EXPORT_MACRO_NAME "DLLEXPORT" @@ -485,35 +545,51 @@ set(version_config "${generated_dir}/${targets_export_name}ConfigVersion.cmake") set(project_config "${generated_dir}/${targets_export_name}Config.cmake") include(CMakePackageConfigHelpers) write_basic_package_version_file( - "${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY AnyNewerVersion + "${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY AnyNewerVersion ) configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY) fdb_install( - TARGETS fdb_c - EXPORT ${targets_export_name} - DESTINATION lib - COMPONENT clients) + TARGETS fdb_c + EXPORT ${targets_export_name} + DESTINATION lib + COMPONENT clients) fdb_install( - FILES foundationdb/fdb_c.h - ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/fdb_c_options.g.h - ${CMAKE_SOURCE_DIR}/fdbclient/vexillographer/fdb.options - ${CMAKE_SOURCE_DIR}/bindings/c/foundationdb/fdb_c_types.h - DESTINATION include - DESTINATION_SUFFIX /foundationdb - COMPONENT clients) + FILES foundationdb/fdb_c.h + ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/fdb_c_options.g.h + ${CMAKE_SOURCE_DIR}/fdbclient/vexillographer/fdb.options + ${CMAKE_SOURCE_DIR}/bindings/c/foundationdb/fdb_c_types.h + DESTINATION include + DESTINATION_SUFFIX /foundationdb + COMPONENT clients) fdb_install( - FILES "${project_config}" "${version_config}" - DESTINATION lib - DESTINATION_SUFFIX "/cmake/${targets_export_name}" - COMPONENT clients) + FILES "${project_config}" "${version_config}" + DESTINATION lib + DESTINATION_SUFFIX "/cmake/${targets_export_name}" + COMPONENT clients) fdb_configure_and_install( - FILE "${PROJECT_SOURCE_DIR}/cmake/foundationdb-client.pc.in" - DESTINATION lib - DESTINATION_SUFFIX "/pkgconfig" - COMPONENT clients) + FILE "${PROJECT_SOURCE_DIR}/cmake/foundationdb-client.pc.in" + DESTINATION lib + 
DESTINATION_SUFFIX "/pkgconfig" + COMPONENT clients) fdb_install( - EXPORT ${targets_export_name} - DESTINATION lib - DESTINATION_SUFFIX "/cmake/${targets_export_name}" - COMPONENT clients) + EXPORT ${targets_export_name} + DESTINATION lib + DESTINATION_SUFFIX "/cmake/${targets_export_name}" + COMPONENT clients) + +if(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-sanitizer only + + fdb_install( + FILES foundationdb/fdb_c_shim.h + DESTINATION include + DESTINATION_SUFFIX /foundationdb + COMPONENT clients) + + fdb_install( + TARGETS fdb_c_shim + EXPORT ${targets_export_name} + DESTINATION lib + COMPONENT clients) + +endif() # End Linux only, non-ubsan only diff --git a/bindings/c/fdb_c.cpp b/bindings/c/fdb_c.cpp index c97604b98c..4b225ddd80 100644 --- a/bindings/c/fdb_c.cpp +++ b/bindings/c/fdb_c.cpp @@ -79,9 +79,10 @@ extern "C" DLLEXPORT fdb_bool_t fdb_error_predicate(int predicate_test, fdb_erro if (predicate_test == FDBErrorPredicates::RETRYABLE_NOT_COMMITTED) { return code == error_code_not_committed || code == error_code_transaction_too_old || code == error_code_future_version || code == error_code_database_locked || - code == error_code_proxy_memory_limit_exceeded || code == error_code_batch_transaction_throttled || - code == error_code_process_behind || code == error_code_tag_throttled || - code == error_code_unknown_tenant; + code == error_code_grv_proxy_memory_limit_exceeded || + code == error_code_commit_proxy_memory_limit_exceeded || + code == error_code_batch_transaction_throttled || code == error_code_process_behind || + code == error_code_tag_throttled || code == error_code_unknown_tenant; } return false; } @@ -238,6 +239,10 @@ fdb_error_t fdb_future_get_version_v619(FDBFuture* f, int64_t* out_version) { CATCH_AND_RETURN(*out_version = TSAV(Version, f)->get();); } +extern "C" DLLEXPORT fdb_error_t fdb_future_get_bool(FDBFuture* f, fdb_bool_t* out_value) { + CATCH_AND_RETURN(*out_value = TSAV(bool, f)->get();); +} + extern "C" DLLEXPORT fdb_error_t fdb_future_get_int64(FDBFuture* f, int64_t* out_value) { CATCH_AND_RETURN(*out_value = TSAV(int64_t, f)->get();); } @@ -319,6 +324,15 @@ extern "C" DLLEXPORT fdb_error_t fdb_future_get_key_array(FDBFuture* f, FDBKey c *out_count = na.size();); } +extern "C" DLLEXPORT fdb_error_t fdb_future_get_granule_summary_array(FDBFuture* f, + FDBGranuleSummary const** out_ranges, + int* out_count) { + CATCH_AND_RETURN(Standalone> na = + TSAV(Standalone>, f)->get(); + *out_ranges = (FDBGranuleSummary*)na.begin(); + *out_count = na.size();); +} + extern "C" DLLEXPORT void fdb_result_destroy(FDBResult* r) { CATCH_AND_DIE(TSAVB(r)->cancel();); } @@ -493,6 +507,58 @@ extern "C" DLLEXPORT FDBFuture* fdb_database_wait_purge_granules_complete(FDBDat FDBFuture*)(DB(db)->waitPurgeGranulesComplete(StringRef(purge_key_name, purge_key_name_length)).extractPtr()); } +extern "C" DLLEXPORT FDBFuture* fdb_database_blobbify_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length) { + return (FDBFuture*)(DB(db) + ->blobbifyRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length), + StringRef(end_key_name, end_key_name_length))) + .extractPtr()); +} + +extern "C" DLLEXPORT FDBFuture* fdb_database_unblobbify_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length) { + return (FDBFuture*)(DB(db) + ->unblobbifyRange(KeyRangeRef(StringRef(begin_key_name, 
begin_key_name_length), + StringRef(end_key_name, end_key_name_length))) + .extractPtr()); +} + +extern "C" DLLEXPORT FDBFuture* fdb_database_list_blobbified_ranges(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int rangeLimit) { + return (FDBFuture*)(DB(db) + ->listBlobbifiedRanges(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length), + StringRef(end_key_name, end_key_name_length)), + rangeLimit) + .extractPtr()); +} + +extern "C" DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_verify_blob_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t version) { + Optional rv; + if (version != latestVersion) { + rv = version; + } + return (FDBFuture*)(DB(db) + ->verifyBlobRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length), + StringRef(end_key_name, end_key_name_length)), + rv) + .extractPtr()); +} + extern "C" DLLEXPORT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction) { CATCH_AND_RETURN(*out_transaction = (FDBTransaction*)TENANT(tenant)->createTransaction().extractPtr();); } @@ -855,11 +921,12 @@ extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_blob_granule_ranges(FDBTrans uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, - int end_key_name_length) { + int end_key_name_length, + int rangeLimit) { RETURN_FUTURE_ON_ERROR( Standalone>, KeyRangeRef range(KeyRef(begin_key_name, begin_key_name_length), KeyRef(end_key_name, end_key_name_length)); - return (FDBFuture*)(TXN(tr)->getBlobGranuleRanges(range).extractPtr());); + return (FDBFuture*)(TXN(tr)->getBlobGranuleRanges(range, rangeLimit).extractPtr());); } extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules(FDBTransaction* tr, @@ -889,6 +956,74 @@ extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules(FDBTransactio return (FDBResult*)(TXN(tr)->readBlobGranules(range, beginVersion, rv, context).extractPtr());); } +extern "C" DLLEXPORT FDBFuture* fdb_transaction_read_blob_granules_start(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + int64_t* readVersionOut) { + Optional rv; + if (readVersion != latestVersion) { + rv = readVersion; + } + return (FDBFuture*)(TXN(tr) + ->readBlobGranulesStart(KeyRangeRef(KeyRef(begin_key_name, begin_key_name_length), + KeyRef(end_key_name, end_key_name_length)), + beginVersion, + rv, + readVersionOut) + .extractPtr()); +} + +extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules_finish(FDBTransaction* tr, + FDBFuture* f, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + FDBReadBlobGranuleContext* granule_context) { + // FIXME: better way to convert? 
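For orientation, here is a minimal, hypothetical usage sketch of two of the database-level blob granule calls added in this hunk (fdb_database_blobbify_range and fdb_database_list_blobbified_ranges). It is not part of the patch: the key names, the range limit, and the check() error helper are placeholders, and it assumes an already-created FDBDatabase* at API version 720.

```cpp
#define FDB_API_VERSION 720
#include <foundationdb/fdb_c.h>

#include <cstdio>
#include <cstdlib>

// Illustrative error helper (not part of fdb_c).
static void check(fdb_error_t err) {
    if (err) {
        std::fprintf(stderr, "fdb error %d: %s\n", err, fdb_get_error(err));
        std::abort();
    }
}

static void blobbifyAndList(FDBDatabase* db) {
    const uint8_t* begin = (const uint8_t*)"bg/";
    const uint8_t* end = (const uint8_t*)"bg0";

    // Ask the cluster to back ["bg/", "bg0") with blob granules; the result is a boolean,
    // read with the fdb_future_get_bool() accessor introduced by this patch.
    FDBFuture* f = fdb_database_blobbify_range(db, begin, 3, end, 3);
    check(fdb_future_block_until_ready(f));
    fdb_bool_t accepted = 0;
    check(fdb_future_get_bool(f, &accepted));
    fdb_future_destroy(f);

    // Enumerate up to 100 blobbified ranges overlapping the same span.
    FDBFuture* l = fdb_database_list_blobbified_ranges(db, begin, 3, end, 3, /*rangeLimit*/ 100);
    check(fdb_future_block_until_ready(l));
    const FDBKeyRange* ranges = nullptr;
    int count = 0;
    check(fdb_future_get_keyrange_array(l, &ranges, &count));
    std::printf("blobbify accepted=%d, %d blobbified range(s)\n", (int)accepted, count);
    fdb_future_destroy(l);
}
```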
+ ReadBlobGranuleContext context; + context.userContext = granule_context->userContext; + context.start_load_f = granule_context->start_load_f; + context.get_load_f = granule_context->get_load_f; + context.free_load_f = granule_context->free_load_f; + context.debugNoMaterialize = granule_context->debugNoMaterialize; + context.granuleParallelism = granule_context->granuleParallelism; + ThreadFuture>> startFuture( + TSAV(Standalone>, f)); + + return (FDBResult*)(TXN(tr) + ->readBlobGranulesFinish(startFuture, + KeyRangeRef(KeyRef(begin_key_name, begin_key_name_length), + KeyRef(end_key_name, end_key_name_length)), + beginVersion, + readVersion, + context) + .extractPtr()); +} + +extern "C" DLLEXPORT FDBFuture* fdb_transaction_summarize_blob_granules(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t summaryVersion, + int rangeLimit) { + RETURN_FUTURE_ON_ERROR( + Standalone>, + KeyRangeRef range(KeyRef(begin_key_name, begin_key_name_length), KeyRef(end_key_name, end_key_name_length)); + + Optional sv; + if (summaryVersion != latestVersion) { sv = summaryVersion; } + + return (FDBFuture*)(TXN(tr)->summarizeBlobGranules(range, sv, rangeLimit).extractPtr());); +} + #include "fdb_c_function_pointers.g.h" #define FDB_API_CHANGED(func, ver) \ @@ -964,6 +1099,10 @@ extern "C" DLLEXPORT const char* fdb_get_client_version() { return API->getClientVersion(); } +extern "C" DLLEXPORT void fdb_use_future_protocol_version() { + API->useFutureProtocolVersion(); +} + #if defined(__APPLE__) #include __attribute__((constructor)) static void initialize() { diff --git a/bindings/c/fdb_c_shim.cpp b/bindings/c/fdb_c_shim.cpp index 01c6bf9e6c..855db0313f 100644 --- a/bindings/c/fdb_c_shim.cpp +++ b/bindings/c/fdb_c_shim.cpp @@ -20,25 +20,42 @@ #if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__)) +#define DLLEXPORT __attribute__((visibility("default"))) + +#include "foundationdb/fdb_c_shim.h" + #include -#include -#include #include -static const char* FDB_C_CLIENT_LIBRARY_PATH = "FDB_C_CLIENT_LIBRARY_PATH"; +namespace { -// Callback that tries different library names +const char* FDB_LOCAL_CLIENT_LIBRARY_PATH_ENVVAR = "FDB_LOCAL_CLIENT_LIBRARY_PATH"; +std::string g_fdbLocalClientLibraryPath; + +} // namespace + +extern "C" DLLEXPORT void fdb_shim_set_local_client_library_path(const char* filePath) { + g_fdbLocalClientLibraryPath = filePath; +} + +/* The callback of the fdb_c_shim layer that determines the path + of the fdb_c library to be dynamically loaded + */ extern "C" void* fdb_shim_dlopen_callback(const char* libName) { std::string libPath; - char* val = getenv(FDB_C_CLIENT_LIBRARY_PATH); - if (val) { - libPath = val; + if (!g_fdbLocalClientLibraryPath.empty()) { + libPath = g_fdbLocalClientLibraryPath; } else { - libPath = libName; + char* val = getenv(FDB_LOCAL_CLIENT_LIBRARY_PATH_ENVVAR); + if (val) { + libPath = val; + } else { + libPath = libName; + } } return dlopen(libPath.c_str(), RTLD_LAZY | RTLD_GLOBAL); } #else #error Port me! 
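Similarly, a hypothetical sketch of consuming the new fdb_transaction_summarize_blob_granules() call implemented just above, together with the FDBGranuleSummary struct and fdb_future_get_granule_summary_array() accessor declared later in this patch. It reuses the includes and the illustrative check() helper from the previous sketch; the key range, the -2 (latestVersion) sentinel, and the range limit are placeholders.

```cpp
// Hypothetical sketch: summarize blob granules for a range within an existing transaction.
static void printGranuleSummaries(FDBTransaction* tr) {
    const uint8_t* begin = (const uint8_t*)"bg/";
    const uint8_t* end = (const uint8_t*)"bg0";

    // -2 (latestVersion) asks for a summary at the latest version; 1000 caps the range count.
    FDBFuture* f = fdb_transaction_summarize_blob_granules(tr, begin, 3, end, 3, -2, 1000);
    check(fdb_future_block_until_ready(f));

    const FDBGranuleSummary* summaries = nullptr;
    int count = 0;
    check(fdb_future_get_granule_summary_array(f, &summaries, &count));
    for (int i = 0; i < count; ++i) {
        std::printf("granule %d: snapshot v%lld (%lld bytes), deltas to v%lld (%lld bytes)\n",
                    i,
                    (long long)summaries[i].snapshot_version,
                    (long long)summaries[i].snapshot_size,
                    (long long)summaries[i].delta_version,
                    (long long)summaries[i].delta_size);
    }
    fdb_future_destroy(f);
}
```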
-#endif \ No newline at end of file +#endif diff --git a/bindings/c/foundationdb/fdb_c.h b/bindings/c/foundationdb/fdb_c.h index 8b874c7ddf..2e4e977d76 100644 --- a/bindings/c/foundationdb/fdb_c.h +++ b/bindings/c/foundationdb/fdb_c.h @@ -70,17 +70,26 @@ DLLEXPORT fdb_bool_t fdb_error_predicate(int predicate_test, fdb_error_t code); #define /* fdb_error_t */ fdb_select_api_version(v) fdb_select_api_version_impl(v, FDB_API_VERSION) +/* + * A variant of fdb_select_api_version that caps the header API version by the maximum API version + * supported by the client library. It is intended mainly for use in combination with the shim + * layer, which loads the client library dynamically. + */ +#define /* fdb_error_t */ fdb_select_api_version_capped(v) \ + fdb_select_api_version_impl( \ + v, FDB_API_VERSION < fdb_get_max_api_version() ? FDB_API_VERSION : fdb_get_max_api_version()) + DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_network_set_option(FDBNetworkOption option, uint8_t const* value, int value_length); #if FDB_API_VERSION >= 14 -DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_setup_network(); +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_setup_network(void); #endif -DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_run_network(); +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_run_network(void); -DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_stop_network(); +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_stop_network(void); DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_add_network_thread_completion_hook(void (*hook)(void*), void* hook_parameter); @@ -170,6 +179,14 @@ typedef struct keyrange { const uint8_t* end_key; int end_key_length; } FDBKeyRange; + +typedef struct granulesummary { + FDBKeyRange key_range; + int64_t snapshot_version; + int64_t snapshot_size; + int64_t delta_version; + int64_t delta_size; +} FDBGranuleSummary; #pragma pack(pop) typedef struct readgranulecontext { @@ -218,6 +235,8 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_set_callback(FDBFuture* f, DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_error(FDBFuture* f); #endif +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_bool(FDBFuture* f, fdb_bool_t* out); + DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_int64(FDBFuture* f, int64_t* out); DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_uint64(FDBFuture* f, uint64_t* out); @@ -253,6 +272,10 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_keyrange_array(FDBFuture FDBKeyRange const** out_ranges, int* out_count); +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_granule_summary_array(FDBFuture* f, + FDBGranuleSummary const** out_summaries, + int* out_count); + /* FDBResult is a synchronous computation result, as opposed to a future that is asynchronous. 
*/ DLLEXPORT void fdb_result_destroy(FDBResult* r); @@ -312,6 +335,32 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_wait_purge_granules_complet uint8_t const* purge_key_name, int purge_key_name_length); +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_blobbify_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length); + +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_unblobbify_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length); + +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_list_blobbified_ranges(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int rangeLimit); + +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_verify_blob_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t version); + DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction); @@ -470,7 +519,8 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_blob_granule_ranges( uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, - int end_key_name_length); + int end_key_name_length, + int rangeLimit); /* LatestVersion (-2) for readVersion means get read version from transaction Separated out as optional because BG reads can support longer-lived reads than normal FDB transactions */ @@ -483,6 +533,14 @@ DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_transaction_read_blob_granules(FDBTr int64_t readVersion, FDBReadBlobGranuleContext granuleContext); +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_summarize_blob_granules(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t summaryVersion, + int rangeLimit); + #define FDB_KEYSEL_LAST_LESS_THAN(k, l) k, l, 0, 0 #define FDB_KEYSEL_LAST_LESS_OR_EQUAL(k, l) k, l, 1, 0 #define FDB_KEYSEL_FIRST_GREATER_THAN(k, l) k, l, 1, 1 @@ -490,8 +548,8 @@ DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_transaction_read_blob_granules(FDBTr DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_select_api_version_impl(int runtime_version, int header_version); -DLLEXPORT int fdb_get_max_api_version(); -DLLEXPORT const char* fdb_get_client_version(); +DLLEXPORT int fdb_get_max_api_version(void); +DLLEXPORT const char* fdb_get_client_version(void); /* LEGACY API VERSIONS */ diff --git a/bindings/c/foundationdb/fdb_c_internal.h b/bindings/c/foundationdb/fdb_c_internal.h index 2b1a2163c7..62b77f354e 100644 --- a/bindings/c/foundationdb/fdb_c_internal.h +++ b/bindings/c/foundationdb/fdb_c_internal.h @@ -49,6 +49,29 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_shared_state(FDBFuture* DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_create_database_from_connection_string(const char* connection_string, FDBDatabase** out_database); +DLLEXPORT void fdb_use_future_protocol_version(); + +// the logical read_blob_granules is broken out (at different points depending on the client type) into the asynchronous +// start() that happens on the fdb network thread, and synchronous finish() that happens off it +DLLEXPORT FDBFuture* fdb_transaction_read_blob_granules_start(FDBTransaction* tr, + uint8_t const* begin_key_name, + 
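The header change above adds a rangeLimit parameter to fdb_transaction_get_blob_granule_ranges(). Below is a hypothetical call site updated for the new signature, again reusing the check() helper from the earlier sketch; the key names and the limit are placeholders.

```cpp
// Hypothetical sketch: enumerate blob granule boundaries for a range inside a transaction.
static void listGranuleRanges(FDBTransaction* tr) {
    const uint8_t* begin = (const uint8_t*)"bg/";
    const uint8_t* end = (const uint8_t*)"bg0";

    // The trailing argument is the rangeLimit introduced by this patch.
    FDBFuture* f = fdb_transaction_get_blob_granule_ranges(tr, begin, 3, end, 3, /*rangeLimit*/ 1000);
    check(fdb_future_block_until_ready(f));
    const FDBKeyRange* ranges = nullptr;
    int count = 0;
    check(fdb_future_get_keyrange_array(f, &ranges, &count));
    for (int i = 0; i < count; ++i) {
        std::printf("granule range %d: %d-byte begin key, %d-byte end key\n",
                    i, ranges[i].begin_key_length, ranges[i].end_key_length);
    }
    fdb_future_destroy(f);
}
```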
int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + int64_t* readVersionOut); + +DLLEXPORT FDBResult* fdb_transaction_read_blob_granules_finish(FDBTransaction* tr, + FDBFuture* f, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + FDBReadBlobGranuleContext* granuleContext); + #ifdef __cplusplus } #endif diff --git a/bindings/c/foundationdb/fdb_c_shim.h b/bindings/c/foundationdb/fdb_c_shim.h new file mode 100644 index 0000000000..44dfdc7348 --- /dev/null +++ b/bindings/c/foundationdb/fdb_c_shim.h @@ -0,0 +1,47 @@ +/* + * fdb_shim_c.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FDB_SHIM_C_H +#define FDB_SHIM_C_H +#pragma once + +#ifndef DLLEXPORT +#define DLLEXPORT +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Specify the path of the local libfdb_c.so library to be dynamically loaded by the shim layer + * + * This enables running the same application code with different client library versions, + * e.g. using the latest development build for testing new features, but still using the latest + * stable release in production deployments. 
+ * + * The given path overrides the environment variable FDB_LOCAL_CLIENT_LIBRARY_PATH + */ +DLLEXPORT void fdb_shim_set_local_client_library_path(const char* filePath); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/bindings/c/test/apitester/TesterApiWorkload.cpp b/bindings/c/test/apitester/TesterApiWorkload.cpp index c1499adb0c..90d4e18bd2 100644 --- a/bindings/c/test/apitester/TesterApiWorkload.cpp +++ b/bindings/c/test/apitester/TesterApiWorkload.cpp @@ -41,6 +41,10 @@ ApiWorkload::ApiWorkload(const WorkloadConfig& config) : WorkloadBase(config) { stopReceived = false; checkingProgress = false; apiVersion = config.apiVersion; + + for (int i = 0; i < config.numTenants; ++i) { + tenants.push_back(fdb::ByteString(fdb::toBytesRef("tenant" + std::to_string(i)))); + } } IWorkloadControlIfc* ApiWorkload::getControlIfc() { @@ -107,49 +111,57 @@ void ApiWorkload::randomOperation(TTaskFct cont) { } fdb::Key ApiWorkload::randomKeyName() { - return keyPrefix + Random::get().randomStringLowerCase(minKeyLength, maxKeyLength); + return keyPrefix + Random::get().randomByteStringLowerCase(minKeyLength, maxKeyLength); } fdb::Value ApiWorkload::randomValue() { - return Random::get().randomStringLowerCase(minValueLength, maxValueLength); + return Random::get().randomByteStringLowerCase(minValueLength, maxValueLength); } -fdb::Key ApiWorkload::randomNotExistingKey() { +fdb::Key ApiWorkload::randomNotExistingKey(std::optional tenantId) { while (true) { fdb::Key key = randomKeyName(); - if (!store.exists(key)) { + if (!stores[tenantId].exists(key)) { return key; } } } -fdb::Key ApiWorkload::randomExistingKey() { +fdb::Key ApiWorkload::randomExistingKey(std::optional tenantId) { fdb::Key genKey = randomKeyName(); - fdb::Key key = store.getKey(genKey, true, 1); - if (key != store.endKey()) { + fdb::Key key = stores[tenantId].getKey(genKey, true, 1); + if (key != stores[tenantId].endKey()) { return key; } - key = store.getKey(genKey, true, 0); - if (key != store.startKey()) { + key = stores[tenantId].getKey(genKey, true, 0); + if (key != stores[tenantId].startKey()) { return key; } info("No existing key found, using a new random key."); return genKey; } -fdb::Key ApiWorkload::randomKey(double existingKeyRatio) { +fdb::Key ApiWorkload::randomKey(double existingKeyRatio, std::optional tenantId) { if (Random::get().randomBool(existingKeyRatio)) { - return randomExistingKey(); + return randomExistingKey(tenantId); } else { - return randomNotExistingKey(); + return randomNotExistingKey(tenantId); } } -void ApiWorkload::populateDataTx(TTaskFct cont) { +std::optional ApiWorkload::randomTenant() { + if (tenants.size() > 0) { + return Random::get().randomInt(0, tenants.size() - 1); + } else { + return {}; + } +} + +void ApiWorkload::populateDataTx(TTaskFct cont, std::optional tenantId) { int numKeys = maxKeysPerTransaction; auto kvPairs = std::make_shared>(); for (int i = 0; i < numKeys; i++) { - kvPairs->push_back(fdb::KeyValue{ randomNotExistingKey(), randomValue() }); + kvPairs->push_back(fdb::KeyValue{ randomNotExistingKey(tenantId), randomValue() }); } execTransaction( [kvPairs](auto ctx) { @@ -158,12 +170,29 @@ void ApiWorkload::populateDataTx(TTaskFct cont) { } ctx->commit(); }, - [this, kvPairs, cont]() { + [this, tenantId, kvPairs, cont]() { for (const fdb::KeyValue& kv : *kvPairs) { - store.set(kv.key, kv.value); + stores[tenantId].set(kv.key, kv.value); } schedule(cont); - }); + }, + getTenant(tenantId)); +} + +void ApiWorkload::clearTenantData(TTaskFct cont, std::optional tenantId) { + 
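To illustrate how the new shim header is meant to be used, here is a hypothetical snippet for an application linked against fdb_c_shim: it picks the real client library at run time and then negotiates the API version with the fdb_select_api_version_capped() macro added to fdb_c.h earlier in this patch. The library path is a placeholder, and calling the setter before any other fdb_c function is an assumption based on the dlopen callback shown above.

```cpp
#include <foundationdb/fdb_c_shim.h>

#define FDB_API_VERSION 720
#include <foundationdb/fdb_c.h>

int main(int argc, char** argv) {
    // Point the shim at a concrete libfdb_c before anything triggers its dlopen callback;
    // this overrides the FDB_LOCAL_CLIENT_LIBRARY_PATH environment variable.
    if (argc > 1) {
        fdb_shim_set_local_client_library_path(argv[1]); // e.g. /usr/lib/libfdb_c.so.7.1.9
    }

    // Negotiate down if the dynamically loaded client supports an older API than the header.
    int apiVersion = fdb_get_max_api_version();
    if (apiVersion > FDB_API_VERSION) {
        apiVersion = FDB_API_VERSION;
    }
    return fdb_select_api_version_capped(apiVersion);
}
```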
execTransaction( + [this](auto ctx) { + ctx->tx().clearRange(keyPrefix, keyPrefix + fdb::Key(1, '\xff')); + ctx->commit(); + }, + [this, tenantId, cont]() { + if (tenantId && tenantId.value() < tenants.size() - 1) { + clearTenantData(cont, tenantId.value() + 1); + } else { + schedule(cont); + } + }, + getTenant(tenantId)); } void ApiWorkload::clearData(TTaskFct cont) { @@ -175,20 +204,51 @@ void ApiWorkload::clearData(TTaskFct cont) { [this, cont]() { schedule(cont); }); } -void ApiWorkload::populateData(TTaskFct cont) { - if (store.size() < initialSize) { - populateDataTx([this, cont]() { populateData(cont); }); - } else { +void ApiWorkload::populateTenantData(TTaskFct cont, std::optional tenantId) { + while (stores[tenantId].size() >= initialSize && tenantId && tenantId.value() < tenants.size()) { + ++tenantId.value(); + } + + if (tenantId >= tenants.size() || stores[tenantId].size() >= initialSize) { info("Data population completed"); schedule(cont); + } else { + populateDataTx([this, cont, tenantId]() { populateTenantData(cont, tenantId); }, tenantId); } } -void ApiWorkload::randomInsertOp(TTaskFct cont) { +void ApiWorkload::createTenants(TTaskFct cont) { + execTransaction( + [this](auto ctx) { + auto futures = std::make_shared>(); + for (auto tenant : tenants) { + futures->push_back(fdb::Tenant::getTenant(ctx->tx(), tenant)); + } + ctx->continueAfterAll(*futures, [this, ctx, futures]() { + for (int i = 0; i < futures->size(); ++i) { + if (!(*futures)[i].get()) { + fdb::Tenant::createTenant(ctx->tx(), tenants[i]); + } + } + ctx->commit(); + }); + }, + [this, cont]() { schedule(cont); }); +} + +void ApiWorkload::populateData(TTaskFct cont) { + if (tenants.size() > 0) { + createTenants([this, cont]() { populateTenantData(cont, std::make_optional(0)); }); + } else { + populateTenantData(cont, {}); + } +} + +void ApiWorkload::randomInsertOp(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto kvPairs = std::make_shared>(); for (int i = 0; i < numKeys; i++) { - kvPairs->push_back(fdb::KeyValue{ randomNotExistingKey(), randomValue() }); + kvPairs->push_back(fdb::KeyValue{ randomNotExistingKey(tenantId), randomValue() }); } execTransaction( [kvPairs](auto ctx) { @@ -197,19 +257,20 @@ void ApiWorkload::randomInsertOp(TTaskFct cont) { } ctx->commit(); }, - [this, kvPairs, cont]() { + [this, kvPairs, cont, tenantId]() { for (const fdb::KeyValue& kv : *kvPairs) { - store.set(kv.key, kv.value); + stores[tenantId].set(kv.key, kv.value); } schedule(cont); - }); + }, + getTenant(tenantId)); } -void ApiWorkload::randomClearOp(TTaskFct cont) { +void ApiWorkload::randomClearOp(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto keys = std::make_shared>(); for (int i = 0; i < numKeys; i++) { - keys->push_back(randomExistingKey()); + keys->push_back(randomExistingKey(tenantId)); } execTransaction( [keys](auto ctx) { @@ -218,15 +279,16 @@ void ApiWorkload::randomClearOp(TTaskFct cont) { } ctx->commit(); }, - [this, keys, cont]() { + [this, keys, cont, tenantId]() { for (const auto& key : *keys) { - store.clear(key); + stores[tenantId].clear(key); } schedule(cont); - }); + }, + getTenant(tenantId)); } -void ApiWorkload::randomClearRangeOp(TTaskFct cont) { +void ApiWorkload::randomClearRangeOp(TTaskFct cont, std::optional tenantId) { fdb::Key begin = randomKeyName(); fdb::Key end = randomKeyName(); if (begin > end) { @@ -237,10 +299,19 @@ void ApiWorkload::randomClearRangeOp(TTaskFct cont) { 
ctx->tx().clearRange(begin, end); ctx->commit(); }, - [this, begin, end, cont]() { - store.clear(begin, end); + [this, begin, end, cont, tenantId]() { + stores[tenantId].clear(begin, end); schedule(cont); - }); + }, + getTenant(tenantId)); +} + +std::optional ApiWorkload::getTenant(std::optional tenantId) { + if (tenantId) { + return tenants[*tenantId]; + } else { + return {}; + } } } // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterApiWorkload.h b/bindings/c/test/apitester/TesterApiWorkload.h index fd3630ceee..a3a13e964d 100644 --- a/bindings/c/test/apitester/TesterApiWorkload.h +++ b/bindings/c/test/apitester/TesterApiWorkload.h @@ -96,17 +96,23 @@ protected: // Key prefix fdb::Key keyPrefix; + // The number of tenants to configure in the cluster + std::vector tenants; + // In-memory store maintaining expected database state - KeyValueStore store; + std::unordered_map, KeyValueStore> stores; ApiWorkload(const WorkloadConfig& config); // Methods for generating random keys and values fdb::Key randomKeyName(); fdb::Value randomValue(); - fdb::Key randomNotExistingKey(); - fdb::Key randomExistingKey(); - fdb::Key randomKey(double existingKeyRatio); + fdb::Key randomNotExistingKey(std::optional tenantId); + fdb::Key randomExistingKey(std::optional tenantId); + fdb::Key randomKey(double existingKeyRatio, std::optional tenantId); + + // Chooses a random tenant from the available tenants (or an empty optional if tenants aren't used in the test) + std::optional randomTenant(); // Generate initial random data for the workload void populateData(TTaskFct cont); @@ -115,12 +121,18 @@ protected: void clearData(TTaskFct cont); // common operations - void randomInsertOp(TTaskFct cont); - void randomClearOp(TTaskFct cont); - void randomClearRangeOp(TTaskFct cont); + void randomInsertOp(TTaskFct cont, std::optional tenantId); + void randomClearOp(TTaskFct cont, std::optional tenantId); + void randomClearRangeOp(TTaskFct cont, std::optional tenantId); + + std::optional getTenant(std::optional tenantId); private: - void populateDataTx(TTaskFct cont); + void populateDataTx(TTaskFct cont, std::optional tenantId); + void populateTenantData(TTaskFct cont, std::optional tenantId); + void createTenants(TTaskFct cont); + + void clearTenantData(TTaskFct cont, std::optional tenantId); void randomOperations(); }; diff --git a/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp index 52d8ddc651..f6164296da 100644 --- a/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp @@ -18,61 +18,13 @@ * limitations under the License. 
*/ #include "TesterApiWorkload.h" +#include "TesterBlobGranuleUtil.h" #include "TesterUtil.h" #include #include namespace FdbApiTester { -class TesterGranuleContext { -public: - std::unordered_map loadsInProgress; - int64_t nextId = 0; - std::string basePath; - - ~TesterGranuleContext() { - // if there was an error or not all loads finished, delete data - for (auto& it : loadsInProgress) { - uint8_t* dataToFree = it.second; - delete[] dataToFree; - } - } -}; - -static int64_t granule_start_load(const char* filename, - int filenameLength, - int64_t offset, - int64_t length, - int64_t fullFileLength, - void* context) { - - TesterGranuleContext* ctx = (TesterGranuleContext*)context; - int64_t loadId = ctx->nextId++; - - uint8_t* buffer = new uint8_t[length]; - std::ifstream fin(ctx->basePath + std::string(filename, filenameLength), std::ios::in | std::ios::binary); - fin.seekg(offset); - fin.read((char*)buffer, length); - - ctx->loadsInProgress.insert({ loadId, buffer }); - - return loadId; -} - -static uint8_t* granule_get_load(int64_t loadId, void* context) { - TesterGranuleContext* ctx = (TesterGranuleContext*)context; - return ctx->loadsInProgress.at(loadId); -} - -static void granule_free_load(int64_t loadId, void* context) { - TesterGranuleContext* ctx = (TesterGranuleContext*)context; - auto it = ctx->loadsInProgress.find(loadId); - uint8_t* dataToFree = it->second; - delete[] dataToFree; - - ctx->loadsInProgress.erase(it); -} - class ApiBlobGranuleCorrectnessWorkload : public ApiWorkload { public: ApiBlobGranuleCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) { @@ -83,34 +35,39 @@ public: } private: - enum OpType { OP_INSERT, OP_CLEAR, OP_CLEAR_RANGE, OP_READ, OP_GET_RANGES, OP_LAST = OP_GET_RANGES }; + // FIXME: use other new blob granule apis! 
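+	// The first three ops generate and clear data; the remaining ops exercise the blob granule APIs:
+	// granule reads, granule listing, summaries, blobbified-range listing, and range verification.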
+ enum OpType { + OP_INSERT, + OP_CLEAR, + OP_CLEAR_RANGE, + OP_READ, + OP_GET_GRANULES, + OP_SUMMARIZE, + OP_GET_BLOB_RANGES, + OP_VERIFY, + OP_LAST = OP_VERIFY + }; std::vector excludedOpTypes; // Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet // FIXME: should still guarantee a read succeeds eventually somehow bool seenReadSuccess = false; - void randomReadOp(TTaskFct cont) { + void randomReadOp(TTaskFct cont, std::optional tenantId) { fdb::Key begin = randomKeyName(); fdb::Key end = randomKeyName(); - auto results = std::make_shared>(); - auto tooOld = std::make_shared(false); if (begin > end) { std::swap(begin, end); } + + auto results = std::make_shared>(); + auto tooOld = std::make_shared(false); + execTransaction( [this, begin, end, results, tooOld](auto ctx) { ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE); - TesterGranuleContext testerContext; - testerContext.basePath = ctx->getBGBasePath(); - - fdb::native::FDBReadBlobGranuleContext granuleContext; - granuleContext.userContext = &testerContext; - granuleContext.debugNoMaterialize = false; - granuleContext.granuleParallelism = 1; - granuleContext.start_load_f = &granule_start_load; - granuleContext.get_load_f = &granule_get_load; - granuleContext.free_load_f = &granule_free_load; + TesterGranuleContext testerContext(ctx->getBGBasePath()); + fdb::native::FDBReadBlobGranuleContext granuleContext = createGranuleContext(&testerContext); fdb::Result res = ctx->tx().readBlobGranules( begin, end, 0 /* beginVersion */, -2 /* latest read version */, granuleContext); @@ -124,8 +81,10 @@ private: } else if (err.code() != error_code_success) { ctx->onError(err); } else { - auto& [out_kv, out_count, out_more] = out; + auto resCopy = copyKeyValueArray(out); + auto& [resVector, out_more] = resCopy; ASSERT(!out_more); + results.get()->assign(resVector.begin(), resVector.end()); if (!seenReadSuccess) { info("BlobGranuleCorrectness::randomReadOp first success\n"); } @@ -133,9 +92,10 @@ private: ctx->done(); } }, - [this, begin, end, results, tooOld, cont]() { + [this, begin, end, results, tooOld, cont, tenantId]() { if (!*tooOld) { - std::vector expected = store.getRange(begin, end, store.size(), false); + std::vector expected = + stores[tenantId].getRange(begin, end, stores[tenantId].size(), false); if (results->size() != expected.size()) { error(fmt::format("randomReadOp result size mismatch. 
expected: {} actual: {}", expected.size(), @@ -166,19 +126,21 @@ private: } } schedule(cont); - }); + }, + getTenant(tenantId)); } - void randomGetRangesOp(TTaskFct cont) { + void randomGetGranulesOp(TTaskFct cont, std::optional tenantId) { fdb::Key begin = randomKeyName(); fdb::Key end = randomKeyName(); - auto results = std::make_shared>(); if (begin > end) { std::swap(begin, end); } + auto results = std::make_shared>(); + execTransaction( [begin, end, results](auto ctx) { - fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end).eraseType(); + fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType(); ctx->continueAfter( f, [ctx, f, results]() { @@ -188,46 +150,180 @@ private: true); }, [this, begin, end, results, cont]() { - if (seenReadSuccess) { - ASSERT(results->size() > 0); - ASSERT(results->front().beginKey <= begin); - ASSERT(results->back().endKey >= end); - } + this->validateRanges(results, begin, end, seenReadSuccess); + schedule(cont); + }, + getTenant(tenantId)); + } + + void randomSummarizeOp(TTaskFct cont, std::optional tenantId) { + if (!seenReadSuccess) { + // tester can't handle this throwing bg_txn_too_old, so just don't call it unless we have already seen a + // read success + schedule(cont); + return; + } + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + auto results = std::make_shared>(); + execTransaction( + [begin, end, results](auto ctx) { + fdb::Future f = ctx->tx().summarizeBlobGranules(begin, end, -2 /*latest version*/, 1000).eraseType(); + ctx->continueAfter( + f, + [ctx, f, results]() { + *results = copyGranuleSummaryArray(f.get()); + ctx->done(); + }, + true); + }, + [this, begin, end, results, cont]() { + ASSERT(results->size() > 0); + ASSERT(results->front().keyRange.beginKey <= begin); + ASSERT(results->back().keyRange.endKey >= end); for (int i = 0; i < results->size(); i++) { - // no empty or inverted ranges - ASSERT((*results)[i].beginKey < (*results)[i].endKey); + // TODO: could do validation of subsequent calls and ensure snapshot version never decreases + ASSERT((*results)[i].keyRange.beginKey < (*results)[i].keyRange.endKey); + ASSERT((*results)[i].snapshotVersion <= (*results)[i].deltaVersion); + ASSERT((*results)[i].snapshotSize > 0); + ASSERT((*results)[i].deltaSize >= 0); } for (int i = 1; i < results->size(); i++) { // ranges contain entire requested key range - ASSERT((*results)[i].beginKey == (*results)[i - 1].endKey); + ASSERT((*results)[i].keyRange.beginKey == (*results)[i - 1].keyRange.endKey); } schedule(cont); - }); + }, + getTenant(tenantId)); + } + + void validateRanges(std::shared_ptr> results, + fdb::Key begin, + fdb::Key end, + bool shouldBeRanges) { + if (shouldBeRanges) { + ASSERT(results->size() > 0); + ASSERT(results->front().beginKey <= begin); + ASSERT(results->back().endKey >= end); + } + for (int i = 0; i < results->size(); i++) { + // no empty or inverted ranges + if ((*results)[i].beginKey >= (*results)[i].endKey) { + error(fmt::format("Empty/inverted range [{0} - {1}) for getBlobGranuleRanges({2} - {3})", + fdb::toCharsRef((*results)[i].beginKey), + fdb::toCharsRef((*results)[i].endKey), + fdb::toCharsRef(begin), + fdb::toCharsRef(end))); + } + ASSERT((*results)[i].beginKey < (*results)[i].endKey); + } + + for (int i = 1; i < results->size(); i++) { + // ranges contain entire requested key range + if ((*results)[i].beginKey != (*results)[i].endKey) { + error(fmt::format("Non-contiguous range [{0} - {1}) for 
getBlobGranuleRanges({2} - {3})", + fdb::toCharsRef((*results)[i].beginKey), + fdb::toCharsRef((*results)[i].endKey), + fdb::toCharsRef(begin), + fdb::toCharsRef(end))); + } + ASSERT((*results)[i].beginKey == (*results)[i - 1].endKey); + } + } + + void randomGetBlobRangesOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + auto results = std::make_shared>(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end, results](auto ctx) { + fdb::Future f = ctx->db().listBlobbifiedRanges(begin, end, 1000).eraseType(); + ctx->continueAfter(f, [ctx, f, results]() { + *results = copyKeyRangeArray(f.get()); + ctx->done(); + }); + }, + [this, begin, end, results, cont]() { + this->validateRanges(results, begin, end, seenReadSuccess); + schedule(cont); + }, + /* failOnError = */ false); + } + + void randomVerifyOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + + auto verifyVersion = std::make_shared(false); + // info("Verify op starting"); + + execOperation( + [begin, end, verifyVersion](auto ctx) { + fdb::Future f = ctx->db().verifyBlobRange(begin, end, -2 /* latest version*/).eraseType(); + ctx->continueAfter(f, [ctx, verifyVersion, f]() { + *verifyVersion = f.get(); + ctx->done(); + }); + }, + [this, begin, end, verifyVersion, cont]() { + if (*verifyVersion == -1) { + ASSERT(!seenReadSuccess); + } else { + if (!seenReadSuccess) { + info("BlobGranuleCorrectness::randomVerifyOp first success"); + } + seenReadSuccess = true; + } + // info(fmt::format("verify op done @ {}", *verifyVersion)); + schedule(cont); + }, + /* failOnError = */ false); } void randomOperation(TTaskFct cont) { - OpType txType = (store.size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST); + std::optional tenantId = randomTenant(); + + OpType txType = (stores[tenantId].size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST); while (std::count(excludedOpTypes.begin(), excludedOpTypes.end(), txType)) { txType = (OpType)Random::get().randomInt(0, OP_LAST); } + switch (txType) { case OP_INSERT: - randomInsertOp(cont); + randomInsertOp(cont, tenantId); break; case OP_CLEAR: - randomClearOp(cont); + randomClearOp(cont, tenantId); break; case OP_CLEAR_RANGE: - randomClearRangeOp(cont); + randomClearRangeOp(cont, tenantId); break; case OP_READ: - randomReadOp(cont); + randomReadOp(cont, tenantId); break; - case OP_GET_RANGES: - randomGetRangesOp(cont); + case OP_GET_GRANULES: + randomGetGranulesOp(cont, tenantId); + break; + case OP_SUMMARIZE: + randomSummarizeOp(cont, tenantId); + break; + case OP_GET_BLOB_RANGES: + randomGetBlobRangesOp(cont); + break; + case OP_VERIFY: + randomVerifyOp(cont); break; } } diff --git a/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp b/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp new file mode 100644 index 0000000000..b4bcaacdc6 --- /dev/null +++ b/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp @@ -0,0 +1,316 @@ +/* + * TesterBlobGranuleErrorsWorkload.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "TesterApiWorkload.h" +#include "TesterBlobGranuleUtil.h" +#include "TesterUtil.h" +#include +#include + +namespace FdbApiTester { + +class BlobGranuleErrorsWorkload : public ApiWorkload { +public: + BlobGranuleErrorsWorkload(const WorkloadConfig& config) : ApiWorkload(config) {} + +private: + enum OpType { + OP_READ_NO_MATERIALIZE, + OP_READ_FILE_LOAD_ERROR, + OP_READ_TOO_OLD, + OP_PURGE_UNALIGNED, + OP_BLOBBIFY_UNALIGNED, + OP_UNBLOBBIFY_UNALIGNED, + OP_CANCEL_GET_GRANULES, + OP_CANCEL_GET_RANGES, + OP_CANCEL_VERIFY, + OP_CANCEL_SUMMARIZE, + OP_CANCEL_BLOBBIFY, + OP_CANCEL_UNBLOBBIFY, + OP_CANCEL_PURGE, + OP_LAST = OP_CANCEL_PURGE + }; + + // could add summarize too old and verify too old as ops if desired but those are lower value + + // Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet + // FIXME: should still guarantee a read succeeds eventually somehow + bool seenReadSuccess = false; + + void doErrorOp(TTaskFct cont, + std::string basePathAddition, + bool doMaterialize, + int64_t readVersion, + fdb::native::fdb_error_t expectedError) { + fdb::Key begin = randomKeyName(); + fdb::Key end = begin; + // [K - K) empty range will succeed read because there is trivially nothing to do, so don't do it + while (end == begin) { + end = randomKeyName(); + } + if (begin > end) { + std::swap(begin, end); + } + + execTransaction( + [this, begin, end, basePathAddition, doMaterialize, readVersion, expectedError](auto ctx) { + ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE); + + TesterGranuleContext testerContext(ctx->getBGBasePath() + basePathAddition); + fdb::native::FDBReadBlobGranuleContext granuleContext = createGranuleContext(&testerContext); + granuleContext.debugNoMaterialize = !doMaterialize; + + fdb::Result res = + ctx->tx().readBlobGranules(begin, end, 0 /* beginVersion */, readVersion, granuleContext); + auto out = fdb::Result::KeyValueRefArray{}; + fdb::Error err = res.getKeyValueArrayNothrow(out); + + if (err.code() == error_code_success) { + error(fmt::format("Operation succeeded in error test!")); + } + ASSERT(err.code() != error_code_success); + if (err.code() != expectedError) { + info(fmt::format("incorrect error. 
Expected {}, Got {}", expectedError, err.code())); + if (err.code() == error_code_blob_granule_transaction_too_old) { + ASSERT(!seenReadSuccess); + ctx->done(); + } else { + ctx->onError(err); + } + } else { + if (err.code() != error_code_blob_granule_transaction_too_old) { + seenReadSuccess = true; + } + ctx->done(); + } + }, + [this, cont]() { schedule(cont); }); + } + + void randomOpReadNoMaterialize(TTaskFct cont) { + // ensure setting noMaterialize flag produces blob_granule_not_materialized + doErrorOp(cont, "", false, -2 /*latest read version */, error_code_blob_granule_not_materialized); + } + + void randomOpReadFileLoadError(TTaskFct cont) { + // point to a file path that doesn't exist by adding an extra suffix + doErrorOp(cont, "extrapath/", true, -2 /*latest read version */, error_code_blob_granule_file_load_error); + } + + void randomOpReadTooOld(TTaskFct cont) { + // read at a version (1) that should predate granule data + doErrorOp(cont, "", true, 1, error_code_blob_granule_transaction_too_old); + } + + void randomPurgeUnalignedOp(TTaskFct cont) { + // blobbify/unblobbify need to be aligned to blob range boundaries, so this should always fail + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [this, begin, end](auto ctx) { + fdb::Future f = ctx->db().purgeBlobGranules(begin, end, -2, false).eraseType(); + ctx->continueAfter( + f, + [this, ctx, f]() { + info(fmt::format("unaligned purge got {}", f.error().code())); + ASSERT(f.error().code() == error_code_unsupported_operation); + ctx->done(); + }, + true); + }, + [this, cont]() { schedule(cont); }); + } + + void randomBlobbifyUnalignedOp(bool blobbify, TTaskFct cont) { + // blobbify/unblobbify need to be aligned to blob range boundaries, so this should always return false + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + auto success = std::make_shared(false); + execOperation( + [begin, end, blobbify, success](auto ctx) { + fdb::Future f = blobbify ? 
ctx->db().blobbifyRange(begin, end).eraseType() + : ctx->db().unblobbifyRange(begin, end).eraseType(); + ctx->continueAfter( + f, + [ctx, f, success]() { + *success = f.get(); + ctx->done(); + }, + true); + }, + [this, cont, success]() { + ASSERT(!(*success)); + schedule(cont); + }); + } + + void randomCancelGetGranulesOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execTransaction( + [begin, end](auto ctx) { + fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelGetRangesOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().listBlobbifiedRanges(begin, end, 1000).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelVerifyOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().verifyBlobRange(begin, end, -2 /* latest version*/).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelSummarizeOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execTransaction( + [begin, end](auto ctx) { + fdb::Future f = ctx->tx().summarizeBlobGranules(begin, end, -2, 1000).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelBlobbifyOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().blobbifyRange(begin, end).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelUnblobbifyOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().unblobbifyRange(begin, end).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelPurgeOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().purgeBlobGranules(begin, end, -2, false).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomOperation(TTaskFct cont) override { + OpType txType = (OpType)Random::get().randomInt(0, OP_LAST); + switch (txType) { + case OP_READ_NO_MATERIALIZE: + randomOpReadNoMaterialize(cont); + break; + case OP_READ_FILE_LOAD_ERROR: + randomOpReadFileLoadError(cont); + break; + case OP_READ_TOO_OLD: + randomOpReadTooOld(cont); + break; + case OP_PURGE_UNALIGNED: + // gets the correct error but it doesn't propagate properly in the test + // randomPurgeUnalignedOp(cont); + break; + case OP_BLOBBIFY_UNALIGNED: + randomBlobbifyUnalignedOp(true, cont); + break; + case OP_UNBLOBBIFY_UNALIGNED: + randomBlobbifyUnalignedOp(false, cont); + break; + case OP_CANCEL_GET_GRANULES: + randomCancelGetGranulesOp(cont); + break; + case OP_CANCEL_GET_RANGES: + 
randomCancelGetRangesOp(cont); + break; + case OP_CANCEL_VERIFY: + randomCancelVerifyOp(cont); + break; + case OP_CANCEL_SUMMARIZE: + randomCancelSummarizeOp(cont); + break; + case OP_CANCEL_BLOBBIFY: + randomCancelBlobbifyOp(cont); + break; + case OP_CANCEL_UNBLOBBIFY: + randomCancelUnblobbifyOp(cont); + break; + case OP_CANCEL_PURGE: + randomCancelPurgeOp(cont); + break; + } + } +}; + +WorkloadFactory BlobGranuleErrorsWorkloadFactory("BlobGranuleErrors"); + +} // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterBlobGranuleUtil.cpp b/bindings/c/test/apitester/TesterBlobGranuleUtil.cpp new file mode 100644 index 0000000000..a908a9c0bf --- /dev/null +++ b/bindings/c/test/apitester/TesterBlobGranuleUtil.cpp @@ -0,0 +1,80 @@ +/* + * TesterBlobGranuleUtil.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TesterBlobGranuleUtil.h" +#include "TesterUtil.h" +#include + +namespace FdbApiTester { + +// FIXME: avoid duplicating this between files! +static int64_t granule_start_load(const char* filename, + int filenameLength, + int64_t offset, + int64_t length, + int64_t fullFileLength, + void* context) { + + TesterGranuleContext* ctx = (TesterGranuleContext*)context; + int64_t loadId = ctx->nextId++; + + uint8_t* buffer = new uint8_t[length]; + std::ifstream fin(ctx->basePath + std::string(filename, filenameLength), std::ios::in | std::ios::binary); + if (fin.fail()) { + delete[] buffer; + buffer = nullptr; + } else { + fin.seekg(offset); + fin.read((char*)buffer, length); + } + + ctx->loadsInProgress.insert({ loadId, buffer }); + + return loadId; +} + +static uint8_t* granule_get_load(int64_t loadId, void* context) { + TesterGranuleContext* ctx = (TesterGranuleContext*)context; + return ctx->loadsInProgress.at(loadId); +} + +static void granule_free_load(int64_t loadId, void* context) { + TesterGranuleContext* ctx = (TesterGranuleContext*)context; + auto it = ctx->loadsInProgress.find(loadId); + uint8_t* dataToFree = it->second; + delete[] dataToFree; + + ctx->loadsInProgress.erase(it); +} + +fdb::native::FDBReadBlobGranuleContext createGranuleContext(const TesterGranuleContext* testerContext) { + fdb::native::FDBReadBlobGranuleContext granuleContext; + + granuleContext.userContext = (void*)testerContext; + granuleContext.debugNoMaterialize = false; + granuleContext.granuleParallelism = 1 + Random::get().randomInt(0, 3); + granuleContext.start_load_f = &granule_start_load; + granuleContext.get_load_f = &granule_get_load; + granuleContext.free_load_f = &granule_free_load; + + return granuleContext; +} + +} // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterBlobGranuleUtil.h b/bindings/c/test/apitester/TesterBlobGranuleUtil.h new file mode 100644 index 0000000000..7b4b0dba81 --- /dev/null +++ b/bindings/c/test/apitester/TesterBlobGranuleUtil.h @@ -0,0 +1,49 @@ 
+/* + * TesterBlobGranuleUtil.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef APITESTER_BLOBGRANULE_UTIL_H +#define APITESTER_BLOBGRANULE_UTIL_H +#include "TesterUtil.h" +#include "test/fdb_api.hpp" +#include + +namespace FdbApiTester { + +class TesterGranuleContext { +public: + std::unordered_map loadsInProgress; + std::string basePath; + int64_t nextId; + + TesterGranuleContext(const std::string& basePath) : basePath(basePath), nextId(0) {} + + ~TesterGranuleContext() { + // this should now never happen with proper memory management + ASSERT(loadsInProgress.empty()); + } +}; + +fdb::native::FDBReadBlobGranuleContext createGranuleContext(const TesterGranuleContext* testerContext); + +} // namespace FdbApiTester + +#endif diff --git a/bindings/c/test/apitester/TesterCancelTransactionWorkload.cpp b/bindings/c/test/apitester/TesterCancelTransactionWorkload.cpp index b569cdb35f..b4cd205143 100644 --- a/bindings/c/test/apitester/TesterCancelTransactionWorkload.cpp +++ b/bindings/c/test/apitester/TesterCancelTransactionWorkload.cpp @@ -31,11 +31,11 @@ private: enum OpType { OP_CANCEL_GET, OP_CANCEL_AFTER_FIRST_GET, OP_LAST = OP_CANCEL_AFTER_FIRST_GET }; // Start multiple concurrent gets and cancel the transaction - void randomCancelGetTx(TTaskFct cont) { + void randomCancelGetTx(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto keys = std::make_shared>(); for (int i = 0; i < numKeys; i++) { - keys->push_back(randomKey(readExistingKeysRatio)); + keys->push_back(randomKey(readExistingKeysRatio, tenantId)); } execTransaction( [keys](auto ctx) { @@ -45,25 +45,26 @@ private: } ctx->done(); }, - [this, cont]() { schedule(cont); }); + [this, cont]() { schedule(cont); }, + getTenant(tenantId)); } // Start multiple concurrent gets and cancel the transaction after the first get returns - void randomCancelAfterFirstResTx(TTaskFct cont) { + void randomCancelAfterFirstResTx(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto keys = std::make_shared>(); for (int i = 0; i < numKeys; i++) { - keys->push_back(randomKey(readExistingKeysRatio)); + keys->push_back(randomKey(readExistingKeysRatio, tenantId)); } execTransaction( - [this, keys](auto ctx) { + [this, keys, tenantId](auto ctx) { std::vector futures; for (const auto& key : *keys) { futures.push_back(ctx->tx().get(key, false).eraseType()); } for (int i = 0; i < keys->size(); i++) { fdb::Future f = futures[i]; - auto expectedVal = store.get((*keys)[i]); + auto expectedVal = stores[tenantId].get((*keys)[i]); ctx->continueAfter(f, [expectedVal, f, this, ctx]() { auto val = f.get(); if (expectedVal != val) { @@ -75,17 +76,20 @@ private: }); } }, - [this, cont]() { schedule(cont); }); + [this, cont]() { schedule(cont); }, + getTenant(tenantId)); } void 
randomOperation(TTaskFct cont) override { + std::optional tenantId = randomTenant(); OpType txType = (OpType)Random::get().randomInt(0, OP_LAST); + switch (txType) { case OP_CANCEL_GET: - randomCancelGetTx(cont); + randomCancelGetTx(cont, tenantId); break; case OP_CANCEL_AFTER_FIRST_GET: - randomCancelAfterFirstResTx(cont); + randomCancelAfterFirstResTx(cont, tenantId); break; } } diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index 9219bb7056..4486abdf97 100644 --- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -41,11 +41,11 @@ private: OP_LAST = OP_COMMIT_READ }; - void randomCommitReadOp(TTaskFct cont) { + void randomCommitReadOp(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto kvPairs = std::make_shared>(); for (int i = 0; i < numKeys; i++) { - kvPairs->push_back(fdb::KeyValue{ randomKey(readExistingKeysRatio), randomValue() }); + kvPairs->push_back(fdb::KeyValue{ randomKey(readExistingKeysRatio, tenantId), randomValue() }); } execTransaction( [kvPairs](auto ctx) { @@ -54,9 +54,9 @@ private: } ctx->commit(); }, - [this, kvPairs, cont]() { + [this, kvPairs, cont, tenantId]() { for (const fdb::KeyValue& kv : *kvPairs) { - store.set(kv.key, kv.value); + stores[tenantId].set(kv.key, kv.value); } auto results = std::make_shared>>(); execTransaction( @@ -78,10 +78,10 @@ private: ctx->done(); }); }, - [this, kvPairs, results, cont]() { + [this, kvPairs, results, cont, tenantId]() { ASSERT(results->size() == kvPairs->size()); for (int i = 0; i < kvPairs->size(); i++) { - auto expected = store.get((*kvPairs)[i].key); + auto expected = stores[tenantId].get((*kvPairs)[i].key); auto actual = (*results)[i]; if (actual != expected) { error( @@ -93,16 +93,18 @@ private: } } schedule(cont); - }); - }); + }, + getTenant(tenantId)); + }, + getTenant(tenantId)); } - void randomGetOp(TTaskFct cont) { + void randomGetOp(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto keys = std::make_shared>(); auto results = std::make_shared>>(); for (int i = 0; i < numKeys; i++) { - keys->push_back(randomKey(readExistingKeysRatio)); + keys->push_back(randomKey(readExistingKeysRatio, tenantId)); } execTransaction( [keys, results](auto ctx) { @@ -119,10 +121,10 @@ private: ctx->done(); }); }, - [this, keys, results, cont]() { + [this, keys, results, cont, tenantId]() { ASSERT(results->size() == keys->size()); for (int i = 0; i < keys->size(); i++) { - auto expected = store.get((*keys)[i]); + auto expected = stores[tenantId].get((*keys)[i]); if ((*results)[i] != expected) { error(fmt::format("randomGetOp mismatch. 
key: {} expected: {:.80} actual: {:.80}", fdb::toCharsRef((*keys)[i]), @@ -131,16 +133,17 @@ private: } } schedule(cont); - }); + }, + getTenant(tenantId)); } - void randomGetKeyOp(TTaskFct cont) { + void randomGetKeyOp(TTaskFct cont, std::optional tenantId) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto keysWithSelectors = std::make_shared>>(); auto results = std::make_shared>(); keysWithSelectors->reserve(numKeys); for (int i = 0; i < numKeys; i++) { - auto key = randomKey(readExistingKeysRatio); + auto key = randomKey(readExistingKeysRatio, tenantId); fdb::KeySelector selector; selector.keyLength = key.size(); selector.orEqual = Random::get().randomBool(0.5); @@ -169,20 +172,20 @@ private: ctx->done(); }); }, - [this, keysWithSelectors, results, cont]() { + [this, keysWithSelectors, results, cont, tenantId]() { ASSERT(results->size() == keysWithSelectors->size()); for (int i = 0; i < keysWithSelectors->size(); i++) { auto const& key = (*keysWithSelectors)[i].first; auto const& selector = (*keysWithSelectors)[i].second; - auto expected = store.getKey(key, selector.orEqual, selector.offset); + auto expected = stores[tenantId].getKey(key, selector.orEqual, selector.offset); auto actual = (*results)[i]; // Local store only contains data for the current client, while fdb contains data from multiple // clients. If getKey returned a key outside of the range for the current client, adjust the result // to match what would be expected in the local store. if (actual.substr(0, keyPrefix.size()) < keyPrefix) { - actual = store.startKey(); + actual = stores[tenantId].startKey(); } else if ((*results)[i].substr(0, keyPrefix.size()) > keyPrefix) { - actual = store.endKey(); + actual = stores[tenantId].endKey(); } if (actual != expected) { error(fmt::format("randomGetKeyOp mismatch. key: {}, orEqual: {}, offset: {}, expected: {} " @@ -195,37 +198,38 @@ private: } } schedule(cont); - }); + }, + getTenant(tenantId)); } void getRangeLoop(std::shared_ptr ctx, fdb::KeySelector begin, - fdb::KeySelector end, + fdb::Key endKey, std::shared_ptr> results) { auto f = ctx->tx().getRange(begin, - end, + fdb::key_select::firstGreaterOrEqual(endKey), 0 /*limit*/, 0 /*target_bytes*/, FDB_STREAMING_MODE_WANT_ALL, 0 /*iteration*/, false /*snapshot*/, false /*reverse*/); - ctx->continueAfter(f, [this, ctx, f, end, results]() { + ctx->continueAfter(f, [this, ctx, f, endKey, results]() { auto out = copyKeyValueArray(f.get()); results->insert(results->end(), out.first.begin(), out.first.end()); const bool more = out.second; if (more) { // Fetch the remaining results. - getRangeLoop(ctx, fdb::key_select::firstGreaterThan(results->back().key), end, results); + getRangeLoop(ctx, fdb::key_select::firstGreaterThan(results->back().key), endKey, results); } else { ctx->done(); } }); } - void randomGetRangeOp(TTaskFct cont) { - auto begin = randomKey(readExistingKeysRatio); - auto end = randomKey(readExistingKeysRatio); + void randomGetRangeOp(TTaskFct cont, std::optional tenantId) { + auto begin = randomKey(readExistingKeysRatio, tenantId); + auto end = randomKey(readExistingKeysRatio, tenantId); auto results = std::make_shared>(); execTransaction( @@ -233,13 +237,10 @@ private: // Clear the results vector, in case the transaction is retried. 
results->clear(); - getRangeLoop(ctx, - fdb::key_select::firstGreaterOrEqual(begin), - fdb::key_select::firstGreaterOrEqual(end), - results); + getRangeLoop(ctx, fdb::key_select::firstGreaterOrEqual(begin), end, results); }, - [this, begin, end, results, cont]() { - auto expected = store.getRange(begin, end, results->size() + 10, false); + [this, begin, end, results, cont, tenantId]() { + auto expected = stores[tenantId].getRange(begin, end, results->size() + 10, false); if (results->size() != expected.size()) { error(fmt::format("randomGetRangeOp mismatch. expected {} keys, actual {} keys", expected.size(), @@ -260,32 +261,35 @@ private: } } schedule(cont); - }); + }, + getTenant(tenantId)); } void randomOperation(TTaskFct cont) { - OpType txType = (store.size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST); + std::optional tenantId = randomTenant(); + OpType txType = (stores[tenantId].size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST); + switch (txType) { case OP_INSERT: - randomInsertOp(cont); + randomInsertOp(cont, tenantId); break; case OP_GET: - randomGetOp(cont); + randomGetOp(cont, tenantId); break; case OP_GET_KEY: - randomGetKeyOp(cont); + randomGetKeyOp(cont, tenantId); break; case OP_CLEAR: - randomClearOp(cont); + randomClearOp(cont, tenantId); break; case OP_GET_RANGE: - randomGetRangeOp(cont); + randomGetRangeOp(cont, tenantId); break; case OP_CLEAR_RANGE: - randomClearRangeOp(cont); + randomClearRangeOp(cont, tenantId); break; case OP_COMMIT_READ: - randomCommitReadOp(cont); + randomCommitReadOp(cont, tenantId); break; } } diff --git a/bindings/c/test/apitester/TesterExampleWorkload.cpp b/bindings/c/test/apitester/TesterExampleWorkload.cpp new file mode 100644 index 0000000000..882fdc62e4 --- /dev/null +++ b/bindings/c/test/apitester/TesterExampleWorkload.cpp @@ -0,0 +1,65 @@ +/* + * TesterExampleWorkload.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "TesterWorkload.h" +#include "TesterUtil.h" + +namespace FdbApiTester { + +class SetAndGetWorkload : public WorkloadBase { +public: + fdb::Key keyPrefix; + Random random; + + SetAndGetWorkload(const WorkloadConfig& config) : WorkloadBase(config) { + keyPrefix = fdb::toBytesRef(fmt::format("{}/", workloadId)); + } + + void start() override { setAndGet(NO_OP_TASK); } + + void setAndGet(TTaskFct cont) { + fdb::Key key = keyPrefix + random.randomByteStringLowerCase(10, 100); + fdb::Value value = random.randomByteStringLowerCase(10, 1000); + execTransaction( + [key, value](auto ctx) { + ctx->tx().set(key, value); + ctx->commit(); + }, + [this, key, value, cont]() { + execTransaction( + [this, key, value](auto ctx) { + auto future = ctx->tx().get(key, false); + ctx->continueAfter(future, [this, ctx, future, value]() { + std::optional res = copyValueRef(future.get()); + if (res != value) { + error(fmt::format( + "expected: {} actual: {}", fdb::toCharsRef(value), fdb::toCharsRef(res.value()))); + } + ctx->done(); + }); + }, + cont); + }); + } +}; + +WorkloadFactory SetAndGetWorkloadFactory("SetAndGet"); + +} // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterOptions.h b/bindings/c/test/apitester/TesterOptions.h index 3ff57ec183..7c7d0fc948 100644 --- a/bindings/c/test/apitester/TesterOptions.h +++ b/bindings/c/test/apitester/TesterOptions.h @@ -38,6 +38,7 @@ public: std::string logGroup; std::string externalClientLibrary; std::string externalClientDir; + std::string futureVersionClientLibrary; std::string tmpDir; bool disableLocalClient = false; std::string testFile; @@ -48,6 +49,7 @@ public: int numClientThreads; int numDatabases; int numClients; + int numTenants = -1; int statsIntervalMs = 0; std::vector> knobs; TestSpec testSpec; diff --git a/bindings/c/test/apitester/TesterTestSpec.cpp b/bindings/c/test/apitester/TesterTestSpec.cpp index 86a89c9116..1048aab493 100644 --- a/bindings/c/test/apitester/TesterTestSpec.cpp +++ b/bindings/c/test/apitester/TesterTestSpec.cpp @@ -65,6 +65,10 @@ std::unordered_mapdatabasePerTransaction = (value == "true"); } }, + { "tamperClusterFile", + [](const std::string& value, TestSpec* spec) { // + spec->tamperClusterFile = (value == "true"); + } }, { "minFdbThreads", [](const std::string& value, TestSpec* spec) { // processIntOption(value, "minFdbThreads", spec->minFdbThreads, 1, 1000); @@ -96,6 +100,18 @@ std::unordered_mapmaxClients, 1, 1000); + } }, + { "disableClientBypass", + [](const std::string& value, TestSpec* spec) { // + spec->disableClientBypass = (value == "true"); + } }, + { "minTenants", + [](const std::string& value, TestSpec* spec) { // + processIntOption(value, "minTenants", spec->minTenants, 1, 1000); + } }, + { "maxTenants", + [](const std::string& value, TestSpec* spec) { // + processIntOption(value, "maxTenants", spec->maxTenants, 1, 1000); } } }; diff --git a/bindings/c/test/apitester/TesterTestSpec.h b/bindings/c/test/apitester/TesterTestSpec.h index be7a573033..c0e9c0caf1 100644 --- a/bindings/c/test/apitester/TesterTestSpec.h +++ b/bindings/c/test/apitester/TesterTestSpec.h @@ -58,6 +58,9 @@ struct TestSpec { // Execute each transaction in a separate database instance bool databasePerTransaction = false; + // Test tampering the cluster file + bool tamperClusterFile = false; + // Size of the FDB client thread pool (a random number in the [min,max] range) int minFdbThreads = 1; int maxFdbThreads = 1; @@ -75,6 +78,13 @@ struct TestSpec { int minClients = 1; int maxClients = 10; + // Disable the ability 
to bypass the MVC API, for + // cases when there are no external clients + bool disableClientBypass = false; + // Number of tenants (a random number in the [min,max] range) + int minTenants = 0; + int maxTenants = 0; + // List of workloads with their options std::vector workloads; }; diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index 221774854d..27acef0e14 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -23,25 +23,23 @@ #include "foundationdb/fdb_c_types.h" #include "test/apitester/TesterScheduler.h" #include "test/fdb_api.hpp" +#include #include #include +#include #include #include #include #include #include #include +#include namespace FdbApiTester { constexpr int LONG_WAIT_TIME_US = 2000000; constexpr int LARGE_NUMBER_OF_RETRIES = 10; -void TransactionActorBase::complete(fdb::Error err) { - error = err; - context = {}; -} - void ITransactionContext::continueAfterAll(std::vector futures, TTaskFct cont) { auto counter = std::make_shared>(futures.size()); auto errorCode = std::make_shared>(fdb::Error::success()); @@ -72,20 +70,44 @@ void ITransactionContext::continueAfterAll(std::vector futures, TTa */ class TransactionContextBase : public ITransactionContext { public: - TransactionContextBase(fdb::Transaction tx, - std::shared_ptr txActor, - TTaskFct cont, + TransactionContextBase(ITransactionExecutor* executor, + TOpStartFct startFct, + TOpContFct cont, IScheduler* scheduler, int retryLimit, - std::string bgBasePath) - : fdbTx(tx), txActor(txActor), contAfterDone(cont), scheduler(scheduler), retryLimit(retryLimit), - txState(TxState::IN_PROGRESS), commitCalled(false), bgBasePath(bgBasePath) {} + std::string bgBasePath, + std::optional tenantName, + bool transactional) + : executor(executor), startFct(startFct), contAfterDone(cont), scheduler(scheduler), retryLimit(retryLimit), + txState(TxState::IN_PROGRESS), commitCalled(false), bgBasePath(bgBasePath), tenantName(tenantName), + transactional(transactional) { + databaseCreateErrorInjected = executor->getOptions().injectDatabaseCreateErrors && + Random::get().randomBool(executor->getOptions().databaseCreateErrorRatio); + if (databaseCreateErrorInjected) { + fdbDb = fdb::Database(executor->getClusterFileForErrorInjection()); + } else { + fdbDb = executor->selectDatabase(); + } + + if (transactional) { + if (tenantName) { + fdb::Tenant tenant = fdbDb.openTenant(*tenantName); + fdbTx = tenant.createTransaction(); + } else { + fdbTx = fdbDb.createTransaction(); + } + } + } + + virtual ~TransactionContextBase() { ASSERT(txState == TxState::DONE); } // A state machine: // IN_PROGRESS -> (ON_ERROR -> IN_PROGRESS)* [-> ON_ERROR] -> DONE enum class TxState { IN_PROGRESS, ON_ERROR, DONE }; - fdb::Transaction tx() override { return fdbTx; } + fdb::Database db() override { return fdbDb.atomic_load(); } + + fdb::Transaction tx() override { return fdbTx.atomic_load(); } // Set a continuation to be executed when a future gets ready void continueAfter(fdb::Future f, TTaskFct cont, bool retryOnError) override { @@ -94,6 +116,7 @@ public: // Complete the transaction with a commit void commit() override { + ASSERT(transactional); std::unique_lock lock(mutex); if (txState != TxState::IN_PROGRESS) { return; @@ -114,31 +137,90 @@ public: } txState = TxState::DONE; lock.unlock(); + + // No need for lock from here on, because only one thread + // can enter DONE state and handle it + if 
(retriedErrors.size() >= LARGE_NUMBER_OF_RETRIES) { fmt::print("Transaction succeeded after {} retries on errors: {}\n", retriedErrors.size(), fmt::join(retriedErrorCodes(), ", ")); } - // cancel transaction so that any pending operations on it - // fail gracefully - fdbTx.cancel(); - txActor->complete(fdb::Error::success()); + + if (transactional) { + // cancel transaction so that any pending operations on it + // fail gracefully + fdbTx.cancel(); + } cleanUp(); - contAfterDone(); + ASSERT(txState == TxState::DONE); + contAfterDone(fdb::Error::success()); } std::string getBGBasePath() override { return bgBasePath; } + virtual void onError(fdb::Error err) override { + std::unique_lock lock(mutex); + if (txState != TxState::IN_PROGRESS) { + // Ignore further errors, if the transaction is in the error handing mode or completed + return; + } + txState = TxState::ON_ERROR; + lock.unlock(); + + // No need to hold the lock from here on, because ON_ERROR state is handled sequentially, and + // other callbacks are simply ignored while it stays in this state + + if (!canRetry(err)) { + return; + } + + ASSERT(!onErrorFuture); + + if (databaseCreateErrorInjected && canBeInjectedDatabaseCreateError(err.code())) { + // Failed to create a database because of failure injection + // Restart by recreating the transaction in a valid database + auto thisRef = std::static_pointer_cast(shared_from_this()); + scheduler->schedule([thisRef]() { + fdb::Database db = thisRef->executor->selectDatabase(); + thisRef->fdbDb.atomic_store(db); + if (thisRef->transactional) { + if (thisRef->tenantName) { + fdb::Tenant tenant = db.openTenant(*thisRef->tenantName); + thisRef->fdbTx.atomic_store(tenant.createTransaction()); + } else { + thisRef->fdbTx.atomic_store(db.createTransaction()); + } + } + thisRef->restartTransaction(); + }); + } else if (transactional) { + onErrorArg = err; + onErrorFuture = tx().onError(err); + handleOnErrorFuture(); + } else { + transactionFailed(err); + } + } + protected: virtual void doContinueAfter(fdb::Future f, TTaskFct cont, bool retryOnError) = 0; + virtual void handleOnErrorFuture() = 0; + // Clean up transaction state after completing the transaction // Note that the object may live longer, because it is referenced // by not yet triggered callbacks - virtual void cleanUp() { + void cleanUp() { ASSERT(txState == TxState::DONE); ASSERT(!onErrorFuture); - txActor = {}; + cancelPendingFutures(); + } + + virtual void cancelPendingFutures() {} + + bool canBeInjectedDatabaseCreateError(fdb::Error::CodeType errCode) { + return errCode == error_code_no_cluster_file_found || errCode == error_code_connection_string_invalid; } // Complete the transaction with an (unretriable) error @@ -150,9 +232,12 @@ protected: } txState = TxState::DONE; lock.unlock(); - txActor->complete(err); + + // No need for lock from here on, because only one thread + // can enter DONE state and handle it + cleanUp(); - contAfterDone(); + contAfterDone(err); } // Handle result of an a transaction onError call @@ -163,14 +248,20 @@ protected: if (err) { transactionFailed(err); } else { - std::unique_lock lock(mutex); - txState = TxState::IN_PROGRESS; - commitCalled = false; - lock.unlock(); - txActor->start(); + restartTransaction(); } } + void restartTransaction() { + ASSERT(txState == TxState::ON_ERROR); + cancelPendingFutures(); + std::unique_lock lock(mutex); + txState = TxState::IN_PROGRESS; + commitCalled = false; + lock.unlock(); + startFct(shared_from_this()); + } + // Checks if a transaction can be retried. 
Fails the transaction if the check fails bool canRetry(fdb::Error lastErr) { ASSERT(txState == TxState::ON_ERROR); @@ -196,44 +287,77 @@ protected: return retriedErrorCodes; } + // Pointer to the transaction executor interface + // Set in contructor, stays immutable + ITransactionExecutor* const executor; + + // FDB database + // Provides a thread safe interface by itself (no need for mutex) + fdb::Database fdbDb; + // FDB transaction + // Provides a thread safe interface by itself (no need for mutex) fdb::Transaction fdbTx; - // Actor implementing the transaction worklflow - std::shared_ptr txActor; + // The function implementing the starting point of the transaction + // Set in constructor and reset on cleanup (no need for mutex) + TOpStartFct startFct; // Mutex protecting access to shared mutable state + // Only the state that is accessible unter IN_PROGRESS state + // must be protected by mutex std::mutex mutex; // Continuation to be called after completion of the transaction - TTaskFct contAfterDone; + // Set in contructor, stays immutable + const TOpContFct contAfterDone; // Reference to the scheduler - IScheduler* scheduler; + // Set in contructor, stays immutable + // Cannot be accessed in DONE state, workloads can be completed and the scheduler deleted + IScheduler* const scheduler; // Retry limit - int retryLimit; + // Set in contructor, stays immutable + const int retryLimit; // Transaction execution state + // Must be accessed under mutex TxState txState; - // onError future used in ON_ERROR state + // onError future + // used only in ON_ERROR state (no need for mutex) fdb::Future onErrorFuture; // The error code on which onError was called + // used only in ON_ERROR state (no need for mutex) fdb::Error onErrorArg; // The time point of calling onError + // used only in ON_ERROR state (no need for mutex) TimePoint onErrorCallTimePoint; // Transaction is committed or being committed + // Must be accessed under mutex bool commitCalled; // A history of errors on which the transaction was retried + // used only in ON_ERROR and DONE states (no need for mutex) std::vector retriedErrors; // blob granule base path - std::string bgBasePath; + // Set in contructor, stays immutable + const std::string bgBasePath; + + // Indicates if the database error was injected + // Accessed on initialization and in ON_ERROR state only (no need for mutex) + bool databaseCreateErrorInjected; + + // The tenant that we will run this transaction in + const std::optional tenantName; + + // Specifies whether the operation is transactional + const bool transactional; }; /** @@ -241,13 +365,16 @@ protected: */ class BlockingTransactionContext : public TransactionContextBase { public: - BlockingTransactionContext(fdb::Transaction tx, - std::shared_ptr txActor, - TTaskFct cont, + BlockingTransactionContext(ITransactionExecutor* executor, + TOpStartFct startFct, + TOpContFct cont, IScheduler* scheduler, int retryLimit, - std::string bgBasePath) - : TransactionContextBase(tx, txActor, cont, scheduler, retryLimit, bgBasePath) {} + std::string bgBasePath, + std::optional tenantName, + bool transactional) + : TransactionContextBase(executor, startFct, cont, scheduler, retryLimit, bgBasePath, tenantName, transactional) { + } protected: void doContinueAfter(fdb::Future f, TTaskFct cont, bool retryOnError) override { @@ -288,22 +415,8 @@ protected: onError(err); } - virtual void onError(fdb::Error err) override { - std::unique_lock lock(mutex); - if (txState != TxState::IN_PROGRESS) { - // Ignore further errors, if the 
transaction is in the error handing mode or completed - return; - } - txState = TxState::ON_ERROR; - lock.unlock(); - - if (!canRetry(err)) { - return; - } - - ASSERT(!onErrorFuture); - onErrorFuture = fdbTx.onError(err); - onErrorArg = err; + virtual void handleOnErrorFuture() override { + ASSERT(txState == TxState::ON_ERROR); auto start = timeNow(); fdb::Error err2 = onErrorFuture.blockUntilReady(); @@ -330,13 +443,16 @@ protected: */ class AsyncTransactionContext : public TransactionContextBase { public: - AsyncTransactionContext(fdb::Transaction tx, - std::shared_ptr txActor, - TTaskFct cont, + AsyncTransactionContext(ITransactionExecutor* executor, + TOpStartFct startFct, + TOpContFct cont, IScheduler* scheduler, int retryLimit, - std::string bgBasePath) - : TransactionContextBase(tx, txActor, cont, scheduler, retryLimit, bgBasePath) {} + std::string bgBasePath, + std::optional tenantName, + bool transactional) + : TransactionContextBase(executor, startFct, cont, scheduler, retryLimit, bgBasePath, tenantName, transactional) { + } protected: void doContinueAfter(fdb::Future f, TTaskFct cont, bool retryOnError) override { @@ -344,7 +460,7 @@ protected: if (txState != TxState::IN_PROGRESS) { return; } - callbackMap[f] = CallbackInfo{ f, cont, shared_from_this(), retryOnError, timeNow() }; + callbackMap[f] = CallbackInfo{ f, cont, shared_from_this(), retryOnError, timeNow(), false }; lock.unlock(); try { f.then([this](fdb::Future f) { futureReadyCallback(f, this); }); @@ -383,7 +499,6 @@ protected: if (txState != TxState::IN_PROGRESS) { return; } - lock.unlock(); fdb::Error err = f.error(); auto waitTimeUs = timeElapsedInUs(cbInfo.startTime, endTime); if (waitTimeUs > LONG_WAIT_TIME_US) { @@ -392,32 +507,23 @@ protected: err.code(), err.what()); } - if (err.code() == error_code_transaction_cancelled) { + if (err.code() == error_code_transaction_cancelled || cbInfo.cancelled) { return; } if (err.code() == error_code_success || !cbInfo.retryOnError) { scheduler->schedule(cbInfo.cont); return; } + // We keep lock until here to prevent transitions from the IN_PROGRESS state + // which could possibly lead to completion of the workload and destruction + // of the scheduler + lock.unlock(); onError(err); } - virtual void onError(fdb::Error err) override { - std::unique_lock lock(mutex); - if (txState != TxState::IN_PROGRESS) { - // Ignore further errors, if the transaction is in the error handing mode or completed - return; - } - txState = TxState::ON_ERROR; - lock.unlock(); + virtual void handleOnErrorFuture() override { + ASSERT(txState == TxState::ON_ERROR); - if (!canRetry(err)) { - return; - } - - ASSERT(!onErrorFuture); - onErrorArg = err; - onErrorFuture = tx().onError(err); onErrorCallTimePoint = timeNow(); onErrorThisRef = std::static_pointer_cast(shared_from_this()); try { @@ -457,17 +563,17 @@ protected: scheduler->schedule([thisRef]() { thisRef->handleOnErrorResult(); }); } - void cleanUp() override { - TransactionContextBase::cleanUp(); - + void cancelPendingFutures() override { // Cancel all pending operations // Note that the callbacks of the cancelled futures will still be called std::unique_lock lock(mutex); std::vector futures; for (auto& iter : callbackMap) { + iter.second.cancelled = true; futures.push_back(iter.second.future); } lock.unlock(); + for (auto& f : futures) { f.cancel(); } @@ -487,12 +593,16 @@ protected: std::shared_ptr thisRef; bool retryOnError; TimePoint startTime; + bool cancelled; }; // Map for keeping track of future waits and holding necessary object 
references + // It can be accessed at any time when callbacks are triggered, so it must always + // be mutex protected std::unordered_map callbackMap; // Holding reference to this for onError future C callback + // Accessed only in ON_ERROR state (no need for mutex) std::shared_ptr onErrorThisRef; }; @@ -503,30 +613,86 @@ class TransactionExecutorBase : public ITransactionExecutor { public: TransactionExecutorBase(const TransactionExecutorOptions& options) : options(options), scheduler(nullptr) {} + ~TransactionExecutorBase() { + if (tamperClusterFileThread.joinable()) { + tamperClusterFileThread.join(); + } + } + void init(IScheduler* scheduler, const char* clusterFile, const std::string& bgBasePath) override { this->scheduler = scheduler; this->clusterFile = clusterFile; this->bgBasePath = bgBasePath; + + ASSERT(!options.tmpDir.empty()); + emptyClusterFile.create(options.tmpDir, "fdbempty.cluster"); + invalidClusterFile.create(options.tmpDir, "fdbinvalid.cluster"); + invalidClusterFile.write(Random().get().randomStringLowerCase(1, 100)); + + emptyListClusterFile.create(options.tmpDir, "fdbemptylist.cluster"); + emptyListClusterFile.write(fmt::format("{}:{}@", + Random().get().randomStringLowerCase(3, 8), + Random().get().randomStringLowerCase(1, 100))); + + if (options.tamperClusterFile) { + tamperedClusterFile.create(options.tmpDir, "fdb.cluster"); + originalClusterFile = clusterFile; + this->clusterFile = tamperedClusterFile.getFileName(); + + // begin with a valid cluster file, but with a non-existing address + tamperedClusterFile.write(fmt::format("{}:{}@192.168.{}.{}:{}", + Random().get().randomStringLowerCase(3, 8), + Random().get().randomStringLowerCase(1, 100), + Random().get().randomInt(1, 254), + Random().get().randomInt(1, 254), + Random().get().randomInt(2000, 10000))); + + tamperClusterFileThread = std::thread([this]() { + std::this_thread::sleep_for(std::chrono::seconds(2)); + // now write an invalid connection string + tamperedClusterFile.write(fmt::format("{}:{}@", + Random().get().randomStringLowerCase(3, 8), + Random().get().randomStringLowerCase(1, 100))); + std::this_thread::sleep_for(std::chrono::seconds(2)); + // finally use correct cluster file contents + std::filesystem::copy_file(std::filesystem::path(originalClusterFile), + std::filesystem::path(tamperedClusterFile.getFileName()), + std::filesystem::copy_options::overwrite_existing); + }); + } } -protected: - // Execute the transaction on the given database instance - void executeOnDatabase(fdb::Database db, std::shared_ptr txActor, TTaskFct cont) { + const TransactionExecutorOptions& getOptions() override { return options; } + + void execute(TOpStartFct startFct, + TOpContFct cont, + std::optional tenantName, + bool transactional) override { try { - fdb::Transaction tx = db.createTransaction(); std::shared_ptr ctx; if (options.blockOnFutures) { ctx = std::make_shared( - tx, txActor, cont, scheduler, options.transactionRetryLimit, bgBasePath); + this, startFct, cont, scheduler, options.transactionRetryLimit, bgBasePath, tenantName, true); } else { ctx = std::make_shared( - tx, txActor, cont, scheduler, options.transactionRetryLimit, bgBasePath); + this, startFct, cont, scheduler, options.transactionRetryLimit, bgBasePath, tenantName, true); } - txActor->init(ctx); - txActor->start(); + startFct(ctx); } catch (...)
{ - txActor->complete(fdb::Error(error_code_operation_failed)); - cont(); + cont(fdb::Error(error_code_operation_failed)); + } + } + + std::string getClusterFileForErrorInjection() override { + switch (Random::get().randomInt(0, 3)) { + case 0: + return fmt::format("{}{}", "not-existing-file", Random::get().randomStringLowerCase(0, 2)); + case 1: + return emptyClusterFile.getFileName(); + case 2: + return invalidClusterFile.getFileName(); + default: // case 3 + return emptyListClusterFile.getFileName(); } } @@ -535,6 +701,12 @@ protected: std::string bgBasePath; std::string clusterFile; IScheduler* scheduler; + TmpFile emptyClusterFile; + TmpFile invalidClusterFile; + TmpFile emptyListClusterFile; + TmpFile tamperedClusterFile; + std::thread tamperClusterFileThread; + std::string originalClusterFile; }; /** @@ -549,19 +721,19 @@ public: void init(IScheduler* scheduler, const char* clusterFile, const std::string& bgBasePath) override { TransactionExecutorBase::init(scheduler, clusterFile, bgBasePath); for (int i = 0; i < options.numDatabases; i++) { - fdb::Database db(clusterFile); + fdb::Database db(this->clusterFile); databases.push_back(db); } } - void execute(std::shared_ptr txActor, TTaskFct cont) override { + fdb::Database selectDatabase() override { int idx = Random::get().randomInt(0, options.numDatabases - 1); - executeOnDatabase(databases[idx], txActor, cont); + return databases[idx]; } +private: void release() { databases.clear(); } -private: std::vector databases; }; @@ -572,10 +744,7 @@ class DBPerTransactionExecutor : public TransactionExecutorBase { public: DBPerTransactionExecutor(const TransactionExecutorOptions& options) : TransactionExecutorBase(options) {} - void execute(std::shared_ptr txActor, TTaskFct cont) override { - fdb::Database db(clusterFile.c_str()); - executeOnDatabase(db, txActor, cont); - } + fdb::Database selectDatabase() override { return fdb::Database(clusterFile.c_str()); } }; std::unique_ptr createTransactionExecutor(const TransactionExecutorOptions& options) { diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.h b/bindings/c/test/apitester/TesterTransactionExecutor.h index 31f6f3bc84..b0e5268d14 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.h +++ b/bindings/c/test/apitester/TesterTransactionExecutor.h @@ -38,6 +38,9 @@ class ITransactionContext : public std::enable_shared_from_this futures, TTaskFct cont); }; -/** - * Interface of an actor object implementing a concrete transaction - */ -class ITransactionActor { -public: - virtual ~ITransactionActor() {} +// Type of the lambda functions implementing a database operation +using TOpStartFct = std::function)>; - // Initialize with the given transaction context - virtual void init(std::shared_ptr ctx) = 0; - - // Start execution of the transaction, also called on retries - virtual void start() = 0; - - // Transaction completion result (error_code_success in case of success) - virtual fdb::Error getError() = 0; - - // Notification about the completion of the transaction - virtual void complete(fdb::Error err) = 0; -}; - -/** - * A helper base class for transaction actors - */ -class TransactionActorBase : public ITransactionActor { -public: - void init(std::shared_ptr ctx) override { context = ctx; } - fdb::Error getError() override { return error; } - void complete(fdb::Error err) override; - -protected: - std::shared_ptr ctx() { return context; } - -private: - std::shared_ptr context; - fdb::Error error = fdb::Error::success(); -}; - -// Type of the lambda functions 
implementing a transaction -using TTxStartFct = std::function)>; - -/** - * A wrapper class for transactions implemented by lambda functions - */ -class TransactionFct : public TransactionActorBase { -public: - TransactionFct(TTxStartFct startFct) : startFct(startFct) {} - void start() override { startFct(this->ctx()); } - -private: - TTxStartFct startFct; -}; +// Type of the lambda functions implementing a database operation +using TOpContFct = std::function; /** * Configuration of transaction execution mode @@ -124,11 +81,27 @@ struct TransactionExecutorOptions { // Create each transaction in a separate database instance bool databasePerTransaction = false; + // Enable injection of database create errors + bool injectDatabaseCreateErrors = false; + + // Test tampering cluster file contents + bool tamperClusterFile = false; + + // The probability of injected database create errors + // Used if injectDatabaseCreateErrors = true + double databaseCreateErrorRatio = 0.1; + // The size of the database instance pool int numDatabases = 1; + // The number of tenants to create in the cluster. If 0, no tenants are used. + int numTenants = 0; + // Maximum number of retries per transaction (0 - unlimited) int transactionRetryLimit = 0; + + // Temporary directory + std::string tmpDir; }; /** @@ -140,7 +113,13 @@ class ITransactionExecutor { public: virtual ~ITransactionExecutor() {} virtual void init(IScheduler* sched, const char* clusterFile, const std::string& bgBasePath) = 0; - virtual void execute(std::shared_ptr tx, TTaskFct cont) = 0; + virtual void execute(TOpStartFct start, + TOpContFct cont, + std::optional tenantName, + bool transactional) = 0; + virtual fdb::Database selectDatabase() = 0; + virtual std::string getClusterFileForErrorInjection() = 0; + virtual const TransactionExecutorOptions& getOptions() = 0; }; // Create a transaction executor for the given options diff --git a/bindings/c/test/apitester/TesterUtil.cpp b/bindings/c/test/apitester/TesterUtil.cpp index 0e19081180..6ec9f76f04 100644 --- a/bindings/c/test/apitester/TesterUtil.cpp +++ b/bindings/c/test/apitester/TesterUtil.cpp @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include namespace FdbApiTester { @@ -46,16 +49,6 @@ Random& Random::get() { return random; } -fdb::ByteString Random::randomStringLowerCase(int minLength, int maxLength) { - int length = randomInt(minLength, maxLength); - fdb::ByteString str; - str.reserve(length); - for (int i = 0; i < length; i++) { - str += (char)randomInt('a', 'z'); - } - return str; -} - bool Random::randomBool(double trueRatio) { return std::uniform_real_distribution(0.0, 1.0)(random) <= trueRatio; } @@ -106,4 +99,52 @@ KeyRangeArray copyKeyRangeArray(fdb::future_var::KeyRangeRefArray::Type array) { return out; }; +GranuleSummaryArray copyGranuleSummaryArray(fdb::future_var::GranuleSummaryRefArray::Type array) { + auto& [in_summaries, in_count] = array; + + GranuleSummaryArray out; + + for (int i = 0; i < in_count; ++i) { + fdb::native::FDBGranuleSummary nativeSummary = *in_summaries++; + fdb::GranuleSummary summary(nativeSummary); + out.push_back(summary); + } + return out; +}; + +TmpFile::~TmpFile() { + if (!filename.empty()) { + remove(); + } +} + +void TmpFile::create(std::string_view dir, std::string_view prefix) { + while (true) { + filename = fmt::format("{}/{}-{}", dir, prefix, Random::get().randomStringLowerCase(6, 6)); + if (!std::filesystem::exists(std::filesystem::path(filename))) { + break; + } + } + + // Create an empty tmp file + std::fstream 
tmpFile(filename, std::fstream::out); + if (!tmpFile.good()) { + throw TesterError(fmt::format("Failed to create temporary file {}\n", filename)); + } +} + +void TmpFile::write(std::string_view data) { + std::ofstream ofs(filename, std::fstream::out | std::fstream::binary); + if (!ofs.good()) { + throw TesterError(fmt::format("Failed to write to the temporary file {}\n", filename)); + } + ofs.write(data.data(), data.size()); +} + +void TmpFile::remove() { + if (!std::filesystem::remove(std::filesystem::path(filename))) { + fmt::print(stderr, "Failed to remove file {}\n", filename); + } +} + } // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterUtil.h b/bindings/c/test/apitester/TesterUtil.h index de5e5c8990..1ace2c9721 100644 --- a/bindings/c/test/apitester/TesterUtil.h +++ b/bindings/c/test/apitester/TesterUtil.h @@ -66,7 +66,20 @@ public: int randomInt(int min, int max); - fdb::ByteString randomStringLowerCase(int minLength, int maxLength); + template + StringType randomStringLowerCase(int minLength, int maxLength) { + int length = randomInt(minLength, maxLength); + StringType str; + str.reserve(length); + for (int i = 0; i < length; i++) { + str += (char)randomInt('a', 'z'); + } + return str; + } + + fdb::ByteString randomByteStringLowerCase(int minLength, int maxLength) { + return randomStringLowerCase(minLength, maxLength); + } bool randomBool(double trueRatio); @@ -120,6 +133,9 @@ KeyValueArray copyKeyValueArray(fdb::future_var::KeyValueRefArray::Type array); using KeyRangeArray = std::vector; KeyRangeArray copyKeyRangeArray(fdb::future_var::KeyRangeRefArray::Type array); +using GranuleSummaryArray = std::vector; +GranuleSummaryArray copyGranuleSummaryArray(fdb::future_var::GranuleSummaryRefArray::Type array); + static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Do not support non-little-endian systems"); // Converts a little-endian encoded number into an integral type. @@ -139,6 +155,19 @@ static fdb::ByteString toByteString(T value) { return output; } +// Creates a temporary file; file gets destroyed/deleted along with object destruction. 
+struct TmpFile { +public: + ~TmpFile(); + void create(std::string_view dir, std::string_view prefix); + void write(std::string_view data); + void remove(); + const std::string& getFileName() const { return filename; } + +private: + std::string filename; +}; + } // namespace FdbApiTester #endif diff --git a/bindings/c/test/apitester/TesterWorkload.cpp b/bindings/c/test/apitester/TesterWorkload.cpp index cbce118f10..8e7289f437 100644 --- a/bindings/c/test/apitester/TesterWorkload.cpp +++ b/bindings/c/test/apitester/TesterWorkload.cpp @@ -80,13 +80,14 @@ bool WorkloadConfig::getBoolOption(const std::string& name, bool defaultVal) con WorkloadBase::WorkloadBase(const WorkloadConfig& config) : manager(nullptr), tasksScheduled(0), numErrors(0), clientId(config.clientId), numClients(config.numClients), - failed(false), numTxCompleted(0) { + failed(false), numTxCompleted(0), numTxStarted(0), inProgress(false) { maxErrors = config.getIntOption("maxErrors", 10); workloadId = fmt::format("{}{}", config.name, clientId); } void WorkloadBase::init(WorkloadManager* manager) { this->manager = manager; + inProgress = true; } void WorkloadBase::printStats() { @@ -94,6 +95,7 @@ void WorkloadBase::printStats() { } void WorkloadBase::schedule(TTaskFct task) { + ASSERT(inProgress); if (failed) { return; } @@ -104,28 +106,49 @@ void WorkloadBase::schedule(TTaskFct task) { }); } -void WorkloadBase::execTransaction(std::shared_ptr tx, TTaskFct cont, bool failOnError) { +void WorkloadBase::execTransaction(TOpStartFct startFct, + TTaskFct cont, + std::optional tenant, + bool failOnError) { + doExecute(startFct, cont, tenant, failOnError, true); +} + +// Execute a non-transactional database operation within the workload +void WorkloadBase::execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError) { + doExecute(startFct, cont, {}, failOnError, false); +} + +void WorkloadBase::doExecute(TOpStartFct startFct, + TTaskFct cont, + std::optional tenant, + bool failOnError, + bool transactional) { + ASSERT(inProgress); if (failed) { return; } tasksScheduled++; - manager->txExecutor->execute(tx, [this, tx, cont, failOnError]() { - numTxCompleted++; - fdb::Error err = tx->getError(); - if (err.code() == error_code_success) { - cont(); - } else { - std::string msg = fmt::format("Transaction failed with error: {} ({})", err.code(), err.what()); - if (failOnError) { - error(msg); - failed = true; - } else { - info(msg); - cont(); - } - } - scheduledTaskDone(); - }); + numTxStarted++; + manager->txExecutor->execute( + startFct, + [this, startFct, cont, failOnError](fdb::Error err) { + numTxCompleted++; + if (err.code() == error_code_success) { + cont(); + } else { + std::string msg = fmt::format("Transaction failed with error: {} ({})", err.code(), err.what()); + if (failOnError) { + error(msg); + failed = true; + } else { + info(msg); + cont(); + } + } + scheduledTaskDone(); + }, + tenant, + transactional); } void WorkloadBase::info(const std::string& msg) { @@ -143,11 +166,13 @@ void WorkloadBase::error(const std::string& msg) { void WorkloadBase::scheduledTaskDone() { if (--tasksScheduled == 0) { + inProgress = false; if (numErrors > 0) { error(fmt::format("Workload failed with {} errors", numErrors.load())); } else { info("Workload successfully completed"); } + ASSERT(numTxStarted == numTxCompleted); manager->workloadDone(this, numErrors > 0); } } @@ -165,8 +190,11 @@ void WorkloadManager::add(std::shared_ptr workload, TTaskFct cont) { void WorkloadManager::run() { std::vector> initialWorkloads; - for (auto iter : 
workloads) { - initialWorkloads.push_back(iter.second.ref); + { + std::unique_lock lock(mutex); + for (auto iter : workloads) { + initialWorkloads.push_back(iter.second.ref); + } } for (auto iter : initialWorkloads) { iter->init(this); @@ -324,4 +352,4 @@ std::unordered_map& IWorkloadFactory::factories( return theFactories; } -} // namespace FdbApiTester \ No newline at end of file +} // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterWorkload.h b/bindings/c/test/apitester/TesterWorkload.h index beb3082c5c..ea1c6816f9 100644 --- a/bindings/c/test/apitester/TesterWorkload.h +++ b/bindings/c/test/apitester/TesterWorkload.h @@ -82,6 +82,9 @@ struct WorkloadConfig { // Total number of clients int numClients; + // Number of Tenants + int numTenants; + // Selected FDB API version int apiVersion; @@ -116,12 +119,13 @@ protected: void schedule(TTaskFct task); // Execute a transaction within the workload - void execTransaction(std::shared_ptr tx, TTaskFct cont, bool failOnError = true); + void execTransaction(TOpStartFct startFct, + TTaskFct cont, + std::optional tenant = std::optional(), + bool failOnError = true); - // Execute a transaction within the workload, a convenience method for a tranasaction defined by a lambda function - void execTransaction(TTxStartFct start, TTaskFct cont, bool failOnError = true) { - execTransaction(std::make_shared(start), cont, failOnError); - } + // Execute a non-transactional database operation within the workload + void execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError = true); // Log an error message, increase error counter void error(const std::string& msg); @@ -135,6 +139,12 @@ protected: private: WorkloadManager* manager; + void doExecute(TOpStartFct startFct, + TTaskFct cont, + std::optional tenant, + bool failOnError, + bool transactional); + // Decrease scheduled task counter, notify the workload manager // that the task is done if no more tasks schedule void scheduledTaskDone(); @@ -164,6 +174,12 @@ protected: // Number of completed transactions std::atomic numTxCompleted; + + // Number of started transactions + std::atomic numTxStarted; + + // Workload is in progress (initialized, but not completed) + std::atomic inProgress; }; // Workload manager diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml new file mode 100644 index 0000000000..788bd04d85 --- /dev/null +++ b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml @@ -0,0 +1,22 @@ +[[test]] +title = 'Blob Granule Errors Multi Threaded' +multiThreaded = true +buggify = true +minFdbThreads = 2
+maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + + [[test.workload]] + name = 'BlobGranuleErrors' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 \ No newline at end of file diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml new file mode 100644 index 0000000000..85e78975f6 --- /dev/null +++ b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml @@ -0,0 +1,15 @@ +[[test]] +title = 'Blob Granule Errors Single Threaded' +minClients = 1 +maxClients = 3 +multiThreaded = false + + [[test.workload]] + name = 'BlobGranuleErrors' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 \ No newline at end of file diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index 5a86678cec..1d79dd754c 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -36,6 +36,8 @@ namespace FdbApiTester { namespace { +#define API_VERSION_CLIENT_TMP_DIR 720 + enum TesterOptionId { OPT_CONNFILE, OPT_HELP, @@ -46,6 +48,7 @@ enum TesterOptionId { OPT_KNOB, OPT_EXTERNAL_CLIENT_LIBRARY, OPT_EXTERNAL_CLIENT_DIRECTORY, + OPT_FUTURE_VERSION_CLIENT_LIBRARY, OPT_TMP_DIR, OPT_DISABLE_LOCAL_CLIENT, OPT_TEST_FILE, @@ -72,6 +75,7 @@ CSimpleOpt::SOption TesterOptionDefs[] = // { OPT_KNOB, "--knob-", SO_REQ_SEP }, { OPT_EXTERNAL_CLIENT_LIBRARY, "--external-client-library", SO_REQ_SEP }, { OPT_EXTERNAL_CLIENT_DIRECTORY, "--external-client-dir", SO_REQ_SEP }, + { OPT_FUTURE_VERSION_CLIENT_LIBRARY, "--future-version-client-library", SO_REQ_SEP }, { OPT_TMP_DIR, "--tmp-dir", SO_REQ_SEP }, { OPT_DISABLE_LOCAL_CLIENT, "--disable-local-client", SO_NONE }, { OPT_TEST_FILE, "-f", SO_REQ_SEP }, @@ -110,6 +114,8 @@ void printProgramUsage(const char* execName) { " Path to the external client library.\n" " --external-client-dir DIR\n" " Directory containing external client libraries.\n" + " --future-version-client-library FILE\n" + " Path to a client library to be used with a future protocol version.\n" " --tmp-dir DIR\n" " Directory for temporary files of the client.\n" " --disable-local-client DIR\n" @@ -204,6 +210,9 @@ bool processArg(TesterOptions& options, const CSimpleOpt& args) { case OPT_EXTERNAL_CLIENT_DIRECTORY: options.externalClientDir = args.OptionArg(); break; + case OPT_FUTURE_VERSION_CLIENT_LIBRARY: + options.futureVersionClientLibrary = args.OptionArg(); + break; case OPT_TMP_DIR: options.tmpDir = args.OptionArg(); break; @@ -278,7 +287,7 @@ void fdb_check(fdb::Error e) { } void applyNetworkOptions(TesterOptions& options) { - if (!options.tmpDir.empty()) { + if (!options.tmpDir.empty() && options.apiVersion >= API_VERSION_CLIENT_TMP_DIR) { fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_TMP_DIR, options.tmpDir); } if (!options.externalClientLibrary.empty()) { @@ -296,6 +305,11 @@ void applyNetworkOptions(TesterOptions& options) { } } + if (!options.futureVersionClientLibrary.empty()) { + fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_FUTURE_VERSION_CLIENT_LIBRARY, + options.futureVersionClientLibrary); + } + if (options.testSpec.multiThreaded) { 
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_THREADS_PER_VERSION, options.numFdbThreads); } @@ -308,6 +322,10 @@ void applyNetworkOptions(TesterOptions& options) { fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_BUGGIFY_ENABLE); } + if (options.testSpec.disableClientBypass && options.apiVersion >= 720) { + fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_DISABLE_CLIENT_BYPASS); + } + if (options.trace) { fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, options.traceDir); fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_FORMAT, options.traceFormat); @@ -338,6 +356,12 @@ void randomizeOptions(TesterOptions& options) { options.numClientThreads = random.randomInt(options.testSpec.minClientThreads, options.testSpec.maxClientThreads); options.numDatabases = random.randomInt(options.testSpec.minDatabases, options.testSpec.maxDatabases); options.numClients = random.randomInt(options.testSpec.minClients, options.testSpec.maxClients); + + // Choose a random number of tenants. If a test is configured to allow 0 tenants, then use 0 tenants half the time. + if (options.testSpec.maxTenants >= options.testSpec.minTenants && + (options.testSpec.minTenants > 0 || random.randomBool(0.5))) { + options.numTenants = random.randomInt(options.testSpec.minTenants, options.testSpec.maxTenants); + } } bool runWorkloads(TesterOptions& options) { @@ -346,7 +370,12 @@ bool runWorkloads(TesterOptions& options) { txExecOptions.blockOnFutures = options.testSpec.blockOnFutures; txExecOptions.numDatabases = options.numDatabases; txExecOptions.databasePerTransaction = options.testSpec.databasePerTransaction; + // 7.1 and older releases crash on database create errors + txExecOptions.injectDatabaseCreateErrors = options.testSpec.buggify && options.apiVersion > 710; txExecOptions.transactionRetryLimit = options.transactionRetryLimit; + txExecOptions.tmpDir = options.tmpDir.empty() ? 
std::string("/tmp") : options.tmpDir; + txExecOptions.tamperClusterFile = options.testSpec.tamperClusterFile; + txExecOptions.numTenants = options.numTenants; std::vector> workloads; workloads.reserve(options.testSpec.workloads.size() * options.numClients); @@ -358,6 +387,7 @@ bool runWorkloads(TesterOptions& options) { config.options = workloadSpec.options; config.clientId = i; config.numClients = options.numClients; + config.numTenants = options.numTenants; config.apiVersion = options.apiVersion; std::shared_ptr workload = IWorkloadFactory::create(workloadSpec.name, config); if (!workload) { @@ -419,7 +449,7 @@ int main(int argc, char** argv) { } randomizeOptions(options); - fdb::selectApiVersion(options.apiVersion); + fdb::selectApiVersionCapped(options.apiVersion); applyNetworkOptions(options); fdb::network::setup(); diff --git a/bindings/c/test/apitester/local_tests/CApiCorrectnessSingleThr.toml b/bindings/c/test/apitester/local_tests/CApiCorrectnessSingleThr.toml new file mode 100644 index 0000000000..9e6fc350ea --- /dev/null +++ b/bindings/c/test/apitester/local_tests/CApiCorrectnessSingleThr.toml @@ -0,0 +1,29 @@ +[[test]] +title = 'API Correctness Single Threaded' +minClients = 1 +maxClients = 3 +minDatabases = 1 +maxDatabases = 3 +multiThreaded = false +disableClientBypass = true + + [[test.workload]] + name = 'ApiCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 + readExistingKeysRatio = 0.9 + + [[test.workload]] + name = 'AtomicOpsCorrectness' + initialSize = 0 + numRandomOperations = 100 + + [[test.workload]] + name = 'WatchAndWait' + initialSize = 0 + numRandomOperations = 10 diff --git a/bindings/c/test/apitester/tests/CApiMultiTenantCorrectnessMultiThr.toml b/bindings/c/test/apitester/tests/CApiMultiTenantCorrectnessMultiThr.toml new file mode 100644 index 0000000000..2a5a0d30e1 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiMultiTenantCorrectnessMultiThr.toml @@ -0,0 +1,21 @@ +[[test]] +title = 'Multi-tenant API Correctness Multi Threaded' +multiThreaded = true +buggify = true +minFdbThreads = 2 +maxFdbThreads = 8 +minClients = 2 +maxClients = 8 +minTenants = 2 +maxTenants = 5 + + [[test.workload]] + name = 'ApiCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 5 + initialSize = 100 + numRandomOperations = 200 + readExistingKeysRatio = 0.9 \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/CApiTamperClusterFile.toml b/bindings/c/test/apitester/tests/CApiTamperClusterFile.toml new file mode 100644 index 0000000000..60a9715bd8 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiTamperClusterFile.toml @@ -0,0 +1,24 @@ +[[test]] +title = 'Test tampering the cluster file' +multiThreaded = true +buggify = true +tamperClusterFile = true +minFdbThreads = 2 +maxFdbThreads = 4 +minDatabases = 2 +maxDatabases = 4 +minClientThreads = 2 +maxClientThreads = 4 +minClients = 2 +maxClients = 4 + + [[test.workload]] + name = 'ApiCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 + readExistingKeysRatio = 0.9 \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml b/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml new file mode 100644 index 0000000000..84531ea9c8 
--- /dev/null +++ b/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml @@ -0,0 +1,23 @@ +[[test]] +title = 'Mixed Workload for Upgrade Tests with a Multi-Threaded Client' +multiThreaded = true +buggify = true +databasePerTransaction = false +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + + [[test.workload]] + name = 'ApiBlobGranuleCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + runUntilStop = true \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml b/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml index 86e65c5918..94bf4e0509 100644 --- a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml +++ b/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml @@ -32,4 +32,14 @@ maxClients = 8 maxKeysPerTransaction = 50 initialSize = 100 runUntilStop = true - readExistingKeysRatio = 0.9 \ No newline at end of file + readExistingKeysRatio = 0.9 + + [[test.workload]] + name = 'AtomicOpsCorrectness' + initialSize = 0 + runUntilStop = true + + [[test.workload]] + name = 'WatchAndWait' + initialSize = 0 + runUntilStop = true \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml b/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml index 42df76521b..daf070b31b 100644 --- a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml +++ b/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml @@ -30,4 +30,14 @@ maxClients = 8 maxKeysPerTransaction = 50 initialSize = 100 runUntilStop = true - readExistingKeysRatio = 0.9 \ No newline at end of file + readExistingKeysRatio = 0.9 + + [[test.workload]] + name = 'AtomicOpsCorrectness' + initialSize = 0 + runUntilStop = true + + [[test.workload]] + name = 'WatchAndWait' + initialSize = 0 + runUntilStop = true \ No newline at end of file diff --git a/bindings/c/test/client_memory_test.cpp b/bindings/c/test/client_memory_test.cpp index c6cff85574..3ea2f74a8a 100644 --- a/bindings/c/test/client_memory_test.cpp +++ b/bindings/c/test/client_memory_test.cpp @@ -44,7 +44,7 @@ int main(int argc, char** argv) { if (argc != 2) { printf("Usage: %s ", argv[0]); } - fdb_check(fdb_select_api_version(720)); + fdb_check(fdb_select_api_version(FDB_API_VERSION)); fdb_check(fdb_setup_network()); std::thread network_thread{ &fdb_run_network }; diff --git a/bindings/c/test/fdb_api.hpp b/bindings/c/test/fdb_api.hpp index 5dede1bbf3..5403f59b78 100644 --- a/bindings/c/test/fdb_api.hpp +++ b/bindings/c/test/fdb_api.hpp @@ -46,6 +46,8 @@ namespace native { #include } +#define TENANT_API_VERSION_GUARD 720 + using ByteString = std::basic_string; using BytesRef = std::basic_string_view; using CharsRef = std::string_view; @@ -62,6 +64,22 @@ struct KeyRange { Key beginKey; Key endKey; }; +struct GranuleSummary { + KeyRange keyRange; + int64_t snapshotVersion; + int64_t snapshotSize; + int64_t deltaVersion; + int64_t deltaSize; + + GranuleSummary(const native::FDBGranuleSummary& nativeSummary) { + keyRange.beginKey = fdb::Key(nativeSummary.key_range.begin_key, nativeSummary.key_range.begin_key_length); + keyRange.endKey = fdb::Key(nativeSummary.key_range.end_key, nativeSummary.key_range.end_key_length); + snapshotVersion = nativeSummary.snapshot_version; + 
snapshotSize = nativeSummary.snapshot_size; + deltaVersion = nativeSummary.delta_version; + deltaSize = nativeSummary.delta_size; + } +}; inline uint8_t const* toBytePtr(char const* ptr) noexcept { return reinterpret_cast(ptr); @@ -114,7 +132,7 @@ public: explicit Error(CodeType err) noexcept : err(err) {} - char const* what() noexcept { return native::fdb_get_error(err); } + char const* what() const noexcept { return native::fdb_get_error(err); } explicit operator bool() const noexcept { return err != 0; } @@ -137,6 +155,13 @@ struct None { struct Type {}; static Error extract(native::FDBFuture*, Type&) noexcept { return Error(0); } }; +struct Bool { + using Type = native::fdb_bool_t; + static Error extract(native::FDBFuture* f, Type& out) noexcept { + auto err = native::fdb_future_get_bool(f, &out); + return Error(err); + } +}; struct Int64 { using Type = int64_t; static Error extract(native::FDBFuture* f, Type& out) noexcept { @@ -200,6 +225,27 @@ struct KeyRangeRefArray { } }; +struct GranuleSummaryRef : native::FDBGranuleSummary { + fdb::KeyRef beginKey() const noexcept { + return fdb::KeyRef(native::FDBGranuleSummary::key_range.begin_key, + native::FDBGranuleSummary::key_range.begin_key_length); + } + fdb::KeyRef endKey() const noexcept { + return fdb::KeyRef(native::FDBGranuleSummary::key_range.end_key, + native::FDBGranuleSummary::key_range.end_key_length); + } +}; + +struct GranuleSummaryRefArray { + using Type = std::tuple; + static Error extract(native::FDBFuture* f, Type& out) noexcept { + auto& [out_summaries, out_count] = out; + auto err = native::fdb_future_get_granule_summary_array( + f, reinterpret_cast(&out_summaries), &out_count); + return Error(err); + } +}; + } // namespace future_var [[noreturn]] inline void throwError(std::string_view preamble, Error err) { @@ -310,6 +356,7 @@ public: class Future { protected: friend class Transaction; + friend class Database; friend std::hash; std::shared_ptr f; @@ -468,6 +515,14 @@ public: Transaction(const Transaction&) noexcept = default; Transaction& operator=(const Transaction&) noexcept = default; + void atomic_store(Transaction other) { std::atomic_store(&tr, other.tr); } + + Transaction atomic_load() { + Transaction retVal; + retVal.tr = std::atomic_load(&tr); + return retVal; + } + bool valid() const noexcept { return tr != nullptr; } explicit operator bool() const noexcept { return valid(); } @@ -559,9 +614,9 @@ public: reverse); } - TypedFuture getBlobGranuleRanges(KeyRef begin, KeyRef end) { + TypedFuture getBlobGranuleRanges(KeyRef begin, KeyRef end, int rangeLimit) { return native::fdb_transaction_get_blob_granule_ranges( - tr.get(), begin.data(), intSize(begin), end.data(), intSize(end)); + tr.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit); } Result readBlobGranules(KeyRef begin, @@ -573,6 +628,14 @@ public: tr.get(), begin.data(), intSize(begin), end.data(), intSize(end), begin_version, read_version, context)); } + TypedFuture summarizeBlobGranules(KeyRef begin, + KeyRef end, + int64_t summaryVersion, + int rangeLimit) { + return native::fdb_transaction_summarize_blob_granules( + tr.get(), begin.data(), intSize(begin), end.data(), intSize(end), summaryVersion, rangeLimit); + } + TypedFuture watch(KeyRef key) { return native::fdb_transaction_watch(tr.get(), key.data(), intSize(key)); } @@ -621,6 +684,7 @@ public: static void createTenant(Transaction tr, BytesRef name) { tr.setOption(FDBTransactionOption::FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, BytesRef()); 
tr.setOption(FDBTransactionOption::FDB_TR_OPTION_LOCK_AWARE, BytesRef()); + tr.setOption(FDBTransactionOption::FDB_TR_OPTION_RAW_ACCESS, BytesRef()); tr.set(toBytesRef(fmt::format("{}{}", tenantManagementMapPrefix, toCharsRef(name))), BytesRef()); } @@ -662,6 +726,14 @@ public: } Database() noexcept : db(nullptr) {} + void atomic_store(Database other) { std::atomic_store(&db, other.db); } + + Database atomic_load() { + Database retVal; + retVal.db = std::atomic_load(&db); + return retVal; + } + Error setOptionNothrow(FDBDatabaseOption option, int64_t value) noexcept { return Error(native::fdb_database_set_option( db.get(), option, reinterpret_cast(&value), static_cast(sizeof(value)))); @@ -707,10 +779,50 @@ public: throwError("Failed to create transaction: ", err); return Transaction(tx_native); } + + TypedFuture listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) { + if (!db) + throw std::runtime_error("listBlobbifiedRanges from null database"); + return native::fdb_database_list_blobbified_ranges( + db.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit); + } + + TypedFuture verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) { + if (!db) + throw std::runtime_error("verifyBlobRange from null database"); + return native::fdb_database_verify_blob_range( + db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version); + } + + TypedFuture blobbifyRange(KeyRef begin, KeyRef end) { + if (!db) + throw std::runtime_error("blobbifyRange from null database"); + return native::fdb_database_blobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end)); + } + + TypedFuture unblobbifyRange(KeyRef begin, KeyRef end) { + if (!db) + throw std::runtime_error("unblobbifyRange from null database"); + return native::fdb_database_unblobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end)); + } + + TypedFuture purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) { + if (!db) + throw std::runtime_error("purgeBlobGranules from null database"); + native::fdb_bool_t forceBool = force; + return native::fdb_database_purge_blob_granules( + db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version, forceBool); + } + + TypedFuture waitPurgeGranulesComplete(KeyRef purgeKey) { + if (!db) + throw std::runtime_error("waitPurgeGranulesComplete from null database"); + return native::fdb_database_wait_purge_granules_complete(db.get(), purgeKey.data(), intSize(purgeKey)); + } }; inline Error selectApiVersionNothrow(int version) { - if (version < 720) { + if (version < TENANT_API_VERSION_GUARD) { Tenant::tenantManagementMapPrefix = "\xff\xff/management/tenant_map/"; } return Error(native::fdb_select_api_version(version)); @@ -722,6 +834,20 @@ inline void selectApiVersion(int version) { } } +inline Error selectApiVersionCappedNothrow(int version) { + if (version < TENANT_API_VERSION_GUARD) { + Tenant::tenantManagementMapPrefix = "\xff\xff/management/tenant_map/"; + } + return Error( + native::fdb_select_api_version_impl(version, std::min(native::fdb_get_max_api_version(), FDB_API_VERSION))); +} + +inline void selectApiVersionCapped(int version) { + if (auto err = selectApiVersionCappedNothrow(version)) { + throwError(fmt::format("ERROR: fdb_select_api_version_capped({}): ", version), err); + } +} + } // namespace fdb template <> diff --git a/bindings/c/test/fdb_c90_test.c b/bindings/c/test/fdb_c90_test.c index e2011286ed..1bcdf63284 100644 --- a/bindings/c/test/fdb_c90_test.c +++
b/bindings/c/test/fdb_c90_test.c @@ -4,6 +4,6 @@ int main(int argc, char* argv[]) { (void)argc; (void)argv; - fdb_select_api_version(720); + fdb_select_api_version(FDB_API_VERSION); return 0; } diff --git a/bindings/c/test/fdb_c_shim_tests.py b/bindings/c/test/fdb_c_shim_tests.py index 138d94415f..b18ca3968d 100644 --- a/bindings/c/test/fdb_c_shim_tests.py +++ b/bindings/c/test/fdb_c_shim_tests.py @@ -7,13 +7,18 @@ import subprocess import sys import os -sys.path[:0] = [os.path.join(os.path.dirname(__file__), '..', '..', '..', 'tests', 'TestRunner')] +sys.path[:0] = [os.path.join(os.path.dirname( + __file__), '..', '..', '..', 'tests', 'TestRunner')] + +# fmt: off from binary_download import FdbBinaryDownloader, CURRENT_VERSION from local_cluster import LocalCluster, random_secret_string +# fmt: on LAST_RELEASE_VERSION = "7.1.5" TESTER_STATS_INTERVAL_SEC = 5 DEFAULT_TEST_FILE = "CApiCorrectnessMultiThr.toml" +IMPLIBSO_ERROR_CODE = -6 # SIGABORT def version_from_str(ver_str): @@ -55,7 +60,8 @@ class TestEnv(LocalCluster): self.set_env_var("LD_LIBRARY_PATH", self.downloader.lib_dir(version)) client_lib = self.downloader.lib_path(version) assert client_lib.exists(), "{} does not exist".format(client_lib) - self.client_lib_external = self.tmp_dir.joinpath("libfdb_c_external.so") + self.client_lib_external = self.tmp_dir.joinpath( + "libfdb_c_external.so") shutil.copyfile(client_lib, self.client_lib_external) def __enter__(self): @@ -91,6 +97,9 @@ class FdbCShimTests: assert self.unit_tests_bin.exists(), "{} does not exist".format(self.unit_tests_bin) self.api_tester_bin = Path(args.api_tester_bin).resolve() assert self.api_tester_bin.exists(), "{} does not exist".format(self.api_tests_bin) + self.shim_lib_tester_bin = Path(args.shim_lib_tester_bin).resolve() + assert self.shim_lib_tester_bin.exists( + ), "{} does not exist".format(self.shim_lib_tester_bin) self.api_test_dir = Path(args.api_test_dir).resolve() assert self.api_test_dir.exists(), "{} does not exist".format(self.api_test_dir) self.downloader = FdbBinaryDownloader(args.build_dir) @@ -98,6 +107,7 @@ class FdbCShimTests: self.platform = platform.machine() if (self.platform == "x86_64"): self.downloader.download_old_binaries(LAST_RELEASE_VERSION) + self.downloader.download_old_binaries("7.0.0") def build_c_api_tester_args(self, test_env, test_file): test_file_path = self.api_test_dir.joinpath(test_file) @@ -128,7 +138,8 @@ class FdbCShimTests: with TestEnv(self.build_dir, self.downloader, version) as test_env: cmd_args = self.build_c_api_tester_args(test_env, test_file) env_vars = os.environ.copy() - env_vars["LD_LIBRARY_PATH"] = self.downloader.lib_dir(version) + env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = self.downloader.lib_path( + version) test_env.exec_client_command(cmd_args, env_vars) def run_c_unit_tests(self, version): @@ -143,38 +154,118 @@ class FdbCShimTests: test_env.client_lib_external ] env_vars = os.environ.copy() - env_vars["LD_LIBRARY_PATH"] = self.downloader.lib_dir(version) + env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = self.downloader.lib_path( + version) test_env.exec_client_command(cmd_args, env_vars) - def test_invalid_c_client_lib_env_var(self, version): + def run_c_shim_lib_tester( + self, + version, + test_env, + api_version=None, + invalid_lib_path=False, + call_set_path=False, + set_env_path=False, + set_ld_lib_path=False, + use_external_lib=True, + expected_ret_code=0 + ): print('-' * 80) - print("Test invalid FDB_C_CLIENT_LIBRARY_PATH value") + if api_version is None: + api_version = 
api_version_from_str(version) + test_flags = [] + if invalid_lib_path: + test_flags.append("invalid_lib_path") + if call_set_path: + test_flags.append("call_set_path") + if set_ld_lib_path: + test_flags.append("set_ld_lib_path") + if use_external_lib: + test_flags.append("use_external_lib") + else: + test_flags.append("use_local_lib") + print("C Shim Tests - version: {}, API version: {}, {}".format(version, + api_version, ", ".join(test_flags))) print('-' * 80) - with TestEnv(self.build_dir, self.downloader, version) as test_env: - cmd_args = self.build_c_api_tester_args(test_env, DEFAULT_TEST_FILE) - env_vars = os.environ.copy() - env_vars["FDB_C_CLIENT_LIBRARY_PATH"] = "dummy" - test_env.exec_client_command(cmd_args, env_vars, 1) - - def test_valid_c_client_lib_env_var(self, version): - print('-' * 80) - print("Test valid FDB_C_CLIENT_LIBRARY_PATH value") - print('-' * 80) - with TestEnv(self.build_dir, self.downloader, version) as test_env: - cmd_args = self.build_c_api_tester_args(test_env, DEFAULT_TEST_FILE) - env_vars = os.environ.copy() - env_vars["FDB_C_CLIENT_LIBRARY_PATH"] = self.downloader.lib_path(version) - test_env.exec_client_command(cmd_args, env_vars) + cmd_args = [ + self.shim_lib_tester_bin, + "--cluster-file", + test_env.cluster_file, + "--api-version", + str(api_version), + ] + if call_set_path: + cmd_args = cmd_args + [ + "--local-client-library", + ("dummy" if invalid_lib_path else self.downloader.lib_path(version)) + ] + if use_external_lib: + cmd_args = cmd_args + [ + "--disable-local-client", + "--external-client-library", + test_env.client_lib_external + ] + env_vars = os.environ.copy() + env_vars["LD_LIBRARY_PATH"] = ( + self.downloader.lib_dir(version) if set_ld_lib_path else "") + if set_env_path: + env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = ( + "dummy" if invalid_lib_path else self.downloader.lib_path(version)) + test_env.exec_client_command(cmd_args, env_vars, expected_ret_code) def run_tests(self): + # Test the API workload with the dev version + self.run_c_api_test(CURRENT_VERSION, DEFAULT_TEST_FILE) + + # Run unit tests with the dev version + self.run_c_unit_tests(CURRENT_VERSION) + + with TestEnv(self.build_dir, self.downloader, CURRENT_VERSION) as test_env: + # Test lookup of the client library over LD_LIBRARY_PATH + self.run_c_shim_lib_tester( + CURRENT_VERSION, test_env, set_ld_lib_path=True) + + # Test setting the client library path over an API call + self.run_c_shim_lib_tester( + CURRENT_VERSION, test_env, call_set_path=True) + + # Test setting the client library path over an environment variable + self.run_c_shim_lib_tester( + CURRENT_VERSION, test_env, set_env_path=True) + + # Test using the loaded client library as the local client + self.run_c_shim_lib_tester( + CURRENT_VERSION, test_env, call_set_path=True, use_external_lib=False) + + # Test setting an invalid client library path over an API call + self.run_c_shim_lib_tester( + CURRENT_VERSION, test_env, call_set_path=True, invalid_lib_path=True, expected_ret_code=IMPLIBSO_ERROR_CODE) + + # Test setting an invalid client library path over an environment variable + self.run_c_shim_lib_tester( + CURRENT_VERSION, test_env, set_env_path=True, invalid_lib_path=True, expected_ret_code=IMPLIBSO_ERROR_CODE) + + # Test calling a function that exists in the loaded library, but not for the selected API version + self.run_c_shim_lib_tester( + CURRENT_VERSION, test_env, call_set_path=True, api_version=700) + # binary downloads are currently available only for x86_64 - if (self.platform == "x86_64"): + if 
self.platform == "x86_64": + # Test the API workload with the release version self.run_c_api_test(LAST_RELEASE_VERSION, DEFAULT_TEST_FILE) - self.run_c_api_test(CURRENT_VERSION, DEFAULT_TEST_FILE) - self.run_c_unit_tests(CURRENT_VERSION) - self.test_invalid_c_client_lib_env_var(CURRENT_VERSION) - self.test_valid_c_client_lib_env_var(CURRENT_VERSION) + with TestEnv(self.build_dir, self.downloader, LAST_RELEASE_VERSION) as test_env: + # Test using the loaded client library as the local client + self.run_c_shim_lib_tester( + LAST_RELEASE_VERSION, test_env, call_set_path=True, use_external_lib=False) + + # Test the client library of the release version in combination with the dev API version + self.run_c_shim_lib_tester( + LAST_RELEASE_VERSION, test_env, call_set_path=True, api_version=api_version_from_str(CURRENT_VERSION), expected_ret_code=1) + + # Test calling a function that does not exist in the loaded library + self.run_c_shim_lib_tester( + "7.0.0", test_env, call_set_path=True, api_version=700, expected_ret_code=IMPLIBSO_ERROR_CODE) if __name__ == "__main__": @@ -194,12 +285,26 @@ if __name__ == "__main__": help="FDB build directory", required=True, ) - parser.add_argument('--unit-tests-bin', type=str, - help='Path to the fdb_c_shim_unit_tests executable.') - parser.add_argument('--api-tester-bin', type=str, - help='Path to the fdb_c_shim_api_tester executable.') - parser.add_argument('--api-test-dir', type=str, - help='Path to a directory with api test definitions.') + parser.add_argument( + '--unit-tests-bin', + type=str, + help='Path to the fdb_c_shim_unit_tests executable.', + required=True) + parser.add_argument( + '--api-tester-bin', + type=str, + help='Path to the fdb_c_shim_api_tester executable.', + required=True) + parser.add_argument( + '--shim-lib-tester-bin', + type=str, + help='Path to the fdb_c_shim_lib_tester executable.', + required=True) + parser.add_argument( + '--api-test-dir', + type=str, + help='Path to a directory with api test definitions.', + required=True) args = parser.parse_args() test = FdbCShimTests(args) test.run_tests() diff --git a/bindings/c/test/mako/blob_granules.cpp b/bindings/c/test/mako/blob_granules.cpp index af805f2e56..1071737211 100644 --- a/bindings/c/test/mako/blob_granules.cpp +++ b/bindings/c/test/mako/blob_granules.cpp @@ -26,6 +26,9 @@ extern thread_local mako::Logger logr; +// FIXME: use the same implementation as the api tester! this implementation was from back when mako was written in C +// and is inferior. 
+ namespace mako::blob_granules::local_file { int64_t startLoad(const char* filename, diff --git a/bindings/c/test/performance_test.c b/bindings/c/test/performance_test.c index 5cd9f64bc0..ab90395e05 100644 --- a/bindings/c/test/performance_test.c +++ b/bindings/c/test/performance_test.c @@ -641,7 +641,7 @@ void runTests(struct ResultSet* rs) { int main(int argc, char** argv) { srand(time(NULL)); struct ResultSet* rs = newResultSet(); - checkError(fdb_select_api_version(720), "select API version", rs); + checkError(fdb_select_api_version(FDB_API_VERSION), "select API version", rs); printf("Running performance test at client version: %s\n", fdb_get_client_version()); valueStr = (uint8_t*)malloc((sizeof(uint8_t)) * valueSize); diff --git a/bindings/c/test/ryw_benchmark.c b/bindings/c/test/ryw_benchmark.c index cf2754bcec..731a2ce0a2 100644 --- a/bindings/c/test/ryw_benchmark.c +++ b/bindings/c/test/ryw_benchmark.c @@ -285,7 +285,7 @@ void runTests(struct ResultSet* rs) { int main(int argc, char** argv) { srand(time(NULL)); struct ResultSet* rs = newResultSet(); - checkError(fdb_select_api_version(720), "select API version", rs); + checkError(fdb_select_api_version(FDB_API_VERSION), "select API version", rs); printf("Running RYW Benchmark test at client version: %s\n", fdb_get_client_version()); keys = generateKeys(numKeys, keySize); diff --git a/bindings/c/test/shim_lib_tester.cpp b/bindings/c/test/shim_lib_tester.cpp new file mode 100644 index 0000000000..c8813c45d0 --- /dev/null +++ b/bindings/c/test/shim_lib_tester.cpp @@ -0,0 +1,253 @@ +/* + * shim_lib_tester.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * A utility for testing shim library usage with various valid and invalid configurations + */ + +#include "fmt/core.h" +#include "test/fdb_api.hpp" +#include "SimpleOpt/SimpleOpt.h" +#include +#include +#include "foundationdb/fdb_c_shim.h" + +#undef ERROR +#define ERROR(name, number, description) enum { error_code_##name = number }; + +#include "flow/error_definitions.h" + +using namespace std::string_view_literals; + +namespace { + +enum TesterOptionId { + OPT_HELP, + OPT_CONNFILE, + OPT_LOCAL_CLIENT_LIBRARY, + OPT_EXTERNAL_CLIENT_LIBRARY, + OPT_EXTERNAL_CLIENT_DIRECTORY, + OPT_DISABLE_LOCAL_CLIENT, + OPT_API_VERSION +}; + +const int MIN_TESTABLE_API_VERSION = 400; + +CSimpleOpt::SOption TesterOptionDefs[] = // + { { OPT_HELP, "-h", SO_NONE }, + { OPT_HELP, "--help", SO_NONE }, + { OPT_CONNFILE, "-C", SO_REQ_SEP }, + { OPT_CONNFILE, "--cluster-file", SO_REQ_SEP }, + { OPT_LOCAL_CLIENT_LIBRARY, "--local-client-library", SO_REQ_SEP }, + { OPT_EXTERNAL_CLIENT_LIBRARY, "--external-client-library", SO_REQ_SEP }, + { OPT_EXTERNAL_CLIENT_DIRECTORY, "--external-client-dir", SO_REQ_SEP }, + { OPT_DISABLE_LOCAL_CLIENT, "--disable-local-client", SO_NONE }, + { OPT_API_VERSION, "--api-version", SO_REQ_SEP }, + SO_END_OF_OPTIONS }; + +class TesterOptions { +public: + // FDB API version, using the latest version by default + int apiVersion = FDB_API_VERSION; + std::string clusterFile; + std::string localClientLibrary; + std::string externalClientLibrary; + std::string externalClientDir; + bool disableLocalClient = false; +}; + +void printProgramUsage(const char* execName) { + printf("usage: %s [OPTIONS]\n" + "\n", + execName); + printf(" -C, --cluster-file FILE\n" + " The path of a file containing the connection string for the\n" + " FoundationDB cluster. The default is `fdb.cluster'\n" + " --local-client-library FILE\n" + " Path to the local client library.\n" + " --external-client-library FILE\n" + " Path to the external client library.\n" + " --external-client-dir DIR\n" + " Directory containing external client libraries.\n" + " --disable-local-client DIR\n" + " Disable the local client, i.e. 
use only external client libraries.\n" + " --api-version VERSION\n" + " Required FDB API version (default %d).\n" + " -h, --help Display this help and exit.\n", + FDB_API_VERSION); +} + +bool processIntOption(const std::string& optionName, const std::string& value, int minValue, int maxValue, int& res) { + char* endptr; + res = strtol(value.c_str(), &endptr, 10); + if (*endptr != '\0') { + fmt::print(stderr, "Invalid value {} for {}", value, optionName); + return false; + } + if (res < minValue || res > maxValue) { + fmt::print(stderr, "Value for {} must be between {} and {}", optionName, minValue, maxValue); + return false; + } + return true; +} + +bool processArg(TesterOptions& options, const CSimpleOpt& args) { + switch (args.OptionId()) { + case OPT_CONNFILE: + options.clusterFile = args.OptionArg(); + break; + case OPT_LOCAL_CLIENT_LIBRARY: + options.localClientLibrary = args.OptionArg(); + break; + case OPT_EXTERNAL_CLIENT_LIBRARY: + options.externalClientLibrary = args.OptionArg(); + break; + case OPT_EXTERNAL_CLIENT_DIRECTORY: + options.externalClientDir = args.OptionArg(); + break; + case OPT_DISABLE_LOCAL_CLIENT: + options.disableLocalClient = true; + break; + case OPT_API_VERSION: + if (!processIntOption( + args.OptionText(), args.OptionArg(), MIN_TESTABLE_API_VERSION, FDB_API_VERSION, options.apiVersion)) { + return false; + } + break; + } + return true; +} + +bool parseArgs(TesterOptions& options, int argc, char** argv) { + // declare our options parser, pass in the arguments from main + // as well as our array of valid options. + CSimpleOpt args(argc, argv, TesterOptionDefs); + + // while there are arguments left to process + while (args.Next()) { + if (args.LastError() == SO_SUCCESS) { + if (args.OptionId() == OPT_HELP) { + printProgramUsage(argv[0]); + return false; + } + if (!processArg(options, args)) { + return false; + } + } else { + fmt::print(stderr, "ERROR: Invalid argument: {}\n", args.OptionText()); + printProgramUsage(argv[0]); + return false; + } + } + return true; +} + +void fdb_check(fdb::Error e, std::string_view msg, fdb::Error::CodeType expectedError = error_code_success) { + if (e.code()) { + fmt::print(stderr, "{}, Error: {}({})\n", msg, e.code(), e.what()); + std::abort(); + } +} + +void applyNetworkOptions(TesterOptions& options) { + if (!options.externalClientLibrary.empty()) { + fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT); + fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_LIBRARY, + options.externalClientLibrary); + } else if (!options.externalClientDir.empty()) { + if (options.disableLocalClient) { + fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT); + } + fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_DIRECTORY, options.externalClientDir); + } else { + if (options.disableLocalClient) { + fmt::print(stderr, "Invalid options: Cannot disable local client if no external library is provided"); + exit(1); + } + } +} + +void testBasicApi(const TesterOptions& options) { + fdb::Database db(options.clusterFile); + fdb::Transaction tx = db.createTransaction(); + while (true) { + try { + // Set a time out to avoid long delays when testing invalid configurations + tx.setOption(FDB_TR_OPTION_TIMEOUT, 1000); + tx.set(fdb::toBytesRef("key1"sv), fdb::toBytesRef("val1"sv)); + fdb_check(tx.commit().blockUntilReady(), "Wait on commit failed"); + break; + } catch (const fdb::Error& err) { + if (err.code() == error_code_timed_out) { + exit(1); + } + auto 
onErrorFuture = tx.onError(err); + fdb_check(onErrorFuture.blockUntilReady(), "Wait on onError failed"); + fdb_check(onErrorFuture.error(), "onError failed"); + } + } +} + +void test710Api(const TesterOptions& options) { + fdb::Database db(options.clusterFile); + try { + db.openTenant(fdb::toBytesRef("not_existing_tenant"sv)); + } catch (const fdb::Error& err) { + fdb_check(err, "Tenant not found expected", error_code_tenant_not_found); + } +} + +} // namespace + +int main(int argc, char** argv) { + int retCode = 0; + try { + TesterOptions options; + if (!parseArgs(options, argc, argv)) { + return 1; + } + + if (!options.localClientLibrary.empty()) { + // Must be called before the first FDB API call + fdb_shim_set_local_client_library_path(options.localClientLibrary.c_str()); + } + + fdb::selectApiVersionCapped(options.apiVersion); + applyNetworkOptions(options); + fdb::network::setup(); + + std::thread network_thread{ &fdb::network::run }; + + // Try calling some basic functionality that is available + // in all recent API versions + testBasicApi(options); + + // Try calling 710-specific API. This enables testing what + // happens if a library is missing a function + test710Api(options); + + fdb_check(fdb::network::stop(), "Stop network failed"); + network_thread.join(); + } catch (const std::runtime_error& err) { + fmt::print(stderr, "runtime error caught: {}\n", err.what()); + retCode = 1; + } + return retCode; +} diff --git a/bindings/c/test/txn_size_test.c b/bindings/c/test/txn_size_test.c index 97081f24a6..57c74a9bca 100644 --- a/bindings/c/test/txn_size_test.c +++ b/bindings/c/test/txn_size_test.c @@ -97,7 +97,7 @@ void runTests(struct ResultSet* rs) { int main(int argc, char** argv) { srand(time(NULL)); struct ResultSet* rs = newResultSet(); - checkError(fdb_select_api_version(720), "select API version", rs); + checkError(fdb_select_api_version(FDB_API_VERSION), "select API version", rs); printf("Running performance test at client version: %s\n", fdb_get_client_version()); keys = generateKeys(numKeys, KEY_SIZE); diff --git a/bindings/c/test/unit/disconnected_timeout_tests.cpp b/bindings/c/test/unit/disconnected_timeout_tests.cpp index b1c6b72730..7d006faa23 100644 --- a/bindings/c/test/unit/disconnected_timeout_tests.cpp +++ b/bindings/c/test/unit/disconnected_timeout_tests.cpp @@ -255,7 +255,7 @@ int main(int argc, char** argv) { << std::endl; return 1; } - fdb_check(fdb_select_api_version(720)); + fdb_check(fdb_select_api_version(FDB_API_VERSION)); if (argc >= 3) { std::string externalClientLibrary = argv[2]; if (externalClientLibrary.substr(0, 2) != "--") { diff --git a/bindings/c/test/unit/fdb_api.cpp b/bindings/c/test/unit/fdb_api.cpp index d3c1dec30d..1376fc77c7 100644 --- a/bindings/c/test/unit/fdb_api.cpp +++ b/bindings/c/test/unit/fdb_api.cpp @@ -84,6 +84,12 @@ void Future::cancel() { return fdb_future_get_keyrange_array(future_, out_keyranges, out_count); } +// GranuleSummaryArrayFuture + +[[nodiscard]] fdb_error_t GranuleSummaryArrayFuture::get(const FDBGranuleSummary** out_summaries, int* out_count) { + return fdb_future_get_granule_summary_array(future_, out_summaries, out_count); +} + // KeyValueArrayFuture [[nodiscard]] fdb_error_t KeyValueArrayFuture::get(const FDBKeyValue** out_kv, int* out_count, fdb_bool_t* out_more) { @@ -356,10 +362,17 @@ fdb_error_t Transaction::add_conflict_range(std::string_view begin_key, tr_, (const uint8_t*)begin_key.data(), begin_key.size(), (const uint8_t*)end_key.data(), end_key.size(), type); } -KeyRangeArrayFuture 
Transaction::get_blob_granule_ranges(std::string_view begin_key, std::string_view end_key) { - return KeyRangeArrayFuture(fdb_transaction_get_blob_granule_ranges( - tr_, (const uint8_t*)begin_key.data(), begin_key.size(), (const uint8_t*)end_key.data(), end_key.size())); +KeyRangeArrayFuture Transaction::get_blob_granule_ranges(std::string_view begin_key, + std::string_view end_key, + int rangeLimit) { + return KeyRangeArrayFuture(fdb_transaction_get_blob_granule_ranges(tr_, + (const uint8_t*)begin_key.data(), + begin_key.size(), + (const uint8_t*)end_key.data(), + end_key.size(), + rangeLimit)); } + KeyValueArrayResult Transaction::read_blob_granules(std::string_view begin_key, std::string_view end_key, int64_t beginVersion, @@ -375,4 +388,17 @@ KeyValueArrayResult Transaction::read_blob_granules(std::string_view begin_key, granuleContext)); } +GranuleSummaryArrayFuture Transaction::summarize_blob_granules(std::string_view begin_key, + std::string_view end_key, + int64_t summary_version, + int rangeLimit) { + return GranuleSummaryArrayFuture(fdb_transaction_summarize_blob_granules(tr_, + (const uint8_t*)begin_key.data(), + begin_key.size(), + (const uint8_t*)end_key.data(), + end_key.size(), + summary_version, + rangeLimit)); +} + } // namespace fdb diff --git a/bindings/c/test/unit/fdb_api.hpp b/bindings/c/test/unit/fdb_api.hpp index 7d44a30a9a..137083a90c 100644 --- a/bindings/c/test/unit/fdb_api.hpp +++ b/bindings/c/test/unit/fdb_api.hpp @@ -161,6 +161,18 @@ private: KeyRangeArrayFuture(FDBFuture* f) : Future(f) {} }; +class GranuleSummaryArrayFuture : public Future { +public: + // Call this function instead of fdb_future_get_granule_summary_array when using + // the GranuleSummaryArrayFuture type. It's behavior is identical to + // fdb_future_get_granule_summary_array. + fdb_error_t get(const FDBGranuleSummary** out_summaries, int* out_count); + +private: + friend class Transaction; + GranuleSummaryArrayFuture(FDBFuture* f) : Future(f) {} +}; + class EmptyFuture : public Future { private: friend class Transaction; @@ -348,12 +360,16 @@ public: // Wrapper around fdb_transaction_add_conflict_range. 
fdb_error_t add_conflict_range(std::string_view begin_key, std::string_view end_key, FDBConflictRangeType type); - KeyRangeArrayFuture get_blob_granule_ranges(std::string_view begin_key, std::string_view end_key); + KeyRangeArrayFuture get_blob_granule_ranges(std::string_view begin_key, std::string_view end_key, int rangeLimit); KeyValueArrayResult read_blob_granules(std::string_view begin_key, std::string_view end_key, int64_t beginVersion, int64_t endVersion, FDBReadBlobGranuleContext granule_context); + GranuleSummaryArrayFuture summarize_blob_granules(std::string_view begin_key, + std::string_view end_key, + int64_t summaryVersion, + int rangeLimit); private: FDBTransaction* tr_; diff --git a/bindings/c/test/unit/setup_tests.cpp b/bindings/c/test/unit/setup_tests.cpp index 6ac65b7850..2e96eb00b9 100644 --- a/bindings/c/test/unit/setup_tests.cpp +++ b/bindings/c/test/unit/setup_tests.cpp @@ -42,13 +42,13 @@ TEST_CASE("setup") { CHECK(err); // Select current API version - fdb_check(fdb_select_api_version(720)); + fdb_check(fdb_select_api_version(FDB_API_VERSION)); // Error to call again after a successful return - err = fdb_select_api_version(720); + err = fdb_select_api_version(FDB_API_VERSION); CHECK(err); - CHECK(fdb_get_max_api_version() >= 720); + CHECK(fdb_get_max_api_version() >= FDB_API_VERSION); fdb_check(fdb_setup_network()); // Calling a second time should fail diff --git a/bindings/c/test/unit/trace_partial_file_suffix_test.cpp b/bindings/c/test/unit/trace_partial_file_suffix_test.cpp index 810cc066fd..73dc8132a5 100644 --- a/bindings/c/test/unit/trace_partial_file_suffix_test.cpp +++ b/bindings/c/test/unit/trace_partial_file_suffix_test.cpp @@ -53,7 +53,7 @@ bool file_exists(const char* path) { } int main(int argc, char** argv) { - fdb_check(fdb_select_api_version(720)); + fdb_check(fdb_select_api_version(FDB_API_VERSION)); std::string file_identifier = "trace_partial_file_suffix_test" + std::to_string(std::random_device{}()); std::string trace_partial_file_suffix = ".tmp"; diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp index 60af78c885..2d8dc2456f 100644 --- a/bindings/c/test/unit/unit_tests.cpp +++ b/bindings/c/test/unit/unit_tests.cpp @@ -941,13 +941,13 @@ static Value dataOfRecord(const int i) { return Value(format("data-of-record-%08d", i)); } static std::string indexEntryKey(const int i) { - return Tuple().append(StringRef(prefix)).append(INDEX).append(indexKey(i)).append(primaryKey(i)).pack().toString(); + return Tuple::makeTuple(prefix, INDEX, indexKey(i), primaryKey(i)).pack().toString(); } static std::string recordKey(const int i, const int split) { - return Tuple().append(prefix).append(RECORD).append(primaryKey(i)).append(split).pack().toString(); + return Tuple::makeTuple(prefix, RECORD, primaryKey(i), split).pack().toString(); } static std::string recordValue(const int i, const int split) { - return Tuple().append(dataOfRecord(i)).append(split).pack().toString(); + return Tuple::makeTuple(dataOfRecord(i), split).pack().toString(); } const static int SPLIT_SIZE = 3; @@ -993,13 +993,8 @@ GetMappedRangeResult getMappedIndexEntries(int beginId, fdb::Transaction& tr, int matchIndex, bool allMissing) { - std::string mapper = Tuple() - .append(prefix) - .append(RECORD) - .append(allMissing ? "{K[2]}"_sr : "{K[3]}"_sr) - .append("{...}"_sr) - .pack() - .toString(); + std::string mapper = + Tuple::makeTuple(prefix, RECORD, (allMissing ? 
"{K[2]}"_sr : "{K[3]}"_sr), "{...}"_sr).pack().toString(); return getMappedIndexEntries(beginId, endId, tr, mapper, matchIndex); } @@ -1037,7 +1032,7 @@ TEST_CASE("tuple_support_versionstamp") { // a random 12 bytes long StringRef as a versionstamp StringRef str = "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x11\x12"_sr; Versionstamp vs(str); - const Tuple t = Tuple().append(prefix).append(RECORD).appendVersionstamp(vs).append("{K[3]}"_sr).append("{...}"_sr); + const Tuple t = Tuple::makeTuple(prefix, RECORD, vs, "{K[3]}"_sr, "{...}"_sr); ASSERT(t.getVersionstamp(2) == vs); // verify the round-way pack-unpack path for a Tuple containing a versionstamp @@ -1181,7 +1176,7 @@ TEST_CASE("fdb_transaction_get_mapped_range_missing_all_secondary") { } TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_serializable") { - std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).pack().toString(); + std::string mapper = Tuple::makeTuple(prefix, RECORD, "{K[3]}"_sr).pack().toString(); fdb::Transaction tr(db); auto result = get_mapped_range( tr, @@ -1200,7 +1195,7 @@ TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_serializable") { } TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_ryw_enable") { - std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).pack().toString(); + std::string mapper = Tuple::makeTuple(prefix, RECORD, "{K[3]}"_sr).pack().toString(); fdb::Transaction tr(db); fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0)); // Not disable RYW auto result = get_mapped_range( @@ -2766,6 +2761,7 @@ TEST_CASE("Blob Granule Functions") { auto confValue = get_value("\xff/conf/blob_granules_enabled", /* snapshot */ false, { FDB_TR_OPTION_READ_SYSTEM_KEYS }); if (!confValue.has_value() || confValue.value() != "1") { + // std::cout << "skipping blob granule test" << std::endl; return; } @@ -2822,7 +2818,6 @@ TEST_CASE("Blob Granule Functions") { fdb::KeyValueArrayResult r = tr.read_blob_granules(key("bg"), key("bh"), originalReadVersion, -2, granuleContext); fdb_error_t err = r.get(&out_kv, &out_count, &out_more); - ; if (err && err != 2037 /* blob_granule_not_materialized */) { fdb::EmptyFuture f2 = tr.on_error(err); fdb_check(wait_future(f2)); @@ -2858,7 +2853,7 @@ TEST_CASE("Blob Granule Functions") { // test ranges while (1) { - fdb::KeyRangeArrayFuture f = tr.get_blob_granule_ranges(key("bg"), key("bh")); + fdb::KeyRangeArrayFuture f = tr.get_blob_granule_ranges(key("bg"), key("bh"), 1000); fdb_error_t err = wait_future(f); if (err) { fdb::EmptyFuture f2 = tr.on_error(err); @@ -2870,6 +2865,10 @@ TEST_CASE("Blob Granule Functions") { int out_count; fdb_check(f.get(&out_kr, &out_count)); + CHECK(std::string((const char*)out_kr[0].begin_key, out_kr[0].begin_key_length) <= key("bg")); + CHECK(std::string((const char*)out_kr[out_count - 1].end_key, out_kr[out_count - 1].end_key_length) >= + key("bh")); + CHECK(out_count >= 1); // check key ranges are in order for (int i = 0; i < out_count; i++) { @@ -2877,9 +2876,9 @@ TEST_CASE("Blob Granule Functions") { CHECK(std::string((const char*)out_kr[i].begin_key, out_kr[i].begin_key_length) < std::string((const char*)out_kr[i].end_key, out_kr[i].end_key_length)); } - // Ranges themselves are sorted + // Ranges themselves are sorted and contiguous for (int i = 0; i < out_count - 1; i++) { - CHECK(std::string((const char*)out_kr[i].end_key, out_kr[i].end_key_length) <= + CHECK(std::string((const char*)out_kr[i].end_key, out_kr[i].end_key_length) == std::string((const 
char*)out_kr[i + 1].begin_key, out_kr[i + 1].begin_key_length)); } @@ -2905,7 +2904,6 @@ TEST_CASE("Blob Granule Functions") { fdb_check(wait_future(waitPurgeFuture)); // re-read again at the purge version to make sure it is still valid - while (1) { fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0)); fdb::KeyValueArrayResult r = @@ -2922,6 +2920,56 @@ TEST_CASE("Blob Granule Functions") { tr.reset(); break; } + + // check granule summary + while (1) { + fdb::GranuleSummaryArrayFuture f = tr.summarize_blob_granules(key("bg"), key("bh"), originalReadVersion, 100); + fdb_error_t err = wait_future(f); + if (err) { + fdb::EmptyFuture f2 = tr.on_error(err); + fdb_check(wait_future(f2)); + continue; + } + + const FDBGranuleSummary* out_summaries; + int out_count; + fdb_check(f.get(&out_summaries, &out_count)); + + CHECK(out_count >= 1); + CHECK(out_count <= 100); + + // check that ranges cover requested range + CHECK(std::string((const char*)out_summaries[0].key_range.begin_key, + out_summaries[0].key_range.begin_key_length) <= key("bg")); + CHECK(std::string((const char*)out_summaries[out_count - 1].key_range.end_key, + out_summaries[out_count - 1].key_range.end_key_length) >= key("bh")); + + // check key ranges are in order + for (int i = 0; i < out_count; i++) { + // key range start < end + CHECK(std::string((const char*)out_summaries[i].key_range.begin_key, + out_summaries[i].key_range.begin_key_length) < + std::string((const char*)out_summaries[i].key_range.end_key, + out_summaries[i].key_range.end_key_length)); + // sanity check versions and sizes + CHECK(out_summaries[i].snapshot_version <= originalReadVersion); + CHECK(out_summaries[i].delta_version <= originalReadVersion); + CHECK(out_summaries[i].snapshot_version <= out_summaries[i].delta_version); + CHECK(out_summaries[i].snapshot_size > 0); + CHECK(out_summaries[i].delta_size >= 0); + } + + // Ranges themselves are sorted and contiguous + for (int i = 0; i < out_count - 1; i++) { + CHECK(std::string((const char*)out_summaries[i].key_range.end_key, + out_summaries[i].key_range.end_key_length) == + std::string((const char*)out_summaries[i + 1].key_range.begin_key, + out_summaries[i + 1].key_range.begin_key_length)); + } + + tr.reset(); + break; + } } int main(int argc, char** argv) { @@ -2931,7 +2979,7 @@ int main(int argc, char** argv) { << std::endl; return 1; } - fdb_check(fdb_select_api_version(720)); + fdb_check(fdb_select_api_version(FDB_API_VERSION)); if (argc >= 4) { std::string externalClientLibrary = argv[3]; if (externalClientLibrary.substr(0, 2) != "--") { diff --git a/bindings/c/test/workloads/SimpleWorkload.cpp b/bindings/c/test/workloads/SimpleWorkload.cpp index 95be0ddd59..4dd7a800fe 100644 --- a/bindings/c/test/workloads/SimpleWorkload.cpp +++ b/bindings/c/test/workloads/SimpleWorkload.cpp @@ -266,7 +266,7 @@ struct SimpleWorkload final : FDBWorkload { insertsPerTx = context->getOption("insertsPerTx", 100ul); opsPerTx = context->getOption("opsPerTx", 100ul); runFor = context->getOption("runFor", 10.0); - auto err = fdb_select_api_version(720); + auto err = fdb_select_api_version(FDB_API_VERSION); if (err) { context->trace( FDBSeverity::Info, "SelectAPIVersionFailed", { { "Error", std::string(fdb_get_error(err)) } }); diff --git a/bindings/flow/DirectoryLayer.actor.cpp b/bindings/flow/DirectoryLayer.actor.cpp index 056b203a2e..13d3970ed3 100644 --- a/bindings/flow/DirectoryLayer.actor.cpp +++ b/bindings/flow/DirectoryLayer.actor.cpp @@ -23,17 +23,17 @@ namespace FDB { const uint8_t 
DirectoryLayer::LITTLE_ENDIAN_LONG_ONE[8] = { 1, 0, 0, 0, 0, 0, 0, 0 }; -const StringRef DirectoryLayer::HIGH_CONTENTION_KEY = LiteralStringRef("hca"); -const StringRef DirectoryLayer::LAYER_KEY = LiteralStringRef("layer"); -const StringRef DirectoryLayer::VERSION_KEY = LiteralStringRef("version"); +const StringRef DirectoryLayer::HIGH_CONTENTION_KEY = "hca"_sr; +const StringRef DirectoryLayer::LAYER_KEY = "layer"_sr; +const StringRef DirectoryLayer::VERSION_KEY = "version"_sr; const int64_t DirectoryLayer::SUB_DIR_KEY = 0; const uint32_t DirectoryLayer::VERSION[3] = { 1, 0, 0 }; -const StringRef DirectoryLayer::DEFAULT_NODE_SUBSPACE_PREFIX = LiteralStringRef("\xfe"); +const StringRef DirectoryLayer::DEFAULT_NODE_SUBSPACE_PREFIX = "\xfe"_sr; const Subspace DirectoryLayer::DEFAULT_NODE_SUBSPACE = Subspace(DEFAULT_NODE_SUBSPACE_PREFIX); const Subspace DirectoryLayer::DEFAULT_CONTENT_SUBSPACE = Subspace(); -const StringRef DirectoryLayer::PARTITION_LAYER = LiteralStringRef("partition"); +const StringRef DirectoryLayer::PARTITION_LAYER = "partition"_sr; DirectoryLayer::DirectoryLayer(Subspace nodeSubspace, Subspace contentSubspace, bool allowManualPrefixes) : rootNode(nodeSubspace.get(nodeSubspace.key())), nodeSubspace(nodeSubspace), contentSubspace(contentSubspace), diff --git a/bindings/flow/FDBLoanerTypes.h b/bindings/flow/FDBLoanerTypes.h index 01000f6e27..ddd9a577b5 100644 --- a/bindings/flow/FDBLoanerTypes.h +++ b/bindings/flow/FDBLoanerTypes.h @@ -31,7 +31,7 @@ typedef Standalone Key; typedef Standalone Value; inline Key keyAfter(const KeyRef& key) { - if (key == LiteralStringRef("\xff\xff")) + if (key == "\xff\xff"_sr) return key; Standalone r; @@ -43,7 +43,7 @@ inline Key keyAfter(const KeyRef& key) { } inline KeyRef keyAfter(const KeyRef& key, Arena& arena) { - if (key == LiteralStringRef("\xff\xff")) + if (key == "\xff\xff"_sr) return key; uint8_t* t = new (arena) uint8_t[key.size() + 1]; memcpy(t, key.begin(), key.size()); diff --git a/bindings/flow/Tuple.h b/bindings/flow/Tuple.h index 4d903e5bb8..a7feab9419 100644 --- a/bindings/flow/Tuple.h +++ b/bindings/flow/Tuple.h @@ -63,7 +63,9 @@ struct Tuple { Tuple& appendNull(); Tuple& appendVersionstamp(Versionstamp const&); - StringRef pack() const { return StringRef(data.begin(), data.size()); } + Standalone pack() const { + return Standalone(StringRef(data.begin(), data.size()), data.arena()); + } template Tuple& operator<<(T const& t) { diff --git a/bindings/flow/fdb_flow.actor.cpp b/bindings/flow/fdb_flow.actor.cpp index 72ee49dcf4..d6e1431c77 100644 --- a/bindings/flow/fdb_flow.actor.cpp +++ b/bindings/flow/fdb_flow.actor.cpp @@ -38,7 +38,7 @@ THREAD_FUNC networkThread(void* fdb) { } ACTOR Future _test() { - API* fdb = FDB::API::selectAPIVersion(720); + API* fdb = FDB::API::selectAPIVersion(FDB_API_VERSION); auto db = fdb->createDatabase(); state Reference tr = db->createTransaction(); @@ -63,15 +63,14 @@ ACTOR Future _test() { // wait( waitForAllReady( versions ) ); printf("Elapsed: %lf\n", timer_monotonic() - starttime); - tr->set(LiteralStringRef("foo"), LiteralStringRef("bar")); + tr->set("foo"_sr, "bar"_sr); - Optional> v = wait(tr->get(LiteralStringRef("foo"))); + Optional> v = wait(tr->get("foo"_sr)); if (v.present()) { printf("%s\n", v.get().toString().c_str()); } - FDBStandalone r = - wait(tr->getRange(KeyRangeRef(LiteralStringRef("a"), LiteralStringRef("z")), 100)); + FDBStandalone r = wait(tr->getRange(KeyRangeRef("a"_sr, "z"_sr), 100)); for (auto kv : r) { printf("%s is %s\n", kv.key.toString().c_str(), 
kv.value.toString().c_str()); @@ -82,7 +81,7 @@ ACTOR Future _test() { } void fdb_flow_test() { - API* fdb = FDB::API::selectAPIVersion(720); + API* fdb = FDB::API::selectAPIVersion(FDB_API_VERSION); fdb->setupNetwork(); startThread(networkThread, fdb); diff --git a/bindings/flow/tester/DirectoryTester.actor.cpp b/bindings/flow/tester/DirectoryTester.actor.cpp index b21da1097c..a8fabdca4c 100644 --- a/bindings/flow/tester/DirectoryTester.actor.cpp +++ b/bindings/flow/tester/DirectoryTester.actor.cpp @@ -545,11 +545,10 @@ struct DirectoryLogDirectoryFunc : InstructionFunc { pathTuple.append(p, true); } - instruction->tr->set(logSubspace.pack(LiteralStringRef("path"), true), pathTuple.pack()); - instruction->tr->set(logSubspace.pack(LiteralStringRef("layer"), true), - Tuple().append(directory->getLayer()).pack()); - instruction->tr->set(logSubspace.pack(LiteralStringRef("exists"), true), Tuple().append(exists ? 1 : 0).pack()); - instruction->tr->set(logSubspace.pack(LiteralStringRef("children"), true), childrenTuple.pack()); + instruction->tr->set(logSubspace.pack("path"_sr, true), pathTuple.pack()); + instruction->tr->set(logSubspace.pack("layer"_sr, true), Tuple().append(directory->getLayer()).pack()); + instruction->tr->set(logSubspace.pack("exists"_sr, true), Tuple().append(exists ? 1 : 0).pack()); + instruction->tr->set(logSubspace.pack("children"_sr, true), childrenTuple.pack()); return Void(); } diff --git a/bindings/flow/tester/Tester.actor.cpp b/bindings/flow/tester/Tester.actor.cpp index 941e1b97b2..f300127e5d 100644 --- a/bindings/flow/tester/Tester.actor.cpp +++ b/bindings/flow/tester/Tester.actor.cpp @@ -470,12 +470,12 @@ ACTOR Future> waitForVoid(Future f) { try { wait(f); Tuple t; - t.append(LiteralStringRef("RESULT_NOT_PRESENT")); + t.append("RESULT_NOT_PRESENT"_sr); return t.pack(); } catch (Error& e) { // printf("FDBError1:%d\n", e.code()); Tuple t; - t.append(LiteralStringRef("ERROR")); + t.append("ERROR"_sr); t.append(format("%d", e.code())); // pack above as error string into another tuple Tuple ret; @@ -493,7 +493,7 @@ ACTOR Future> waitForValue(Future> f } catch (Error& e) { // printf("FDBError2:%d\n", e.code()); Tuple t; - t.append(LiteralStringRef("ERROR")); + t.append("ERROR"_sr); t.append(format("%d", e.code())); // pack above as error string into another tuple Tuple ret; @@ -509,7 +509,7 @@ ACTOR Future> waitForValue(Future> waitForValue(Future> getKey(Future> f, Stan } catch (Error& e) { // printf("FDBError4:%d\n", e.code()); Tuple t; - t.append(LiteralStringRef("ERROR")); + t.append("ERROR"_sr); t.append(format("%d", e.code())); // pack above as error string into another tuple Tuple ret; @@ -670,7 +670,7 @@ struct GetEstimatedRangeSize : InstructionFunc { state Standalone endKey = Tuple::unpack(s2).getString(0); Future fsize = instruction->tr->getEstimatedRangeSizeBytes(KeyRangeRef(beginKey, endKey)); int64_t size = wait(fsize); - data->stack.pushTuple(LiteralStringRef("GOT_ESTIMATED_RANGE_SIZE")); + data->stack.pushTuple("GOT_ESTIMATED_RANGE_SIZE"_sr); return Void(); } @@ -698,7 +698,7 @@ struct GetRangeSplitPoints : InstructionFunc { Future>> fsplitPoints = instruction->tr->getRangeSplitPoints(KeyRangeRef(beginKey, endKey), chunkSize); FDBStandalone> splitPoints = wait(fsplitPoints); - data->stack.pushTuple(LiteralStringRef("GOT_RANGE_SPLIT_POINTS")); + data->stack.pushTuple("GOT_RANGE_SPLIT_POINTS"_sr); return Void(); } @@ -743,7 +743,7 @@ struct GetReadVersionFunc : InstructionFunc { ACTOR static Future call(Reference data, Reference instruction) { Version v 
= wait(instruction->tr->getReadVersion()); data->lastVersion = v; - data->stack.pushTuple(LiteralStringRef("GOT_READ_VERSION")); + data->stack.pushTuple("GOT_READ_VERSION"_sr); return Void(); } }; @@ -767,7 +767,7 @@ struct GetCommittedVersionFunc : InstructionFunc { static Future call(Reference const& data, Reference const& instruction) { data->lastVersion = instruction->tr->getCommittedVersion(); - data->stack.pushTuple(LiteralStringRef("GOT_COMMITTED_VERSION")); + data->stack.pushTuple("GOT_COMMITTED_VERSION"_sr); return Void(); } }; @@ -781,7 +781,7 @@ struct GetApproximateSizeFunc : InstructionFunc { ACTOR static Future call(Reference data, Reference instruction) { int64_t _ = wait(instruction->tr->getApproximateSize()); (void)_; // disable unused variable warning - data->stack.pushTuple(LiteralStringRef("GOT_APPROXIMATE_SIZE")); + data->stack.pushTuple("GOT_APPROXIMATE_SIZE"_sr); return Void(); } }; @@ -1485,7 +1485,7 @@ struct ReadConflictKeyFunc : InstructionFunc { // printf("=========READ_CONFLICT_KEY:%s\n", printable(key).c_str()); instruction->tr->addReadConflictKey(key); - data->stack.pushTuple(LiteralStringRef("SET_CONFLICT_KEY")); + data->stack.pushTuple("SET_CONFLICT_KEY"_sr); return Void(); } }; @@ -1506,7 +1506,7 @@ struct WriteConflictKeyFunc : InstructionFunc { // printf("=========WRITE_CONFLICT_KEY:%s\n", printable(key).c_str()); instruction->tr->addWriteConflictKey(key); - data->stack.pushTuple(LiteralStringRef("SET_CONFLICT_KEY")); + data->stack.pushTuple("SET_CONFLICT_KEY"_sr); return Void(); } }; @@ -1529,7 +1529,7 @@ struct ReadConflictRangeFunc : InstructionFunc { // printf("=========READ_CONFLICT_RANGE:%s:%s\n", printable(begin).c_str(), printable(end).c_str()); instruction->tr->addReadConflictRange(KeyRange(KeyRangeRef(begin, end))); - data->stack.pushTuple(LiteralStringRef("SET_CONFLICT_RANGE")); + data->stack.pushTuple("SET_CONFLICT_RANGE"_sr); return Void(); } }; @@ -1553,7 +1553,7 @@ struct WriteConflictRangeFunc : InstructionFunc { // printf("=========WRITE_CONFLICT_RANGE:%s:%s\n", printable(begin).c_str(), printable(end).c_str()); instruction->tr->addWriteConflictRange(KeyRange(KeyRangeRef(begin, end))); - data->stack.pushTuple(LiteralStringRef("SET_CONFLICT_RANGE")); + data->stack.pushTuple("SET_CONFLICT_RANGE"_sr); return Void(); } }; @@ -1643,10 +1643,8 @@ struct UnitTestsFunc : InstructionFunc { Optional(StringRef((const uint8_t*)&locationCacheSize, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_MAX_WATCHES, Optional(StringRef((const uint8_t*)&maxWatches, 8))); - data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_DATACENTER_ID, - Optional(LiteralStringRef("dc_id"))); - data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_MACHINE_ID, - Optional(LiteralStringRef("machine_id"))); + data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_DATACENTER_ID, Optional("dc_id"_sr)); + data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_MACHINE_ID, Optional("machine_id"_sr)); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_ENABLE); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_DISABLE); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_LOGGING_MAX_FIELD_LENGTH, @@ -1685,13 +1683,13 @@ struct UnitTestsFunc : InstructionFunc { Optional(StringRef((const uint8_t*)&maxRetryDelay, 8))); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_USED_DURING_COMMIT_PROTECTION_DISABLE); 
tr->setOption(FDBTransactionOption::FDB_TR_OPTION_TRANSACTION_LOGGING_ENABLE, - Optional(LiteralStringRef("my_transaction"))); + Optional("my_transaction"_sr)); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_READ_LOCK_AWARE); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_LOCK_AWARE); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_INCLUDE_PORT_IN_ADDRESS); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_REPORT_CONFLICTING_KEYS); - Optional> _ = wait(tr->get(LiteralStringRef("\xff"))); + Optional> _ = wait(tr->get("\xff"_sr)); tr->cancel(); return Void(); @@ -1724,13 +1722,13 @@ ACTOR static Future doInstructions(Reference data) { Tuple opTuple = Tuple::unpack(data->instructions[idx].value); state Standalone op = opTuple.getString(0); - state bool isDatabase = op.endsWith(LiteralStringRef("_DATABASE")); - state bool isSnapshot = op.endsWith(LiteralStringRef("_SNAPSHOT")); - state bool isDirectory = op.startsWith(LiteralStringRef("DIRECTORY_")); + state bool isDatabase = op.endsWith("_DATABASE"_sr); + state bool isSnapshot = op.endsWith("_SNAPSHOT"_sr); + state bool isDirectory = op.startsWith("DIRECTORY_"_sr); try { if (LOG_INSTRUCTIONS) { - if (op != LiteralStringRef("SWAP") && op != LiteralStringRef("PUSH")) { + if (op != "SWAP"_sr && op != "PUSH"_sr) { printf("%zu. %s\n", idx, tupleToString(opTuple).c_str()); fflush(stdout); } @@ -1773,7 +1771,7 @@ ACTOR static Future doInstructions(Reference data) { if (opsThatCreateDirectories.count(op.toString())) { data->directoryData.directoryList.push_back(DirectoryOrSubspace()); } - data->stack.pushTuple(LiteralStringRef("DIRECTORY_ERROR")); + data->stack.pushTuple("DIRECTORY_ERROR"_sr); } else { data->stack.pushError(e.code()); } @@ -1873,7 +1871,7 @@ ACTOR void _test_versionstamp() { try { g_network = newNet2(TLSConfig()); - API* fdb = FDB::API::selectAPIVersion(720); + API* fdb = FDB::API::selectAPIVersion(FDB_API_VERSION); fdb->setupNetwork(); startThread(networkThread, fdb); @@ -1883,15 +1881,14 @@ ACTOR void _test_versionstamp() { state Future> ftrVersion = tr->getVersionstamp(); - tr->atomicOp(LiteralStringRef("foo"), - LiteralStringRef("blahblahbl\x00\x00\x00\x00"), - FDBMutationType::FDB_MUTATION_TYPE_SET_VERSIONSTAMPED_VALUE); + tr->atomicOp( + "foo"_sr, "blahblahbl\x00\x00\x00\x00"_sr, FDBMutationType::FDB_MUTATION_TYPE_SET_VERSIONSTAMPED_VALUE); wait(tr->commit()); // should use retry loop tr->reset(); - Optional> optionalDbVersion = wait(tr->get(LiteralStringRef("foo"))); + Optional> optionalDbVersion = wait(tr->get("foo"_sr)); state FDBStandalone dbVersion = optionalDbVersion.get(); FDBStandalone trVersion = wait(ftrVersion); diff --git a/bindings/flow/tester/Tester.actor.h b/bindings/flow/tester/Tester.actor.h index 63fc9fe9a3..f42f8d74ee 100644 --- a/bindings/flow/tester/Tester.actor.h +++ b/bindings/flow/tester/Tester.actor.h @@ -71,7 +71,7 @@ struct FlowTesterStack { void pushError(int errorCode) { FDB::Tuple t; - t.append(LiteralStringRef("ERROR")); + t.append("ERROR"_sr); t.append(format("%d", errorCode)); // pack above as error string into another tuple pushTuple(t.pack().toString()); diff --git a/bindings/go/src/fdb/fdb.go b/bindings/go/src/fdb/fdb.go index 7800dd9bf5..e308049be0 100644 --- a/bindings/go/src/fdb/fdb.go +++ b/bindings/go/src/fdb/fdb.go @@ -128,7 +128,7 @@ func APIVersion(version int) error { return errAPIVersionAlreadySet } - if version < 200 || version > 720 { + if version < 200 || version > headerVersion { return errAPIVersionNotSupported } diff --git a/bindings/go/src/fdb/fdb_test.go 
b/bindings/go/src/fdb/fdb_test.go index 976a3ec9d0..00b3f41304 100644 --- a/bindings/go/src/fdb/fdb_test.go +++ b/bindings/go/src/fdb/fdb_test.go @@ -29,10 +29,12 @@ import ( "github.com/apple/foundationdb/bindings/go/src/fdb" ) +const API_VERSION int = 720 + func ExampleOpenDefault() { var e error - e = fdb.APIVersion(720) + e = fdb.APIVersion(API_VERSION) if e != nil { fmt.Printf("Unable to set API version: %v\n", e) return @@ -52,7 +54,7 @@ func ExampleOpenDefault() { } func TestVersionstamp(t *testing.T) { - fdb.MustAPIVersion(720) + fdb.MustAPIVersion(API_VERSION) db := fdb.MustOpenDefault() setVs := func(t fdb.Transactor, key fdb.Key) (fdb.FutureKey, error) { @@ -98,7 +100,7 @@ func TestVersionstamp(t *testing.T) { } func TestReadTransactionOptions(t *testing.T) { - fdb.MustAPIVersion(720) + fdb.MustAPIVersion(API_VERSION) db := fdb.MustOpenDefault() _, e := db.ReadTransact(func(rtr fdb.ReadTransaction) (interface{}, error) { rtr.Options().SetAccessSystemKeys() @@ -110,7 +112,7 @@ func TestReadTransactionOptions(t *testing.T) { } func ExampleTransactor() { - fdb.MustAPIVersion(720) + fdb.MustAPIVersion(API_VERSION) db := fdb.MustOpenDefault() setOne := func(t fdb.Transactor, key fdb.Key, value []byte) error { @@ -161,7 +163,7 @@ func ExampleTransactor() { } func ExampleReadTransactor() { - fdb.MustAPIVersion(720) + fdb.MustAPIVersion(API_VERSION) db := fdb.MustOpenDefault() getOne := func(rt fdb.ReadTransactor, key fdb.Key) ([]byte, error) { @@ -214,7 +216,7 @@ func ExampleReadTransactor() { } func ExamplePrefixRange() { - fdb.MustAPIVersion(720) + fdb.MustAPIVersion(API_VERSION) db := fdb.MustOpenDefault() tr, e := db.CreateTransaction() @@ -253,7 +255,7 @@ func ExamplePrefixRange() { } func ExampleRangeIterator() { - fdb.MustAPIVersion(720) + fdb.MustAPIVersion(API_VERSION) db := fdb.MustOpenDefault() tr, e := db.CreateTransaction() diff --git a/bindings/go/src/fdb/generated.go b/bindings/go/src/fdb/generated.go index a3c0674e64..b765e09508 100644 --- a/bindings/go/src/fdb/generated.go +++ b/bindings/go/src/fdb/generated.go @@ -102,6 +102,11 @@ func (o NetworkOptions) SetTraceFileIdentifier(param string) error { return o.setOpt(36, []byte(param)) } +// Use the same base trace file name for all client threads as it did before version 7.2. The current default behavior is to use distinct trace file names for client threads by including their version and thread index. +func (o NetworkOptions) SetTraceShareAmongClientThreads() error { + return o.setOpt(37, nil) +} + // Set file suffix for partially written log files. // // Parameter: Append this suffix to partially written log files. When a log file is complete, it is renamed to remove the suffix. No separator is added between the file and the suffix. If you want to add a file extension, you should include the separator - e.g. '.tmp' instead of 'tmp' to add the 'tmp' extension. @@ -239,6 +244,13 @@ func (o NetworkOptions) SetClientThreadsPerVersion(param int64) error { return o.setOpt(65, int64ToBytes(param)) } +// Adds an external client library to be used with a future version protocol. This option can be used testing purposes only! +// +// Parameter: path to client library +func (o NetworkOptions) SetFutureVersionClientLibrary(param string) error { + return o.setOpt(66, []byte(param)) +} + // Disables logging of client statistics, such as sampled transaction activity. 
func (o NetworkOptions) SetDisableClientStatisticsLogging() error { return o.setOpt(70, nil) @@ -254,6 +266,11 @@ func (o NetworkOptions) SetEnableRunLoopProfiling() error { return o.setOpt(71, nil) } +// Prevents the multi-version client API from being disabled, even if no external clients are configured. This option is required to use GRV caching. +func (o NetworkOptions) SetDisableClientBypass() error { + return o.setOpt(72, nil) +} + // Enable client buggify - will make requests randomly fail (intended for client testing) func (o NetworkOptions) SetClientBuggifyEnable() error { return o.setOpt(80, nil) @@ -610,11 +627,18 @@ func (o TransactionOptions) SetBypassUnreadable() error { return o.setOpt(1100, nil) } -// Allows this transaction to use cached GRV from the database context. Defaults to off. Upon first usage, starts a background updater to periodically update the cache to avoid stale read versions. +// Allows this transaction to use cached GRV from the database context. Defaults to off. Upon first usage, starts a background updater to periodically update the cache to avoid stale read versions. The disable_client_bypass option must also be set. func (o TransactionOptions) SetUseGrvCache() error { return o.setOpt(1101, nil) } +// Attach given authorization token to the transaction such that subsequent tenant-aware requests are authorized +// +// Parameter: A JSON Web Token authorized to access data belonging to one or more tenants, indicated by 'tenants' claim of the token's payload. +func (o TransactionOptions) SetAuthorizationToken(param string) error { + return o.setOpt(2000, []byte(param)) +} + type StreamingMode int const ( diff --git a/bindings/java/CMakeLists.txt b/bindings/java/CMakeLists.txt index 22564dccc8..7057f22384 100644 --- a/bindings/java/CMakeLists.txt +++ b/bindings/java/CMakeLists.txt @@ -34,9 +34,11 @@ set(JAVA_BINDING_SRCS src/main/com/apple/foundationdb/FDBDatabase.java src/main/com/apple/foundationdb/FDBTenant.java src/main/com/apple/foundationdb/FDBTransaction.java + src/main/com/apple/foundationdb/FutureBool.java src/main/com/apple/foundationdb/FutureInt64.java src/main/com/apple/foundationdb/FutureKey.java src/main/com/apple/foundationdb/FutureKeyArray.java + src/main/com/apple/foundationdb/FutureKeyRangeArray.java src/main/com/apple/foundationdb/FutureResult.java src/main/com/apple/foundationdb/FutureResults.java src/main/com/apple/foundationdb/FutureMappedResults.java @@ -56,6 +58,7 @@ set(JAVA_BINDING_SRCS src/main/com/apple/foundationdb/RangeQuery.java src/main/com/apple/foundationdb/MappedRangeQuery.java src/main/com/apple/foundationdb/KeyArrayResult.java + src/main/com/apple/foundationdb/KeyRangeArrayResult.java src/main/com/apple/foundationdb/RangeResult.java src/main/com/apple/foundationdb/MappedRangeResult.java src/main/com/apple/foundationdb/RangeResultInfo.java diff --git a/bindings/java/JavaWorkload.cpp b/bindings/java/JavaWorkload.cpp index e9bf309fa4..6c65313f4b 100644 --- a/bindings/java/JavaWorkload.cpp +++ b/bindings/java/JavaWorkload.cpp @@ -379,7 +379,7 @@ struct JVM { jmethodID selectMethod = env->GetStaticMethodID(fdbClass, "selectAPIVersion", "(I)Lcom/apple/foundationdb/FDB;"); checkException(); - auto fdbInstance = env->CallStaticObjectMethod(fdbClass, selectMethod, jint(720)); + auto fdbInstance = env->CallStaticObjectMethod(fdbClass, selectMethod, jint(FDB_API_VERSION)); checkException(); env->CallObjectMethod(fdbInstance, getMethod(fdbClass, "disableShutdownHook", "()V")); checkException(); diff --git a/bindings/java/fdbJNI.cpp 
b/bindings/java/fdbJNI.cpp index 04c87e3b19..660e6945fa 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -25,9 +25,11 @@ #include "com_apple_foundationdb_FDB.h" #include "com_apple_foundationdb_FDBDatabase.h" #include "com_apple_foundationdb_FDBTransaction.h" +#include "com_apple_foundationdb_FutureBool.h" #include "com_apple_foundationdb_FutureInt64.h" #include "com_apple_foundationdb_FutureKey.h" #include "com_apple_foundationdb_FutureKeyArray.h" +#include "com_apple_foundationdb_FutureKeyRangeArray.h" #include "com_apple_foundationdb_FutureResult.h" #include "com_apple_foundationdb_FutureResults.h" #include "com_apple_foundationdb_FutureStrings.h" @@ -55,7 +57,11 @@ static jclass mapped_range_result_class; static jclass mapped_key_value_class; static jclass string_class; static jclass key_array_result_class; +static jclass keyrange_class; +static jclass keyrange_array_result_class; static jmethodID key_array_result_init; +static jmethodID keyrange_init; +static jmethodID keyrange_array_result_init; static jmethodID range_result_init; static jmethodID mapped_range_result_init; static jmethodID mapped_key_value_from_bytes; @@ -278,6 +284,23 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_NativeFuture_Future_1releaseM fdb_future_release_memory(var); } +JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_FutureBool_FutureBool_1get(JNIEnv* jenv, jobject, jlong future) { + if (!future) { + throwParamNotNull(jenv); + return 0; + } + FDBFuture* f = (FDBFuture*)future; + + fdb_bool_t value = false; + fdb_error_t err = fdb_future_get_bool(f, &value); + if (err) { + safeThrow(jenv, getThrowable(jenv, err)); + return 0; + } + + return (jboolean)value; +} + JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FutureInt64_FutureInt64_1get(JNIEnv* jenv, jobject, jlong future) { if (!future) { throwParamNotNull(jenv); @@ -407,6 +430,61 @@ JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureKeyArray_FutureKeyAr return result; } +JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureKeyRangeArray_FutureKeyRangeArray_1get(JNIEnv* jenv, + jobject, + jlong future) { + if (!future) { + throwParamNotNull(jenv); + return JNI_NULL; + } + + FDBFuture* f = (FDBFuture*)future; + + const FDBKeyRange* fdbKr; + int count; + fdb_error_t err = fdb_future_get_keyrange_array(f, &fdbKr, &count); + if (err) { + safeThrow(jenv, getThrowable(jenv, err)); + return JNI_NULL; + } + + jobjectArray kr_values = jenv->NewObjectArray(count, keyrange_class, NULL); + if (!kr_values) { + if (!jenv->ExceptionOccurred()) + throwOutOfMem(jenv); + return JNI_NULL; + } + + for (int i = 0; i < count; i++) { + jbyteArray beginArr = jenv->NewByteArray(fdbKr[i].begin_key_length); + if (!beginArr) { + if (!jenv->ExceptionOccurred()) + throwOutOfMem(jenv); + return JNI_NULL; + } + jbyteArray endArr = jenv->NewByteArray(fdbKr[i].end_key_length); + if (!endArr) { + if (!jenv->ExceptionOccurred()) + throwOutOfMem(jenv); + return JNI_NULL; + } + jenv->SetByteArrayRegion(beginArr, 0, fdbKr[i].begin_key_length, (const jbyte*)fdbKr[i].begin_key); + jenv->SetByteArrayRegion(endArr, 0, fdbKr[i].end_key_length, (const jbyte*)fdbKr[i].end_key); + + jobject kr = jenv->NewObject(keyrange_class, keyrange_init, beginArr, endArr); + if (jenv->ExceptionOccurred()) + return JNI_NULL; + jenv->SetObjectArrayElement(kr_values, i, kr); + if (jenv->ExceptionOccurred()) + return JNI_NULL; + } + jobject krarr = jenv->NewObject(keyrange_array_result_class, keyrange_array_result_init, kr_values); + if (jenv->ExceptionOccurred()) 
+ return JNI_NULL; + + return krarr; +} + // SOMEDAY: explore doing this more efficiently with Direct ByteBuffers JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureResults_FutureResults_1get(JNIEnv* jenv, jobject, @@ -765,6 +843,207 @@ JNIEXPORT jdouble JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1getM return (jdouble)fdb_database_get_main_thread_busyness(database); } +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1purgeBlobGranules(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes, + jlong purgeVersion, + jboolean force) { + if (!dbPtr || !beginKeyBytes || !endKeyBytes) { + throwParamNotNull(jenv); + return 0; + } + + FDBDatabase* database = (FDBDatabase*)dbPtr; + + uint8_t* beginKeyArr = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!beginKeyArr) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKeyArr = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKeyArr) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_database_purge_blob_granules(database, + beginKeyArr, + jenv->GetArrayLength(beginKeyBytes), + endKeyArr, + jenv->GetArrayLength(endKeyBytes), + purgeVersion, + (fdb_bool_t)force); + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKeyArr, JNI_ABORT); + return (jlong)f; +} + +JNIEXPORT jlong JNICALL +Java_com_apple_foundationdb_FDBDatabase_Database_1waitPurgeGranulesComplete(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray purgeKeyBytes) { + if (!dbPtr || !purgeKeyBytes) { + throwParamNotNull(jenv); + return 0; + } + FDBDatabase* database = (FDBDatabase*)dbPtr; + uint8_t* purgeKeyArr = (uint8_t*)jenv->GetByteArrayElements(purgeKeyBytes, JNI_NULL); + + if (!purgeKeyArr) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + FDBFuture* f = + fdb_database_wait_purge_granules_complete(database, purgeKeyArr, jenv->GetArrayLength(purgeKeyBytes)); + jenv->ReleaseByteArrayElements(purgeKeyBytes, (jbyte*)purgeKeyArr, JNI_ABORT); + + return (jlong)f; +} + +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1blobbifyRange(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes) { + if (!dbPtr || !beginKeyBytes || !endKeyBytes) { + throwParamNotNull(jenv); + return 0; + } + + FDBDatabase* database = (FDBDatabase*)dbPtr; + + uint8_t* beginKeyArr = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!beginKeyArr) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKeyArr = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKeyArr) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_database_blobbify_range( + database, beginKeyArr, jenv->GetArrayLength(beginKeyBytes), endKeyArr, jenv->GetArrayLength(endKeyBytes)); + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + 
jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKeyArr, JNI_ABORT); + return (jlong)f; +} + +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1unblobbifyRange(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes) { + if (!dbPtr || !beginKeyBytes || !endKeyBytes) { + throwParamNotNull(jenv); + return 0; + } + + FDBDatabase* database = (FDBDatabase*)dbPtr; + + uint8_t* beginKeyArr = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!beginKeyArr) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKeyArr = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKeyArr) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_database_unblobbify_range( + database, beginKeyArr, jenv->GetArrayLength(beginKeyBytes), endKeyArr, jenv->GetArrayLength(endKeyBytes)); + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKeyArr, JNI_ABORT); + return (jlong)f; +} + +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1listBlobbifiedRanges(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes, + jint rangeLimit) { + if (!dbPtr || !beginKeyBytes || !endKeyBytes) { + throwParamNotNull(jenv); + return 0; + } + FDBDatabase* tr = (FDBDatabase*)dbPtr; + + uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!startKey) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKey = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKey) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_database_list_blobbified_ranges( + tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), rangeLimit); + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT); + return (jlong)f; +} + +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1verifyBlobRange(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes, + jlong version) { + if (!dbPtr || !beginKeyBytes || !endKeyBytes) { + throwParamNotNull(jenv); + return 0; + } + FDBDatabase* tr = (FDBDatabase*)dbPtr; + + uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!startKey) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKey = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKey) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_database_verify_blob_range( + tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), version); + 
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT); + return (jlong)f; +} + JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_FDB_Error_1predicate(JNIEnv* jenv, jobject, jint predicate, @@ -1242,6 +1521,41 @@ Java_com_apple_foundationdb_FDBTransaction_Transaction_1getRangeSplitPoints(JNIE return (jlong)f; } +JNIEXPORT jlong JNICALL +Java_com_apple_foundationdb_FDBTransaction_Transaction_1getBlobGranuleRanges(JNIEnv* jenv, + jobject, + jlong tPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes, + jint rowLimit) { + if (!tPtr || !beginKeyBytes || !endKeyBytes || !rowLimit) { + throwParamNotNull(jenv); + return 0; + } + FDBTransaction* tr = (FDBTransaction*)tPtr; + + uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!startKey) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKey = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKey) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_transaction_get_blob_granule_ranges( + tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), rowLimit); + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT); + return (jlong)f; +} + JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBTransaction_Transaction_1set(JNIEnv* jenv, jobject, jlong tPtr, @@ -1681,6 +1995,15 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { key_array_result_init = env->GetMethodID(local_key_array_result_class, "", "([B[I)V"); key_array_result_class = (jclass)(env)->NewGlobalRef(local_key_array_result_class); + jclass local_keyrange_class = env->FindClass("com/apple/foundationdb/Range"); + keyrange_init = env->GetMethodID(local_keyrange_class, "", "([B[B)V"); + keyrange_class = (jclass)(env)->NewGlobalRef(local_keyrange_class); + + jclass local_keyrange_array_result_class = env->FindClass("com/apple/foundationdb/KeyRangeArrayResult"); + keyrange_array_result_init = + env->GetMethodID(local_keyrange_array_result_class, "", "([Lcom/apple/foundationdb/Range;)V"); + keyrange_array_result_class = (jclass)(env)->NewGlobalRef(local_keyrange_array_result_class); + jclass local_range_result_summary_class = env->FindClass("com/apple/foundationdb/RangeResultSummary"); range_result_summary_init = env->GetMethodID(local_range_result_summary_class, "", "([BIZ)V"); range_result_summary_class = (jclass)(env)->NewGlobalRef(local_range_result_summary_class); @@ -1705,6 +2028,12 @@ void JNI_OnUnload(JavaVM* vm, void* reserved) { if (range_result_class != JNI_NULL) { env->DeleteGlobalRef(range_result_class); } + if (keyrange_array_result_class != JNI_NULL) { + env->DeleteGlobalRef(keyrange_array_result_class); + } + if (keyrange_class != JNI_NULL) { + env->DeleteGlobalRef(keyrange_class); + } if (mapped_range_result_class != JNI_NULL) { env->DeleteGlobalRef(mapped_range_result_class); } diff --git a/bindings/java/src/integration/com/apple/foundationdb/CycleMultiClientIntegrationTest.java b/bindings/java/src/integration/com/apple/foundationdb/CycleMultiClientIntegrationTest.java index 5087361c43..80afa6c761 100644 --- 
a/bindings/java/src/integration/com/apple/foundationdb/CycleMultiClientIntegrationTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/CycleMultiClientIntegrationTest.java @@ -40,6 +40,8 @@ import org.junit.jupiter.api.Assertions; * This test is to verify the atomicity of transactions. */ public class CycleMultiClientIntegrationTest { + public static final int API_VERSION = 720; + public static final MultiClientHelper clientHelper = new MultiClientHelper(); // more write txn than validate txn, as parent thread waits only for validate txn. @@ -51,7 +53,7 @@ public class CycleMultiClientIntegrationTest { private static List expected = new ArrayList<>(Arrays.asList("0", "1", "2", "3")); public static void main(String[] args) throws Exception { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); setupThreads(fdb); Collection dbs = clientHelper.openDatabases(fdb); // the clientHelper will close the databases for us System.out.println("Starting tests"); diff --git a/bindings/java/src/integration/com/apple/foundationdb/DirectoryTest.java b/bindings/java/src/integration/com/apple/foundationdb/DirectoryTest.java index 59fbc3fe55..b91c9e7de3 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/DirectoryTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/DirectoryTest.java @@ -40,7 +40,8 @@ import org.junit.jupiter.api.extension.ExtendWith; */ @ExtendWith(RequiresDatabase.class) class DirectoryTest { - private static final FDB fdb = FDB.selectAPIVersion(720); + public static final int API_VERSION = 720; + private static final FDB fdb = FDB.selectAPIVersion(API_VERSION); @Test void testCanCreateDirectory() throws Exception { diff --git a/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java b/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java index 063e9e276d..3aedef4d1e 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java @@ -41,7 +41,8 @@ import org.junit.jupiter.api.extension.ExtendWith; @ExtendWith(RequiresDatabase.class) class MappedRangeQueryIntegrationTest { - private static final FDB fdb = FDB.selectAPIVersion(720); + public static final int API_VERSION = 720; + private static final FDB fdb = FDB.selectAPIVersion(API_VERSION); public String databaseArg = null; private Database openFDB() { return fdb.open(databaseArg); } @@ -110,7 +111,7 @@ class MappedRangeQueryIntegrationTest { boolean validate = true; @Test void comparePerformance() { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try (Database db = openFDB()) { insertRecordsWithIndexes(numRecords, db); instrument(rangeQueryAndThenRangeQueries, "rangeQueryAndThenRangeQueries", db); diff --git a/bindings/java/src/integration/com/apple/foundationdb/RangeQueryIntegrationTest.java b/bindings/java/src/integration/com/apple/foundationdb/RangeQueryIntegrationTest.java index bc64877199..fb6d3afd9f 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/RangeQueryIntegrationTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/RangeQueryIntegrationTest.java @@ -41,7 +41,8 @@ import org.junit.jupiter.api.extension.ExtendWith; */ @ExtendWith(RequiresDatabase.class) class RangeQueryIntegrationTest { - private static final FDB fdb = FDB.selectAPIVersion(720); + public static final int API_VERSION 
= 720; + private static final FDB fdb = FDB.selectAPIVersion(API_VERSION); @BeforeEach @AfterEach diff --git a/bindings/java/src/integration/com/apple/foundationdb/RepeatableReadMultiThreadClientTest.java b/bindings/java/src/integration/com/apple/foundationdb/RepeatableReadMultiThreadClientTest.java index c11940d41a..ab8ab1364a 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/RepeatableReadMultiThreadClientTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/RepeatableReadMultiThreadClientTest.java @@ -41,6 +41,8 @@ import org.junit.jupiter.api.Assertions; * are still seeting the initialValue even after new transactions set them to a new value. */ public class RepeatableReadMultiThreadClientTest { + public static final int API_VERSION = 720; + public static final MultiClientHelper clientHelper = new MultiClientHelper(); private static final int oldValueReadCount = 30; @@ -52,7 +54,7 @@ public class RepeatableReadMultiThreadClientTest { private static final Map threadToOldValueReaders = new HashMap<>(); public static void main(String[] args) throws Exception { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); setupThreads(fdb); Collection dbs = clientHelper.openDatabases(fdb); // the clientHelper will close the databases for us System.out.println("Starting tests"); diff --git a/bindings/java/src/integration/com/apple/foundationdb/RequiresDatabase.java b/bindings/java/src/integration/com/apple/foundationdb/RequiresDatabase.java index 785008902c..ead1f499c1 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/RequiresDatabase.java +++ b/bindings/java/src/integration/com/apple/foundationdb/RequiresDatabase.java @@ -47,6 +47,7 @@ import org.opentest4j.TestAbortedException; * be running a server and you don't want to deal with spurious test failures. */ public class RequiresDatabase implements ExecutionCondition, BeforeAllCallback { + public static final int API_VERSION = 720; public static boolean canRunIntegrationTest() { String prop = System.getProperty("run.integration.tests"); @@ -80,7 +81,7 @@ public class RequiresDatabase implements ExecutionCondition, BeforeAllCallback { * assume that if we are here, then canRunIntegrationTest() is returning true and we don't have to bother * checking it. */ - try (Database db = FDB.selectAPIVersion(720).open()) { + try (Database db = FDB.selectAPIVersion(API_VERSION).open()) { db.run(tr -> { CompletableFuture future = tr.get("test".getBytes()); diff --git a/bindings/java/src/integration/com/apple/foundationdb/SidebandMultiThreadClientTest.java b/bindings/java/src/integration/com/apple/foundationdb/SidebandMultiThreadClientTest.java index 4a4736d566..30f86632eb 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/SidebandMultiThreadClientTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/SidebandMultiThreadClientTest.java @@ -19,6 +19,8 @@ import org.junit.jupiter.api.Assertions; * This test is to verify the causal consistency of transactions for mutli-threaded client. 
*/ public class SidebandMultiThreadClientTest { + public static final int API_VERSION = 720; + public static final MultiClientHelper clientHelper = new MultiClientHelper(); private static final Map> db2Queues = new HashMap<>(); @@ -26,7 +28,7 @@ public class SidebandMultiThreadClientTest { private static final int txnCnt = 1000; public static void main(String[] args) throws Exception { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); setupThreads(fdb); Collection dbs = clientHelper.openDatabases(fdb); // the clientHelper will close the databases for us for (Database db : dbs) { diff --git a/bindings/java/src/junit/com/apple/foundationdb/FDBLibraryRule.java b/bindings/java/src/junit/com/apple/foundationdb/FDBLibraryRule.java index 6e53b179e5..fc54f6c333 100644 --- a/bindings/java/src/junit/com/apple/foundationdb/FDBLibraryRule.java +++ b/bindings/java/src/junit/com/apple/foundationdb/FDBLibraryRule.java @@ -29,6 +29,8 @@ import org.junit.jupiter.api.extension.ExtensionContext; * are not available for any reason. */ public class FDBLibraryRule implements BeforeAllCallback { + public static final int CURRENT_API_VERSION = 720; + private final int apiVersion; // because FDB is a singleton (currently), this isn't a super-useful cache, @@ -37,7 +39,7 @@ public class FDBLibraryRule implements BeforeAllCallback { public FDBLibraryRule(int apiVersion) { this.apiVersion = apiVersion; } - public static FDBLibraryRule current() { return new FDBLibraryRule(720); } + public static FDBLibraryRule current() { return new FDBLibraryRule(CURRENT_API_VERSION); } public static FDBLibraryRule v63() { return new FDBLibraryRule(630); } diff --git a/bindings/java/src/main/com/apple/foundationdb/Database.java b/bindings/java/src/main/com/apple/foundationdb/Database.java index 8606d7ec39..5100dec392 100644 --- a/bindings/java/src/main/com/apple/foundationdb/Database.java +++ b/bindings/java/src/main/com/apple/foundationdb/Database.java @@ -161,6 +161,172 @@ public interface Database extends AutoCloseable, TransactionContext { */ double getMainThreadBusyness(); + /** + * Runs {@link #purgeBlobGranules(Function)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param force if true delete all data, if not keep data >= purgeVersion + * + * @return the key to watch for purge complete + */ + default CompletableFuture purgeBlobGranules(byte[] beginKey, byte[] endKey, boolean force) { + return purgeBlobGranules(beginKey, endKey, -2, force, getExecutor()); + } + + /** + * Runs {@link #purgeBlobGranules(Function)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param purgeVersion version to purge at + * @param force if true delete all data, if not keep data >= purgeVersion + * + * @return the key to watch for purge complete + */ + default CompletableFuture purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force) { + return purgeBlobGranules(beginKey, endKey, purgeVersion, force, getExecutor()); + } + + /** + * Queues a purge of blob granules for the specified key range, at the specified version. 
+ * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param purgeVersion version to purge at + * @param force if true delete all data, if not keep data >= purgeVersion + * @param e the {@link Executor} to use for asynchronous callbacks + + * @return the key to watch for purge complete + */ + CompletableFuture purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor e); + + + /** + * Runs {@link #waitPurgeGranulesComplete(Function)} on the default executor. + * + * @param purgeKey key to watch + */ + default CompletableFuture waitPurgeGranulesComplete(byte[] purgeKey) { + return waitPurgeGranulesComplete(purgeKey, getExecutor()); + } + + /** + * Wait for a previous call to purgeBlobGranules to complete. + * + * @param purgeKey key to watch + * @param e the {@link Executor} to use for asynchronous callbacks + */ + CompletableFuture waitPurgeGranulesComplete(byte[] purgeKey, Executor e); + + /** + * Runs {@link #blobbifyRange(Function)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + + * @return if the recording of the range was successful + */ + default CompletableFuture blobbifyRange(byte[] beginKey, byte[] endKey) { + return blobbifyRange(beginKey, endKey, getExecutor()); + } + + /** + * Sets a range to be blobbified in the database. Must be a completely unblobbified range. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param e the {@link Executor} to use for asynchronous callbacks + + * @return if the recording of the range was successful + */ + CompletableFuture blobbifyRange(byte[] beginKey, byte[] endKey, Executor e); + + /** + * Runs {@link #unblobbifyRange(Function)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + + * @return if the recording of the range was successful + */ + default CompletableFuture unblobbifyRange(byte[] beginKey, byte[] endKey) { + return unblobbifyRange(beginKey, endKey, getExecutor()); + } + + /** + * Unsets a blobbified range in the database. The range must be aligned to known blob ranges. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param e the {@link Executor} to use for asynchronous callbacks + + * @return if the recording of the range was successful + */ + CompletableFuture unblobbifyRange(byte[] beginKey, byte[] endKey, Executor e); + + /** + * Runs {@link #listBlobbifiedRanges(Function)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param rangeLimit batch size + * @param e the {@link Executor} to use for asynchronous callbacks + + * @return a future with the list of blobbified ranges: [lastLessThan(beginKey), firstGreaterThanOrEqual(endKey)] + */ + default CompletableFuture listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit) { + return listBlobbifiedRanges(beginKey, endKey, rangeLimit, getExecutor()); + } + + /** + * Lists blobbified ranges in the database. There may be more if result.size() == rangeLimit. 
+ * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param rangeLimit batch size + * @param e the {@link Executor} to use for asynchronous callbacks + + * @return a future with the list of blobbified ranges: [lastLessThan(beginKey), firstGreaterThanOrEqual(endKey)] + */ + CompletableFuture listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit, Executor e); + + /** + * Runs {@link #verifyBlobRange(Function)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * + * @return a future with the version of the last blob granule. + */ + default CompletableFuture verifyBlobRange(byte[] beginKey, byte[] endKey) { + return verifyBlobRange(beginKey, endKey, -2, getExecutor()); + } + + /** + * Runs {@link #verifyBlobRange(Function)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param version version to read at + * + * @return a future with the version of the last blob granule. + */ + default CompletableFuture verifyBlobRange(byte[] beginKey, byte[] endKey, long version) { + return verifyBlobRange(beginKey, endKey, version, getExecutor()); + } + + /** + * Checks if a blob range is blobbified. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param version version to read at + * + * @return a future with the version of the last blob granule. + */ + CompletableFuture verifyBlobRange(byte[] beginKey, byte[] endKey, long version, Executor e); + /** * Runs a read-only transactional function against this {@code Database} with retry logic. * {@link Function#apply(Object) apply(ReadTransaction)} will be called on the diff --git a/bindings/java/src/main/com/apple/foundationdb/FDB.java b/bindings/java/src/main/com/apple/foundationdb/FDB.java index 5215d0836e..47ba2eead1 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDB.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDB.java @@ -191,11 +191,6 @@ public class FDB { Select_API_version(version); singleton = new FDB(version); - if (version < 720) { - TenantManagement.TENANT_MAP_PREFIX = ByteArrayUtil.join(new byte[] { (byte)255, (byte)255 }, - "/management/tenant_map/".getBytes()); - } - return singleton; } diff --git a/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java b/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java index 5e0b808242..98c001a1b0 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java @@ -200,6 +200,66 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume } } + @Override + public CompletableFuture purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor e) { + pointerReadLock.lock(); + try { + return new FutureKey(Database_purgeBlobGranules(getPtr(), beginKey, endKey, purgeVersion, force), e, eventKeeper); + } finally { + pointerReadLock.unlock(); + } + } + + @Override + public CompletableFuture waitPurgeGranulesComplete(byte[] purgeKey, Executor e) { + pointerReadLock.lock(); + try { + return new FutureVoid(Database_waitPurgeGranulesComplete(getPtr(), purgeKey), e); + } finally { + pointerReadLock.unlock(); + } + } + + @Override + public CompletableFuture blobbifyRange(byte[] beginKey, byte[] endKey, Executor e) { + pointerReadLock.lock(); + try { + return new FutureBool(Database_blobbifyRange(getPtr(), beginKey, endKey), e); + 
} finally { + pointerReadLock.unlock(); + } + } + + @Override + public CompletableFuture unblobbifyRange(byte[] beginKey, byte[] endKey, Executor e) { + pointerReadLock.lock(); + try { + return new FutureBool(Database_unblobbifyRange(getPtr(), beginKey, endKey), e); + } finally { + pointerReadLock.unlock(); + } + } + + @Override + public CompletableFuture listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit, Executor e) { + pointerReadLock.lock(); + try { + return new FutureKeyRangeArray(Database_listBlobbifiedRanges(getPtr(), beginKey, endKey, rangeLimit), e); + } finally { + pointerReadLock.unlock(); + } + } + + @Override + public CompletableFuture verifyBlobRange(byte[] beginKey, byte[] endKey, long version, Executor e) { + pointerReadLock.lock(); + try { + return new FutureInt64(Database_verifyBlobRange(getPtr(), beginKey, endKey, version), e); + } finally { + pointerReadLock.unlock(); + } + } + @Override public Executor getExecutor() { return executor; @@ -215,4 +275,10 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume private native void Database_dispose(long cPtr); private native void Database_setOption(long cPtr, int code, byte[] value) throws FDBException; private native double Database_getMainThreadBusyness(long cPtr); + private native long Database_purgeBlobGranules(long cPtr, byte[] beginKey, byte[] endKey, long purgeVersion, boolean force); + private native long Database_waitPurgeGranulesComplete(long cPtr, byte[] purgeKey); + private native long Database_blobbifyRange(long cPtr, byte[] beginKey, byte[] endKey); + private native long Database_unblobbifyRange(long cPtr, byte[] beginKey, byte[] endKey); + private native long Database_listBlobbifiedRanges(long cPtr, byte[] beginKey, byte[] endKey, int rangeLimit); + private native long Database_verifyBlobRange(long cPtr, byte[] beginKey, byte[] endKey, long version); } \ No newline at end of file diff --git a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java index b35196c146..7943c5e9d1 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java @@ -97,6 +97,11 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC return FDBTransaction.this.getRangeSplitPoints(range, chunkSize); } + @Override + public CompletableFuture getBlobGranuleRanges(byte[] begin, byte[] end, int rowLimit) { + return FDBTransaction.this.getBlobGranuleRanges(begin, end, rowLimit); + } + @Override public AsyncIterable getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, int limit, int matchIndex, boolean reverse, @@ -352,6 +357,16 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC return this.getRangeSplitPoints(range.begin, range.end, chunkSize); } + @Override + public CompletableFuture getBlobGranuleRanges(byte[] begin, byte[] end, int rowLimit) { + pointerReadLock.lock(); + try { + return new FutureKeyRangeArray(Transaction_getBlobGranuleRanges(getPtr(), begin, end, rowLimit), executor); + } finally { + pointerReadLock.unlock(); + } + } + @Override public AsyncIterable getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, int limit, int matchIndex, boolean reverse, StreamingMode mode) { @@ -842,4 +857,5 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC private native long Transaction_getKeyLocations(long cPtr, byte[] 
key); private native long Transaction_getEstimatedRangeSizeBytes(long cPtr, byte[] keyBegin, byte[] keyEnd); private native long Transaction_getRangeSplitPoints(long cPtr, byte[] keyBegin, byte[] keyEnd, long chunkSize); + private native long Transaction_getBlobGranuleRanges(long cPtr, byte[] keyBegin, byte[] keyEnd, int rowLimit); } diff --git a/bindings/java/src/main/com/apple/foundationdb/FutureBool.java b/bindings/java/src/main/com/apple/foundationdb/FutureBool.java new file mode 100644 index 0000000000..ddbbd02649 --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/FutureBool.java @@ -0,0 +1,37 @@ +/* + * FutureBool.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.apple.foundationdb; + +import java.util.concurrent.Executor; + +class FutureBool extends NativeFuture { + FutureBool(long cPtr, Executor executor) { + super(cPtr); + registerMarshalCallback(executor); + } + + @Override + protected Boolean getIfDone_internal(long cPtr) throws FDBException { + return FutureBool_get(cPtr); + } + + private native boolean FutureBool_get(long cPtr) throws FDBException; +} diff --git a/bindings/java/src/main/com/apple/foundationdb/FutureKeyRangeArray.java b/bindings/java/src/main/com/apple/foundationdb/FutureKeyRangeArray.java new file mode 100644 index 0000000000..d866e9fca4 --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/FutureKeyRangeArray.java @@ -0,0 +1,37 @@ +/* + * FutureKeyRangeArray.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.apple.foundationdb; + +import java.util.concurrent.Executor; + +class FutureKeyRangeArray extends NativeFuture { + FutureKeyRangeArray(long cPtr, Executor executor) { + super(cPtr); + registerMarshalCallback(executor); + } + + @Override + protected KeyRangeArrayResult getIfDone_internal(long cPtr) throws FDBException { + return FutureKeyRangeArray_get(cPtr); + } + + private native KeyRangeArrayResult FutureKeyRangeArray_get(long cPtr) throws FDBException; +} diff --git a/bindings/java/src/main/com/apple/foundationdb/KeyRangeArrayResult.java b/bindings/java/src/main/com/apple/foundationdb/KeyRangeArrayResult.java new file mode 100644 index 0000000000..7385b8fe0a --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/KeyRangeArrayResult.java @@ -0,0 +1,36 @@ +/* + * KeyRangeArrayResult.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.apple.foundationdb; + +import java.util.Arrays; +import java.util.List; + +public class KeyRangeArrayResult { + final List keyRanges; + + public KeyRangeArrayResult(Range[] keyRangeArr) { + this.keyRanges = Arrays.asList(keyRangeArr); + } + + public List getKeyRanges() { + return keyRanges; + } +} diff --git a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java index 11ed7e900c..04050de6fb 100644 --- a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java @@ -513,6 +513,17 @@ public interface ReadTransaction extends ReadTransactionContext { */ CompletableFuture getRangeSplitPoints(Range range, long chunkSize); + /** + * Gets the blob granule ranges for a given region. + * Returned in batches, requires calling again moving the begin key up. + * + * @param begin beginning of the range (inclusive) + * @param end end of the range (exclusive) + + * @return list of blob granules in the given range. May not be all. 
+ */ + CompletableFuture getBlobGranuleRanges(byte[] begin, byte[] end, int rowLimit); + /** * Returns a set of options that can be set on a {@code Transaction} diff --git a/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java b/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java index 12aaf70322..58f223fa4b 100644 --- a/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java +++ b/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java @@ -262,7 +262,7 @@ public class TenantManagement { this.begin = ByteArrayUtil.join(TENANT_MAP_PREFIX, begin); this.end = ByteArrayUtil.join(TENANT_MAP_PREFIX, end); - tr.options().setReadSystemKeys(); + tr.options().setRawAccess(); tr.options().setLockAware(); firstGet = tr.getRange(this.begin, this.end, limit); diff --git a/bindings/java/src/main/overview.html.in b/bindings/java/src/main/overview.html.in index 3154efbfc3..a37a8859f9 100644 --- a/bindings/java/src/main/overview.html.in +++ b/bindings/java/src/main/overview.html.in @@ -28,8 +28,10 @@ import com.apple.foundationdb.FDB; import com.apple.foundationdb.tuple.Tuple; public class Example { + public static final int apiVersion = 720; + public static void main(String[] args) { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(apiVersion); try(Database db = fdb.open()) { // Run an operation on the database diff --git a/bindings/java/src/test/com/apple/foundationdb/test/BlockingBenchmark.java b/bindings/java/src/test/com/apple/foundationdb/test/BlockingBenchmark.java index a1d7a4d976..425c5d2369 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/BlockingBenchmark.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/BlockingBenchmark.java @@ -29,11 +29,13 @@ import com.apple.foundationdb.FDB; import com.apple.foundationdb.Transaction; public class BlockingBenchmark { + public static final int API_VERSION = 720; + private static final int REPS = 100000; private static final int PARALLEL = 100; public static void main(String[] args) throws InterruptedException { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); // The cluster file DOES NOT need to be valid, although it must exist. // This is because the database is never really contacted in this test. 
diff --git a/bindings/java/src/test/com/apple/foundationdb/test/ConcurrentGetSetGet.java b/bindings/java/src/test/com/apple/foundationdb/test/ConcurrentGetSetGet.java index 38be19a60f..0eabef64c0 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/ConcurrentGetSetGet.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/ConcurrentGetSetGet.java @@ -30,6 +30,8 @@ import com.apple.foundationdb.Database; import com.apple.foundationdb.FDB; public class ConcurrentGetSetGet { + public static final int API_VERSION = 720; + public static final Charset UTF8 = Charset.forName("UTF-8"); final Semaphore semaphore = new Semaphore(CONCURRENCY); @@ -48,7 +50,7 @@ public class ConcurrentGetSetGet { } public static void main(String[] args) { - try(Database database = FDB.selectAPIVersion(720).open()) { + try(Database database = FDB.selectAPIVersion(API_VERSION).open()) { new ConcurrentGetSetGet().apply(database); } } diff --git a/bindings/java/src/test/com/apple/foundationdb/test/Context.java b/bindings/java/src/test/com/apple/foundationdb/test/Context.java index a594e088a1..dbbe7e73de 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/Context.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/Context.java @@ -29,6 +29,7 @@ import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; import com.apple.foundationdb.Database; import com.apple.foundationdb.FDB; @@ -64,7 +65,7 @@ abstract class Context implements Runnable, AutoCloseable { private List children = new LinkedList<>(); private static Map transactionMap = new HashMap<>(); private static Map transactionRefCounts = new HashMap<>(); - private static Map tenantMap = new HashMap<>(); + private static Map tenantMap = new ConcurrentHashMap<>(); Context(Database db, byte[] prefix) { this.db = db; @@ -83,8 +84,8 @@ abstract class Context implements Runnable, AutoCloseable { try { executeOperations(); } catch(Throwable t) { - // EAT t.printStackTrace(); + System.exit(1); } while(children.size() > 0) { //System.out.println("Shutting down...waiting on " + children.size() + " threads"); @@ -146,10 +147,11 @@ abstract class Context implements Runnable, AutoCloseable { private static synchronized boolean newTransaction(Database db, Optional tenant, String trName, boolean allowReplace) { TransactionState oldState = transactionMap.get(trName); if (oldState != null) { - releaseTransaction(oldState.transaction); - } - else if (!allowReplace) { - return false; + if (allowReplace) { + releaseTransaction(oldState.transaction); + } else { + return false; + } } TransactionState newState = new TransactionState(createTransaction(db, tenant), tenant); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/Example.java b/bindings/java/src/test/com/apple/foundationdb/test/Example.java index da5bbfdc2a..d7f1336d51 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/Example.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/Example.java @@ -25,8 +25,10 @@ import com.apple.foundationdb.FDB; import com.apple.foundationdb.tuple.Tuple; public class Example { + public static final int API_VERSION = 720; + public static void main(String[] args) { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database db = fdb.open()) { // Run an operation on the database diff --git a/bindings/java/src/test/com/apple/foundationdb/test/IterableTest.java 
b/bindings/java/src/test/com/apple/foundationdb/test/IterableTest.java index 35adfa5e1f..2af2152cf3 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/IterableTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/IterableTest.java @@ -28,10 +28,12 @@ import com.apple.foundationdb.KeyValue; import com.apple.foundationdb.TransactionContext; public class IterableTest { + public static final int API_VERSION = 720; + public static void main(String[] args) throws InterruptedException { final int reps = 1000; try { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database db = fdb.open()) { runTests(reps, db); } diff --git a/bindings/java/src/test/com/apple/foundationdb/test/LocalityTests.java b/bindings/java/src/test/com/apple/foundationdb/test/LocalityTests.java index 6410165f27..969b6c75e3 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/LocalityTests.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/LocalityTests.java @@ -32,9 +32,10 @@ import com.apple.foundationdb.async.AsyncUtil; import com.apple.foundationdb.tuple.ByteArrayUtil; public class LocalityTests { + public static final int API_VERSION = 720; public static void main(String[] args) { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database database = fdb.open(args[0])) { try(Transaction tr = database.createTransaction()) { String[] keyAddresses = LocalityUtil.getAddressesForKey(tr, "a".getBytes()).join(); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/ParallelRandomScan.java b/bindings/java/src/test/com/apple/foundationdb/test/ParallelRandomScan.java index 6518116324..e5e0c9f9e6 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/ParallelRandomScan.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/ParallelRandomScan.java @@ -36,6 +36,8 @@ import com.apple.foundationdb.async.AsyncIterator; import com.apple.foundationdb.tuple.ByteArrayUtil; public class ParallelRandomScan { + public static final int API_VERSION = 720; + private static final int ROWS = 1000000; private static final int DURATION_MS = 2000; private static final int PARALLELISM_MIN = 10; @@ -43,7 +45,7 @@ public class ParallelRandomScan { private static final int PARALLELISM_STEP = 5; public static void main(String[] args) throws InterruptedException { - FDB api = FDB.selectAPIVersion(720); + FDB api = FDB.selectAPIVersion(API_VERSION); try(Database database = api.open(args[0])) { for(int i = PARALLELISM_MIN; i <= PARALLELISM_MAX; i += PARALLELISM_STEP) { runTest(database, i, ROWS, DURATION_MS); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/SerialInsertion.java b/bindings/java/src/test/com/apple/foundationdb/test/SerialInsertion.java index d847556cdc..e11f8b9793 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/SerialInsertion.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/SerialInsertion.java @@ -29,12 +29,14 @@ import com.apple.foundationdb.FDB; import com.apple.foundationdb.Transaction; public class SerialInsertion { + public static final int API_VERSION = 720; + private static final int THREAD_COUNT = 10; private static final int BATCH_SIZE = 1000; private static final int NODES = 1000000; public static void main(String[] args) { - FDB api = FDB.selectAPIVersion(720); + FDB api = FDB.selectAPIVersion(API_VERSION); try(Database database = api.open()) { long start = System.currentTimeMillis(); diff --git 
a/bindings/java/src/test/com/apple/foundationdb/test/SerialIteration.java b/bindings/java/src/test/com/apple/foundationdb/test/SerialIteration.java index 6e262e561f..f55af41c35 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/SerialIteration.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/SerialIteration.java @@ -34,12 +34,14 @@ import com.apple.foundationdb.Transaction; import com.apple.foundationdb.async.AsyncIterable; public class SerialIteration { + public static final int API_VERSION = 720; + private static final int ROWS = 1000000; private static final int RUNS = 25; private static final int THREAD_COUNT = 1; public static void main(String[] args) throws InterruptedException { - FDB api = FDB.selectAPIVersion(720); + FDB api = FDB.selectAPIVersion(API_VERSION); try(Database database = api.open(args[0])) { for(int i = 1; i <= THREAD_COUNT; i++) { runThreadedTest(database, i); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/SerialTest.java b/bindings/java/src/test/com/apple/foundationdb/test/SerialTest.java index 9313543d02..ea3210e2de 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/SerialTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/SerialTest.java @@ -27,10 +27,12 @@ import com.apple.foundationdb.FDB; import com.apple.foundationdb.TransactionContext; public class SerialTest { + public static final int API_VERSION = 720; + public static void main(String[] args) throws InterruptedException { final int reps = 1000; try { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database db = fdb.open()) { runTests(reps, db); } diff --git a/bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java b/bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java index 1f3aec5501..6fdee20cad 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/SnapshotTransactionTest.java @@ -35,11 +35,13 @@ import com.apple.foundationdb.tuple.Tuple; * Some tests regarding conflict ranges to make sure they do what we expect. 
*/ public class SnapshotTransactionTest { + public static final int API_VERSION = 720; + private static final int CONFLICT_CODE = 1020; private static final Subspace SUBSPACE = new Subspace(Tuple.from("test", "conflict_ranges")); public static void main(String[] args) { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database db = fdb.open()) { snapshotReadShouldNotConflict(db); snapshotShouldNotAddConflictRange(db); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java index 4fa45f7cbe..2ce8e76343 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/TupleTest.java @@ -32,12 +32,14 @@ import com.apple.foundationdb.tuple.Tuple; import com.apple.foundationdb.tuple.Versionstamp; public class TupleTest { + public static final int API_VERSION = 720; + private static final byte FF = (byte)0xff; public static void main(String[] args) throws NoSuchFieldException { final int reps = 1000; try { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database db = fdb.open()) { runTests(reps, db); } diff --git a/bindings/java/src/test/com/apple/foundationdb/test/VersionstampSmokeTest.java b/bindings/java/src/test/com/apple/foundationdb/test/VersionstampSmokeTest.java index b39744dd32..421db6c542 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/VersionstampSmokeTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/VersionstampSmokeTest.java @@ -31,8 +31,10 @@ import com.apple.foundationdb.tuple.Tuple; import com.apple.foundationdb.tuple.Versionstamp; public class VersionstampSmokeTest { + public static final int API_VERSION = 720; + public static void main(String[] args) { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database db = fdb.open()) { db.run(tr -> { tr.clear(Tuple.from("prefix").range()); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/WatchTest.java b/bindings/java/src/test/com/apple/foundationdb/test/WatchTest.java index 29e05db04e..ef8db81fd4 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/WatchTest.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/WatchTest.java @@ -32,9 +32,10 @@ import com.apple.foundationdb.FDBException; import com.apple.foundationdb.Transaction; public class WatchTest { + public static final int API_VERSION = 720; public static void main(String[] args) { - FDB fdb = FDB.selectAPIVersion(720); + FDB fdb = FDB.selectAPIVersion(API_VERSION); try(Database database = fdb.open(args[0])) { database.options().setLocationCacheSize(42); try(Transaction tr = database.createTransaction()) { diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt index 0f871d8c87..af281a7405 100644 --- a/bindings/python/CMakeLists.txt +++ b/bindings/python/CMakeLists.txt @@ -68,45 +68,10 @@ endif() set(setup_file_name foundationdb-${FDB_VERSION}.tar.gz) set(package_file ${CMAKE_BINARY_DIR}/packages/foundationdb-${FDB_VERSION}${not_fdb_release_string}.tar.gz) add_custom_command(OUTPUT ${package_file} - COMMAND $ setup.py sdist --formats=gztar && + COMMAND $ setup.py sdist --formats=gztar && ${CMAKE_COMMAND} -E copy dist/${setup_file_name} ${package_file} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Create Python sdist package") add_custom_target(python_package DEPENDS ${package_file}) 
add_dependencies(python_package python_binding) add_dependencies(packages python_package) - -if (NOT WIN32 AND NOT OPEN_FOR_IDE) - add_fdbclient_test( - NAME single_process_fdbcli_tests - COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py - ${CMAKE_BINARY_DIR} - @CLUSTER_FILE@ - ) - add_fdbclient_test( - NAME multi_process_fdbcli_tests - PROCESS_NUMBER 5 - COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py - ${CMAKE_BINARY_DIR} - @CLUSTER_FILE@ - 5 - ) - if (TARGET external_client) # external_client copies fdb_c to bindings/c/libfdb_c_external.so - add_fdbclient_test( - NAME single_process_external_client_fdbcli_tests - COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py - ${CMAKE_BINARY_DIR} - @CLUSTER_FILE@ - --external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so - ) - add_fdbclient_test( - NAME multi_process_external_client_fdbcli_tests - PROCESS_NUMBER 5 - COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py - ${CMAKE_BINARY_DIR} - @CLUSTER_FILE@ - 5 - --external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so - ) - endif() -endif() diff --git a/bindings/python/fdb/__init__.py b/bindings/python/fdb/__init__.py index e7d1a8bc30..930ad35396 100644 --- a/bindings/python/fdb/__init__.py +++ b/bindings/python/fdb/__init__.py @@ -100,10 +100,8 @@ def api_version(ver): _add_symbols(fdb.impl, list) - if ver >= 710: + if ver >= 630: import fdb.tenant_management - if ver < 720: - fdb.tenant_management._tenant_map_prefix = b'\xff\xff/management/tenant_map/' if ver < 610: globals()["init"] = getattr(fdb.impl, "init") diff --git a/bindings/python/fdb/impl.py b/bindings/python/fdb/impl.py index 51d67e5162..aa967ba25d 100644 --- a/bindings/python/fdb/impl.py +++ b/bindings/python/fdb/impl.py @@ -1359,7 +1359,7 @@ else: except: # The system python on OS X can't find the library installed to /usr/local/lib if SIP is enabled # find_library does find the location in /usr/local/lib, so if the above fails fallback to using it - lib_path = ctypes.util.find_library(capi_name) + lib_path = ctypes.util.find_library("fdb_c") if lib_path is not None: try: _capi = ctypes.CDLL(lib_path) diff --git a/bindings/python/fdb/tenant_management.py b/bindings/python/fdb/tenant_management.py index 84c3a46d03..ebe36594a5 100644 --- a/bindings/python/fdb/tenant_management.py +++ b/bindings/python/fdb/tenant_management.py @@ -103,7 +103,7 @@ class FDBTenantList(object): # JSON strings of the tenant metadata @_impl.transactional def _list_tenants_impl(tr, begin, end, limit): - tr.options.set_read_system_keys() + tr.options.set_raw_access() begin_key = b'%s%s' % (_tenant_map_prefix, begin) end_key = b'%s%s' % (_tenant_map_prefix, end) diff --git a/bindings/python/tests/size_limit_tests.py b/bindings/python/tests/size_limit_tests.py index cd27f985b0..b94d7ea8e4 100644 --- a/bindings/python/tests/size_limit_tests.py +++ b/bindings/python/tests/size_limit_tests.py @@ -66,6 +66,9 @@ def test_size_limit_option(db): except fdb.FDBError as e: assert(e.code == 2101) # Transaction exceeds byte limit (2101) + # Reset the size limit for future tests + db.options.set_transaction_size_limit(10000000) + @fdb.transactional def test_get_approximate_size(tr): tr[b'key1'] = b'value1' diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index c0440df5ef..786126359b 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -125,7 +125,7 @@ function(add_fdb_test) list(TRANSFORM ADD_FDB_TEST_TEST_FILES PREPEND 
"${CMAKE_CURRENT_SOURCE_DIR}/") if (ENABLE_SIMULATION_TESTS) add_test(NAME ${test_name} - COMMAND $ ${TestRunner} + COMMAND $ ${TestRunner} -n ${test_name} -b ${PROJECT_BINARY_DIR} -t ${test_type} @@ -142,7 +142,7 @@ function(add_fdb_test) ${VALGRIND_OPTION} ${ADD_FDB_TEST_TEST_FILES} WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - set_tests_properties("${test_name}" PROPERTIES ENVIRONMENT UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1) + set_tests_properties("${test_name}" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") get_filename_component(test_dir_full ${first_file} DIRECTORY) if(NOT ${test_dir_full} STREQUAL "") get_filename_component(test_dir ${test_dir_full} NAME) @@ -172,8 +172,7 @@ function(stage_correctness_package) file(MAKE_DIRECTORY ${STAGE_OUT_DIR}/bin) string(LENGTH "${CMAKE_SOURCE_DIR}/tests/" base_length) foreach(test IN LISTS TEST_NAMES) - if(("${TEST_TYPE_${test}}" STREQUAL "simulation") AND - (${test} MATCHES ${TEST_PACKAGE_INCLUDE}) AND + if((${test} MATCHES ${TEST_PACKAGE_INCLUDE}) AND (NOT ${test} MATCHES ${TEST_PACKAGE_EXCLUDE})) foreach(file IN LISTS TEST_FILES_${test}) string(SUBSTRING ${file} ${base_length} -1 rel_out_file) @@ -199,16 +198,17 @@ function(stage_correctness_package) set(src_dir "${src_dir}/") string(SUBSTRING ${src_dir} ${dir_len} -1 dest_dir) string(SUBSTRING ${file} ${dir_len} -1 rel_out_file) - set(out_file ${STAGE_OUT_DIR}/${rel_out_file}) + set(out_file ${STAGE_OUT_DIR}/${rel_out_file}) list(APPEND external_files ${out_file}) - add_custom_command( + add_custom_command( OUTPUT ${out_file} - DEPENDS ${file} - COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file} - COMMENT "Copying ${STAGE_CONTEXT} external file ${file}" - ) + DEPENDS ${file} + COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file} + COMMENT "Copying ${STAGE_CONTEXT} external file ${file}" + ) endforeach() endforeach() + list(APPEND package_files ${STAGE_OUT_DIR}/bin/fdbserver ${STAGE_OUT_DIR}/bin/coverage.fdbserver.xml ${STAGE_OUT_DIR}/bin/coverage.fdbclient.xml @@ -218,6 +218,7 @@ function(stage_correctness_package) ${STAGE_OUT_DIR}/bin/TraceLogHelper.dll ${STAGE_OUT_DIR}/CMakeCache.txt ) + add_custom_command( OUTPUT ${package_files} DEPENDS ${CMAKE_BINARY_DIR}/CMakeCache.txt @@ -239,6 +240,20 @@ function(stage_correctness_package) ${STAGE_OUT_DIR}/bin COMMENT "Copying files for ${STAGE_CONTEXT} package" ) + + set(test_harness_dir "${CMAKE_SOURCE_DIR}/contrib/TestHarness2") + file(GLOB_RECURSE test_harness2_files RELATIVE "${test_harness_dir}" CONFIGURE_DEPENDS "${test_harness_dir}/*.py") + foreach(file IN LISTS test_harness2_files) + set(src_file "${test_harness_dir}/${file}") + set(out_file "${STAGE_OUT_DIR}/${file}") + get_filename_component(dir "${out_file}" DIRECTORY) + file(MAKE_DIRECTORY "${dir}") + add_custom_command(OUTPUT ${out_file} + COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${out_file}" + DEPENDS "${src_file}") + list(APPEND package_files "${out_file}") + endforeach() + list(APPEND package_files ${test_files} ${external_files}) if(STAGE_OUT_FILES) set(${STAGE_OUT_FILES} ${package_files} PARENT_SCOPE) @@ -404,7 +419,7 @@ endfunction() # Creates a single cluster before running the specified command (usually a ctest test) function(add_fdbclient_test) - set(options DISABLED ENABLED DISABLE_LOG_DUMP API_TEST_BLOB_GRANULES_ENABLED TLS_ENABLED) + set(options DISABLED ENABLED DISABLE_TENANTS DISABLE_LOG_DUMP API_TEST_BLOB_GRANULES_ENABLED TLS_ENABLED) set(oneValueArgs NAME PROCESS_NUMBER TEST_TIMEOUT WORKING_DIRECTORY) set(multiValueArgs COMMAND) cmake_parse_arguments(T 
"${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}") @@ -431,6 +446,9 @@ function(add_fdbclient_test) if(T_DISABLE_LOG_DUMP) list(APPEND TMP_CLUSTER_CMD --disable-log-dump) endif() + if(T_DISABLE_TENANTS) + list(APPEND TMP_CLUSTER_CMD --disable-tenants) + endif() if(T_API_TEST_BLOB_GRANULES_ENABLED) list(APPEND TMP_CLUSTER_CMD --blob-granules-enabled) endif() @@ -440,16 +458,20 @@ function(add_fdbclient_test) message(STATUS "Adding Client test ${T_NAME}") add_test(NAME "${T_NAME}" WORKING_DIRECTORY ${T_WORKING_DIRECTORY} - COMMAND ${Python_EXECUTABLE} ${TMP_CLUSTER_CMD} + COMMAND ${Python3_EXECUTABLE} ${TMP_CLUSTER_CMD} -- ${T_COMMAND}) if (T_TEST_TIMEOUT) set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT ${T_TEST_TIMEOUT}) else() # default timeout - set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 300) + if(USE_SANITIZER) + set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 1200) + else() + set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 300) + endif() endif() - set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1) + set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") endfunction() # Creates a cluster file for a nonexistent cluster before running the specified command @@ -473,7 +495,7 @@ function(add_unavailable_fdbclient_test) endif() message(STATUS "Adding unavailable client test ${T_NAME}") add_test(NAME "${T_NAME}" - COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/TestRunner/fake_cluster.py + COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/TestRunner/fake_cluster.py --output-dir ${CMAKE_BINARY_DIR} -- ${T_COMMAND}) @@ -483,7 +505,7 @@ function(add_unavailable_fdbclient_test) # default timeout set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 60) endif() - set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1) + set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") endfunction() # Creates 3 distinct clusters before running the specified command. 
@@ -508,7 +530,7 @@ function(add_multi_fdbclient_test) endif() message(STATUS "Adding Client test ${T_NAME}") add_test(NAME "${T_NAME}" - COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_multi_cluster.py + COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_multi_cluster.py --build-dir ${CMAKE_BINARY_DIR} --clusters 3 -- diff --git a/cmake/CompileBoost.cmake b/cmake/CompileBoost.cmake index 9952746972..47d3fb2996 100644 --- a/cmake/CompileBoost.cmake +++ b/cmake/CompileBoost.cmake @@ -36,9 +36,12 @@ function(compile_boost) set(B2_COMMAND "./b2") set(BOOST_COMPILER_FLAGS -fvisibility=hidden -fPIC -std=c++17 -w) set(BOOST_LINK_FLAGS "") - if(APPLE OR CLANG OR ICX OR USE_LIBCXX) + if(APPLE OR ICX OR USE_LIBCXX) list(APPEND BOOST_COMPILER_FLAGS -stdlib=libc++ -nostdlib++) - list(APPEND BOOST_LINK_FLAGS -static-libgcc -lc++ -lc++abi) + list(APPEND BOOST_LINK_FLAGS -lc++ -lc++abi) + if (NOT APPLE) + list(APPEND BOOST_LINK_FLAGS -static-libgcc) + endif() endif() # Update the user-config.jam @@ -46,27 +49,35 @@ function(compile_boost) foreach(flag IN LISTS BOOST_COMPILER_FLAGS COMPILE_BOOST_CXXFLAGS) string(APPEND BOOST_ADDITIONAL_COMPILE_OPTIONS "${flag} ") endforeach() - #foreach(flag IN LISTS BOOST_LINK_FLAGS COMPILE_BOOST_LDFLAGS) - # string(APPEND BOOST_ADDITIONAL_COMPILE_OPTIONS "${flag} ") - #endforeach() + foreach(flag IN LISTS BOOST_LINK_FLAGS COMPILE_BOOST_LDFLAGS) + string(APPEND BOOST_ADDITIONAL_COMPILE_OPTIONS "${flag} ") + endforeach() configure_file(${CMAKE_SOURCE_DIR}/cmake/user-config.jam.cmake ${CMAKE_BINARY_DIR}/user-config.jam) set(USER_CONFIG_FLAG --user-config=${CMAKE_BINARY_DIR}/user-config.jam) # Build boost include(ExternalProject) + set(BOOST_INSTALL_DIR "${CMAKE_BINARY_DIR}/boost_install") ExternalProject_add("${COMPILE_BOOST_TARGET}Project" - URL "https://boostorg.jfrog.io/artifactory/main/release/1.78.0/source/boost_1_78_0.tar.bz2" - URL_HASH SHA256=8681f175d4bdb26c52222665793eef08490d7758529330f98d3b29dd0735bccc - CONFIGURE_COMMAND ${BOOTSTRAP_COMMAND} ${BOOTSTRAP_ARGS} --with-libraries=${BOOTSTRAP_LIBRARIES} --with-toolset=${BOOST_TOOLSET} - BUILD_COMMAND ${B2_COMMAND} link=static ${COMPILE_BOOST_BUILD_ARGS} --prefix=${BOOST_INSTALL_DIR} ${USER_CONFIG_FLAG} install - BUILD_IN_SOURCE ON - INSTALL_COMMAND "" - UPDATE_COMMAND "" - BUILD_BYPRODUCTS "${BOOST_INSTALL_DIR}/boost/config.hpp" - "${BOOST_INSTALL_DIR}/lib/libboost_context.a" - "${BOOST_INSTALL_DIR}/lib/libboost_filesystem.a" - "${BOOST_INSTALL_DIR}/lib/libboost_iostreams.a") + URL "https://boostorg.jfrog.io/artifactory/main/release/1.78.0/source/boost_1_78_0.tar.bz2" + URL_HASH SHA256=8681f175d4bdb26c52222665793eef08490d7758529330f98d3b29dd0735bccc + CONFIGURE_COMMAND ${BOOTSTRAP_COMMAND} + ${BOOTSTRAP_ARGS} + --with-libraries=${BOOTSTRAP_LIBRARIES} + --with-toolset=${BOOST_TOOLSET} + BUILD_COMMAND ${B2_COMMAND} + link=static + ${COMPILE_BOOST_BUILD_ARGS} + --prefix=${BOOST_INSTALL_DIR} + ${USER_CONFIG_FLAG} install + BUILD_IN_SOURCE ON + INSTALL_COMMAND "" + UPDATE_COMMAND "" + BUILD_BYPRODUCTS "${BOOST_INSTALL_DIR}/boost/config.hpp" + "${BOOST_INSTALL_DIR}/lib/libboost_context.a" + "${BOOST_INSTALL_DIR}/lib/libboost_filesystem.a" + "${BOOST_INSTALL_DIR}/lib/libboost_iostreams.a") add_library(${COMPILE_BOOST_TARGET}_context STATIC IMPORTED) add_dependencies(${COMPILE_BOOST_TARGET}_context ${COMPILE_BOOST_TARGET}Project) @@ -92,10 +103,10 @@ if(USE_SANITIZER) endif() message(STATUS "A sanitizer is enabled, need to build boost from source") if (USE_VALGRIND) - 
compile_boost(TARGET boost_asan BUILD_ARGS valgrind=on + compile_boost(TARGET boost_target BUILD_ARGS valgrind=on CXXFLAGS ${SANITIZER_COMPILE_OPTIONS} LDFLAGS ${SANITIZER_LINK_OPTIONS}) else() - compile_boost(TARGET boost_asan BUILD_ARGS context-impl=ucontext + compile_boost(TARGET boost_target BUILD_ARGS context-impl=ucontext CXXFLAGS ${SANITIZER_COMPILE_OPTIONS} LDFLAGS ${SANITIZER_LINK_OPTIONS}) endif() return() @@ -130,7 +141,7 @@ if(WIN32) return() endif() -find_package(Boost 1.78.0 EXACT QUIET COMPONENTS context filesystem CONFIG PATHS ${BOOST_HINT_PATHS}) +find_package(Boost 1.78.0 EXACT QUIET COMPONENTS context filesystem iostreams CONFIG PATHS ${BOOST_HINT_PATHS}) set(FORCE_BOOST_BUILD OFF CACHE BOOL "Forces cmake to build boost and ignores any installed boost") if(Boost_FOUND AND Boost_filesystem_FOUND AND Boost_context_FOUND AND Boost_iostreams_FOUND AND NOT FORCE_BOOST_BUILD) diff --git a/cmake/CompileRocksDB.cmake b/cmake/CompileRocksDB.cmake index 4634e57e7c..3fdea389ab 100644 --- a/cmake/CompileRocksDB.cmake +++ b/cmake/CompileRocksDB.cmake @@ -4,30 +4,42 @@ find_package(RocksDB 6.27.3) include(ExternalProject) -if (RocksDB_FOUND) +set(RocksDB_CMAKE_ARGS + -DUSE_RTTI=1 + -DPORTABLE=${PORTABLE_ROCKSDB} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_SHARED_LINKER_FLAGS=${CMAKE_SHARED_LINKER_FLAGS} + -DCMAKE_STATIC_LINKER_FLAGS=${CMAKE_STATIC_LINKER_FLAGS} + -DCMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DFAIL_ON_WARNINGS=OFF + -DWITH_GFLAGS=OFF + -DWITH_TESTS=OFF + -DWITH_TOOLS=OFF + -DWITH_CORE_TOOLS=OFF + -DWITH_BENCHMARK_TOOLS=OFF + -DWITH_BZ2=OFF + -DWITH_LZ4=ON + -DWITH_SNAPPY=OFF + -DWITH_ZLIB=OFF + -DWITH_ZSTD=OFF + -DWITH_LIBURING=${WITH_LIBURING} + -DWITH_TSAN=${USE_TSAN} + -DWITH_ASAN=${USE_ASAN} + -DWITH_UBSAN=${USE_UBSAN} + -DROCKSDB_BUILD_SHARED=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=True +) + +if(ROCKSDB_FOUND) ExternalProject_Add(rocksdb SOURCE_DIR "${RocksDB_ROOT}" DOWNLOAD_COMMAND "" - CMAKE_ARGS -DUSE_RTTI=1 -DPORTABLE=${PORTABLE_ROCKSDB} - -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DWITH_GFLAGS=OFF - -DWITH_TESTS=OFF - -DWITH_TOOLS=OFF - -DWITH_CORE_TOOLS=OFF - -DWITH_BENCHMARK_TOOLS=OFF - -DWITH_BZ2=OFF - -DWITH_LZ4=ON - -DWITH_SNAPPY=OFF - -DWITH_ZLIB=OFF - -DWITH_ZSTD=OFF - -DWITH_LIBURING=${WITH_LIBURING} - -DWITH_TSAN=${USE_TSAN} - -DWITH_ASAN=${USE_ASAN} - -DWITH_UBSAN=${USE_UBSAN} - -DROCKSDB_BUILD_SHARED=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=True + CMAKE_ARGS ${RocksDB_CMAKE_ARGS} BUILD_BYPRODUCTS /librocksdb.a INSTALL_COMMAND "" ) @@ -37,28 +49,9 @@ if (RocksDB_FOUND) ${BINARY_DIR}/librocksdb.a) else() ExternalProject_Add(rocksdb - URL https://github.com/facebook/rocksdb/archive/refs/tags/v6.27.3.tar.gz - URL_HASH SHA256=ee29901749b9132692b26f0a6c1d693f47d1a9ed8e3771e60556afe80282bf58 - CMAKE_ARGS -DUSE_RTTI=1 -DPORTABLE=${PORTABLE_ROCKSDB} - -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DWITH_GFLAGS=OFF - -DWITH_TESTS=OFF - -DWITH_TOOLS=OFF - -DWITH_CORE_TOOLS=OFF - -DWITH_BENCHMARK_TOOLS=OFF - -DWITH_BZ2=OFF - -DWITH_LZ4=ON - -DWITH_SNAPPY=OFF - -DWITH_ZLIB=OFF - -DWITH_ZSTD=OFF - -DWITH_LIBURING=${WITH_LIBURING} - -DWITH_TSAN=${USE_TSAN} - 
-DWITH_ASAN=${USE_ASAN} - -DWITH_UBSAN=${USE_UBSAN} - -DROCKSDB_BUILD_SHARED=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=True + URL https://github.com/facebook/rocksdb/archive/refs/tags/v6.27.3.tar.gz + URL_HASH SHA256=ee29901749b9132692b26f0a6c1d693f47d1a9ed8e3771e60556afe80282bf58 + CMAKE_ARGS ${RocksDB_CMAKE_ARGS} BUILD_BYPRODUCTS /librocksdb.a INSTALL_COMMAND "" ) @@ -68,7 +61,7 @@ else() ${BINARY_DIR}/librocksdb.a) ExternalProject_Get_Property(rocksdb SOURCE_DIR) - set (ROCKSDB_INCLUDE_DIR "${SOURCE_DIR}/include") + set(ROCKSDB_INCLUDE_DIR "${SOURCE_DIR}/include") set(ROCKSDB_FOUND TRUE) endif() diff --git a/cmake/CompileZstd.cmake b/cmake/CompileZstd.cmake new file mode 100644 index 0000000000..968d09398f --- /dev/null +++ b/cmake/CompileZstd.cmake @@ -0,0 +1,23 @@ +# Compile zstd + +function(compile_zstd) + + include(FetchContent) + + set(ZSTD_SOURCE_DIR ${CMAKE_BINARY_DIR}/zstd) + + FetchContent_Declare( + ZSTD + GIT_REPOSITORY https://github.com/facebook/zstd.git + GIT_TAG v1.5.2 + SOURCE_DIR ${ZSTD_SOURCE_DIR} + BINARY_DIR ${ZSTD_SOURCE_DIR} + SOURCE_SUBDIR "build/cmake" + ) + + FetchContent_MakeAvailable(ZSTD) + + add_library(ZSTD::ZSTD STATIC IMPORTED) + set_target_properties(ZSTD::ZSTD PROPERTIES IMPORTED_LOCATION "${CMAKE_BINARY_DIR}/lib/libzstd.a") + target_include_directories(ZSTD::ZSTD PUBLIC ${ZSTD_INCLUDE_DIRS}) +endfunction(compile_zstd) diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index 2deaa4d0bc..e38f333b58 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -25,6 +25,7 @@ env_set(STATIC_LINK_LIBCXX "${_static_link_libcxx}" BOOL "Statically link libstd env_set(TRACE_PC_GUARD_INSTRUMENTATION_LIB "" STRING "Path to a library containing an implementation for __sanitizer_cov_trace_pc_guard. See https://clang.llvm.org/docs/SanitizerCoverage.html for more info.") env_set(PROFILE_INSTR_GENERATE OFF BOOL "If set, build FDB as an instrumentation build to generate profiles") env_set(PROFILE_INSTR_USE "" STRING "If set, build FDB with profile") +env_set(FULL_DEBUG_SYMBOLS OFF BOOL "Generate full debug symbols") set(USE_SANITIZER OFF) if(USE_ASAN OR USE_VALGRIND OR USE_MSAN OR USE_TSAN OR USE_UBSAN) @@ -68,6 +69,8 @@ if(WIN32) add_definitions(-DBOOST_USE_WINDOWS_H) add_definitions(-DWIN32_LEAN_AND_MEAN) add_definitions(-D_ITERATOR_DEBUG_LEVEL=0) + add_definitions(-DNOGDI) # WinGDI.h defines macro ERROR + add_definitions(-D_USE_MATH_DEFINES) # Math constants endif() if (USE_CCACHE) @@ -162,9 +165,20 @@ else() set(SANITIZER_COMPILE_OPTIONS) set(SANITIZER_LINK_OPTIONS) - # we always compile with debug symbols. CPack will strip them out + # we always compile with debug symbols. For release builds CPack will strip them out # and create a debuginfo rpm - add_compile_options(-ggdb -fno-omit-frame-pointer) + add_compile_options(-fno-omit-frame-pointer -gz) + add_link_options(-gz) + if(FDB_RELEASE OR FULL_DEBUG_SYMBOLS OR CMAKE_BUILD_TYPE STREQUAL "Debug") + # Configure with FULL_DEBUG_SYMBOLS=ON to generate all symbols for debugging with gdb + # Also generating full debug symbols in release builds, because they are packaged + # separately and installed optionally + add_compile_options(-ggdb) + else() + # Generating minimal debug symbols by default. 
They are sufficient for testing purposes + add_compile_options(-ggdb1) + endif() + if(TRACE_PC_GUARD_INSTRUMENTATION_LIB) add_compile_options(-fsanitize-coverage=trace-pc-guard) link_libraries(${TRACE_PC_GUARD_INSTRUMENTATION_LIB}) @@ -190,6 +204,7 @@ else() endif() if(USE_GCOV) + add_compile_options(--coverage) add_link_options(--coverage) endif() @@ -198,6 +213,8 @@ else() -fsanitize=undefined # TODO(atn34) Re-enable -fsanitize=alignment once https://github.com/apple/foundationdb/issues/1434 is resolved -fno-sanitize=alignment + # https://github.com/apple/foundationdb/issues/7955 + -fno-sanitize=function -DBOOST_USE_UCONTEXT) list(APPEND SANITIZER_LINK_OPTIONS -fsanitize=undefined) endif() @@ -275,16 +292,35 @@ else() #add_compile_options(-fno-builtin-memcpy) if (CLANG OR ICX) - add_compile_options() if (APPLE OR USE_LIBCXX) - add_compile_options($<$:-stdlib=libc++>) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") if (NOT APPLE) if (STATIC_LINK_LIBCXX) - add_link_options(-static-libgcc -nostdlib++ -Wl,-Bstatic -lc++ -lc++abi -Wl,-Bdynamic) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -nostdlib++ -Wl,-Bstatic -lc++ -lc++abi -Wl,-Bdynamic") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libgcc -nostdlib++ -Wl,-Bstatic -lc++ -lc++abi -Wl,-Bdynamic") + else() + # Make sure that libc++ can be found be the platform's loader, so that thing's like cmake's "try_run" work. + find_library(LIBCXX_SO_PATH c++ /usr/local/lib) + if (LIBCXX_SO_PATH) + get_filename_component(LIBCXX_SO_DIR ${LIBCXX_SO_PATH} DIRECTORY) + if (APPLE) + set(ENV{DYLD_LIBRARY_PATH} "$ENV{DYLD_LIBRARY_PATH}:${LIBCXX_SO_DIR}") + elseif(WIN32) + set(ENV{PATH} "$ENV{PATH};${LIBCXX_SO_DIR}") + else() + set(ENV{LD_LIBRARY_PATH} "$ENV{LD_LIBRARY_PATH}:${LIBCXX_SO_DIR}") + endif() + endif() endif() - add_link_options(-stdlib=libc++ -Wl,-build-id=sha1) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -Wl,-build-id=sha1") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -stdlib=libc++ -Wl,-build-id=sha1") endif() endif() + if (NOT APPLE AND NOT USE_LIBCXX) + message(STATUS "Linking libatomic") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -latomic") + endif() if (OPEN_FOR_IDE) add_compile_options( -Wno-unknown-attributes) @@ -302,11 +338,19 @@ else() -Wno-unknown-warning-option -Wno-unused-parameter -Wno-constant-logical-operand + # These need to be disabled for FDB's RocksDB storage server implementation + -Wno-deprecated-copy + -Wno-delete-non-abstract-non-virtual-dtor + -Wno-range-loop-construct + -Wno-reorder-ctor + # Needed for clang 13 (todo: Update above logic so that it figures out when to pass in -static-libstdc++ and when it will be ignored) + # When you remove this, you might need to move it back to the USE_CCACHE stanza. It was (only) there before I moved it here. 
+ -Wno-unused-command-line-argument ) if (USE_CCACHE) add_compile_options( -Wno-register - -Wno-unused-command-line-argument) + ) endif() if (PROFILE_INSTR_GENERATE) add_compile_options(-fprofile-instr-generate) diff --git a/cmake/FDBComponents.cmake b/cmake/FDBComponents.cmake index 13e3f790a8..292d58c454 100644 --- a/cmake/FDBComponents.cmake +++ b/cmake/FDBComponents.cmake @@ -56,8 +56,8 @@ endif() # Python Bindings ################################################################################ -find_package(Python COMPONENTS Interpreter) -if(Python_Interpreter_FOUND) +find_package(Python3 COMPONENTS Interpreter) +if(Python3_Interpreter_FOUND) set(WITH_PYTHON ON) else() message(WARNING "Could not found a suitable python interpreter") @@ -178,7 +178,7 @@ set(PORTABLE_ROCKSDB ON CACHE BOOL "Compile RocksDB in portable mode") # Set thi set(WITH_LIBURING OFF CACHE BOOL "Build with liburing enabled") # Set this to ON to include liburing # RocksDB is currently enabled by default for GCC but does not build with the latest # Clang. -if (SSD_ROCKSDB_EXPERIMENTAL AND GCC) +if (SSD_ROCKSDB_EXPERIMENTAL AND NOT WIN32) set(WITH_ROCKSDB_EXPERIMENTAL ON) else() set(WITH_ROCKSDB_EXPERIMENTAL OFF) @@ -200,6 +200,9 @@ else() URL "https://github.com/ToruNiina/toml11/archive/v3.4.0.tar.gz" URL_HASH SHA256=bc6d733efd9216af8c119d8ac64a805578c79cc82b813e4d1d880ca128bd154d CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DCMAKE_C_COMPILER:FILEPATH=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER:FILEPATH=${CMAKE_CXX_COMPILER} -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_CURRENT_BINARY_DIR}/toml11 -Dtoml11_BUILD_TEST:BOOL=OFF BUILD_ALWAYS ON) @@ -229,7 +232,12 @@ set(COROUTINE_IMPL ${DEFAULT_COROUTINE_IMPL} CACHE STRING "Which coroutine imple set(BUILD_AWS_BACKUP OFF CACHE BOOL "Build AWS S3 SDK backup client") if (BUILD_AWS_BACKUP) - set(WITH_AWS_BACKUP ON) + if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(WITH_AWS_BACKUP ON) + else() + message(WARNING "BUILD_AWS_BACKUP set but ignored ${CMAKE_SYSTEM_PROCESSOR} is not supported yet") + set(WITH_AWS_BACKUP OFF) + endif() else() set(WITH_AWS_BACKUP OFF) endif() diff --git a/cmake/FlowCommands.cmake b/cmake/FlowCommands.cmake index 4d7fa4f2c2..81ea36892d 100644 --- a/cmake/FlowCommands.cmake +++ b/cmake/FlowCommands.cmake @@ -9,6 +9,14 @@ define_property(TARGET PROPERTY COVERAGE_FILTERS expression in this list will be ignored when the coverage.target.xml file is \ generated. 
This property is set through the add_flow_target function.") +if(WIN32) + set(compilation_unit_macro_default OFF) +else() + set(compilation_unit_macro_default ON) +endif() + +set(PASS_COMPILATION_UNIT "${compilation_unit_macro_default}" CACHE BOOL + "Pass path to compilation unit as macro to each compilation unit (useful for code probes)") function(generate_coverage_xml) if(NOT (${ARGC} EQUAL "1")) @@ -259,6 +267,11 @@ function(add_flow_target) endif() endif() endforeach() + if(PASS_COMPILATION_UNIT) + foreach(s IN LISTS sources) + set_source_files_properties("${s}" PROPERTIES COMPILE_DEFINITIONS "COMPILATION_UNIT=${s}") + endforeach() + endif() if(AFT_EXECUTABLE) set(strip_target ON) set(target_type exec) diff --git a/cmake/Jemalloc.cmake b/cmake/Jemalloc.cmake index bfdd2f5898..8d04ebccca 100644 --- a/cmake/Jemalloc.cmake +++ b/cmake/Jemalloc.cmake @@ -14,7 +14,7 @@ ExternalProject_add(Jemalloc_project BUILD_BYPRODUCTS "${JEMALLOC_DIR}/include/jemalloc/jemalloc.h" "${JEMALLOC_DIR}/lib/libjemalloc.a" "${JEMALLOC_DIR}/lib/libjemalloc_pic.a" - CONFIGURE_COMMAND ./configure --prefix=${JEMALLOC_DIR} --enable-static --disable-cxx --enable-prof + CONFIGURE_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ./configure --prefix=${JEMALLOC_DIR} --enable-static --disable-cxx --enable-prof BUILD_IN_SOURCE ON BUILD_COMMAND make INSTALL_DIR "${JEMALLOC_DIR}" @@ -24,4 +24,4 @@ add_dependencies(im_jemalloc_pic Jemalloc_project) set_target_properties(im_jemalloc_pic PROPERTIES IMPORTED_LOCATION "${JEMALLOC_DIR}/lib/libjemalloc_pic.a") set_target_properties(im_jemalloc PROPERTIES IMPORTED_LOCATION "${JEMALLOC_DIR}/lib/libjemalloc.a") target_include_directories(jemalloc INTERFACE "${JEMALLOC_DIR}/include") -target_link_libraries(jemalloc INTERFACE im_jemalloc_pic im_jemalloc) \ No newline at end of file +target_link_libraries(jemalloc INTERFACE im_jemalloc_pic im_jemalloc) diff --git a/cmake/awssdk.cmake b/cmake/awssdk.cmake index 88cb7c78e9..ab62f9b6d6 100644 --- a/cmake/awssdk.cmake +++ b/cmake/awssdk.cmake @@ -2,16 +2,14 @@ project(awssdk-download NONE) # Compile the sdk with clang and libc++, since otherwise we get libc++ vs libstdc++ link errors when compiling fdb with clang set(AWSSDK_COMPILER_FLAGS "") -set(AWSSDK_LINK_FLAGS "") -if(APPLE OR CLANG OR USE_LIBCXX) - set(AWSSDK_COMPILER_FLAGS -stdlib=libc++ -nostdlib++) - set(AWSSDK_LINK_FLAGS -stdlib=libc++ -lc++abi) +if(APPLE OR USE_LIBCXX) + set(AWSSDK_COMPILER_FLAGS "-stdlib=libc++ -nostdlib++") endif() include(ExternalProject) ExternalProject_Add(awssdk_project GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git - GIT_TAG 2af3ce543c322cb259471b3b090829464f825972 # v1.9.200 + GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331 SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build" GIT_CONFIG advice.detachedHead=false @@ -21,11 +19,11 @@ ExternalProject_Add(awssdk_project -DSIMPLE_INSTALL=ON -DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path -DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own - + -DBUILD_CURL=ON + -DBUILD_ZLIB=ON -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_EXE_LINKER_FLAGS=${AWSSDK_COMPILER_FLAGS} - -DCMAKE_CXX_FLAGS=${AWSSDK_LINK_FLAGS} + -DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS} TEST_COMMAND "" # the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must 
be linked in BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a" @@ -35,11 +33,14 @@ ExternalProject_Add(awssdk_project "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a" ) add_library(awssdk_core STATIC IMPORTED) @@ -75,6 +76,10 @@ add_library(awssdk_c_io STATIC IMPORTED) add_dependencies(awssdk_c_io awssdk_project) set_target_properties(awssdk_c_io PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a") +add_library(awssdk_c_sdkutils STATIC IMPORTED) +add_dependencies(awssdk_c_sdkutils awssdk_project) +set_target_properties(awssdk_c_sdkutils PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a") + add_library(awssdk_checksums STATIC IMPORTED) add_dependencies(awssdk_checksums awssdk_project) set_target_properties(awssdk_checksums PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a") @@ -91,7 +96,15 @@ add_library(awssdk_c_common STATIC IMPORTED) add_dependencies(awssdk_c_common awssdk_project) set_target_properties(awssdk_c_common PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a") +add_library(curl STATIC IMPORTED) +add_dependencies(curl awssdk_project) +set_property(TARGET curl PROPERTY IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a") + +add_library(zlib STATIC IMPORTED) +add_dependencies(zlib awssdk_project) +set_property(TARGET zlib PROPERTY IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a") + # link them all together in one interface target add_library(awssdk_target INTERFACE) target_include_directories(awssdk_target SYSTEM INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/include) -target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl) \ No newline at end of file +target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_sdkutils awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl zlib) diff --git a/cmake/user-config.jam.cmake b/cmake/user-config.jam.cmake index 6d2883cc95..696bcdd831 100644 --- a/cmake/user-config.jam.cmake +++ b/cmake/user-config.jam.cmake @@ -1 +1,2 @@ using @BOOST_TOOLSET@ : : @BOOST_CXX_COMPILER@ : @BOOST_ADDITIONAL_COMPILE_OPTIONS@ ; +using zstd : 1.5.2 : /@CMAKE_BINARY_DIR@/zstd/lib /@CMAKE_BINARY_DIR@/lib ; diff 
--git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index cce49cf76c..75ca06243f 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -8,6 +8,8 @@ add_subdirectory(rapidxml) add_subdirectory(sqlite) add_subdirectory(SimpleOpt) add_subdirectory(fmt-8.1.1) +add_subdirectory(md5) +add_subdirectory(libb64) if(NOT WIN32) add_subdirectory(linenoise) add_subdirectory(debug_determinism) diff --git a/contrib/Implib.so/arch/common/init.c.tpl b/contrib/Implib.so/arch/common/init.c.tpl index a71f8df44f..d8b9b3bbde 100644 --- a/contrib/Implib.so/arch/common/init.c.tpl +++ b/contrib/Implib.so/arch/common/init.c.tpl @@ -26,8 +26,7 @@ extern "C" { #define CHECK(cond, fmt, ...) do { \ if(!(cond)) { \ fprintf(stderr, "implib-gen: $load_name: " fmt "\n", ##__VA_ARGS__); \ - assert(0 && "Assertion in generated code"); \ - exit(1); \ + abort(); \ } \ } while(0) diff --git a/contrib/Joshua/scripts/bindingTestScript.sh b/contrib/Joshua/scripts/bindingTestScript.sh index f4e0e8eb8b..2d6badbe9d 100755 --- a/contrib/Joshua/scripts/bindingTestScript.sh +++ b/contrib/Joshua/scripts/bindingTestScript.sh @@ -83,6 +83,7 @@ fi # Stop the cluster if stopCluster; then unset FDBSERVERID + trap - EXIT fi exit "${status}" diff --git a/contrib/Joshua/scripts/correctnessTest.sh b/contrib/Joshua/scripts/correctnessTest.sh index a617d81088..bee09acf25 100755 --- a/contrib/Joshua/scripts/correctnessTest.sh +++ b/contrib/Joshua/scripts/correctnessTest.sh @@ -4,4 +4,6 @@ export ASAN_OPTIONS="detect_leaks=0" OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}" -mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false +#mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false + +python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} diff --git a/contrib/Joshua/scripts/correctnessTimeout.sh b/contrib/Joshua/scripts/correctnessTimeout.sh index 7917aae591..6bd0bfeee0 100755 --- a/contrib/Joshua/scripts/correctnessTimeout.sh +++ b/contrib/Joshua/scripts/correctnessTimeout.sh @@ -1,4 +1,4 @@ #!/bin/bash -u -for file in `find . -name 'trace*.xml'` ; do - mono ./bin/TestHarness.exe summarize "${file}" summary.xml "" JoshuaTimeout true -done + + +python3 -m test_harness.timeout diff --git a/contrib/Joshua/scripts/localClusterStart.sh b/contrib/Joshua/scripts/localClusterStart.sh index abbf93abc5..500e106339 100644 --- a/contrib/Joshua/scripts/localClusterStart.sh +++ b/contrib/Joshua/scripts/localClusterStart.sh @@ -210,7 +210,7 @@ function stopCluster then # Ensure that process is dead if ! kill -0 "${FDBSERVERID}" 2> /dev/null; then - log "Killed cluster (${FDBSERVERID}) via cli" + log "Killed cluster (${FDBSERVERID}) via cli" "${DEBUGLEVEL}" elif ! 
kill -9 "${FDBSERVERID}"; then log "Failed to kill FDB Server process (${FDBSERVERID}) via cli or kill command" let status="${status} + 1" diff --git a/contrib/Joshua/scripts/valgrindTest.sh b/contrib/Joshua/scripts/valgrindTest.sh index 5409429691..820750f3b2 100755 --- a/contrib/Joshua/scripts/valgrindTest.sh +++ b/contrib/Joshua/scripts/valgrindTest.sh @@ -1,3 +1,3 @@ #!/bin/sh OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}" -mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" true +python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} --use-valgrind diff --git a/contrib/Joshua/scripts/valgrindTimeout.sh b/contrib/Joshua/scripts/valgrindTimeout.sh index b9d9e7ebad..2224598e43 100755 --- a/contrib/Joshua/scripts/valgrindTimeout.sh +++ b/contrib/Joshua/scripts/valgrindTimeout.sh @@ -1,6 +1,2 @@ #!/bin/bash -u -for file in `find . -name 'trace*.xml'` ; do - for valgrindFile in `find . -name 'valgrind*.xml'` ; do - mono ./bin/TestHarness.exe summarize "${file}" summary.xml "${valgrindFile}" JoshuaTimeout true - done -done +python3 -m test_harness.timeout --use-valgrind diff --git a/contrib/TestHarness/Program.cs b/contrib/TestHarness/Program.cs index 0b2bbc1127..b3e003dee5 100644 --- a/contrib/TestHarness/Program.cs +++ b/contrib/TestHarness/Program.cs @@ -19,6 +19,7 @@ */ using System; +using System.Collections; using System.Collections.Generic; using System.Linq; using System.Text; @@ -302,6 +303,7 @@ namespace SummarizeTest uniqueFileSet.Add(file.Substring(0, file.LastIndexOf("-"))); // all restarting tests end with -1.txt or -2.txt } uniqueFiles = uniqueFileSet.ToArray(); + Array.Sort(uniqueFiles); testFile = random.Choice(uniqueFiles); // The on-disk format changed in 4.0.0, and 5.x can't load files from 3.x. string oldBinaryVersionLowerBound = "4.0.0"; @@ -334,8 +336,9 @@ namespace SummarizeTest // thus, by definition, if "until_" appears, we do not want to run with the current binary version oldBinaries = oldBinaries.Concat(currentBinary); } - List oldBinariesList = oldBinaries.ToList(); - if (oldBinariesList.Count == 0) { + string[] oldBinariesList = oldBinaries.ToArray(); + Array.Sort(oldBinariesList); + if (oldBinariesList.Count() == 0) { // In theory, restarting tests are named to have at least one old binary version to run // But if none of the provided old binaries fall in the range, we just skip the test Console.WriteLine("No available old binary version from {0} to {1}", oldBinaryVersionLowerBound, oldBinaryVersionUpperBound); @@ -347,6 +350,7 @@ namespace SummarizeTest else { uniqueFiles = Directory.GetFiles(testDir); + Array.Sort(uniqueFiles); testFile = random.Choice(uniqueFiles); } } @@ -487,6 +491,16 @@ namespace SummarizeTest useValgrind ? "on" : "off"); } + IDictionary data = Environment.GetEnvironmentVariables(); + foreach (DictionaryEntry i in data) + { + string k=(string)i.Key; + string v=(string)i.Value; + if (k.StartsWith("FDB_KNOB")) { + process.StartInfo.EnvironmentVariables[k]=v; + } + } + process.Start(); // SOMEDAY: Do we want to actually do anything with standard output or error? 
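The hunk above copies every FDB_KNOB* variable from the harness environment into the spawned fdbserver process, so knob overrides set on the Joshua agent reach the simulator. A rough sketch of the same idea in Python, the language the harness is being ported to (the function name and the fdbserver arguments below are illustrative placeholders, not part of this patch):

    import os
    import subprocess

    def forward_knob_overrides(base_env):
        # Copy the parent environment and make sure every FDB_KNOB* override survives.
        env = dict(base_env)
        for key, value in os.environ.items():
            if key.startswith('FDB_KNOB'):
                env[key] = value
        return env

    # Hypothetical usage: spawn fdbserver with the forwarded knob overrides.
    proc = subprocess.Popen(['bin/fdbserver', '-r', 'simulation', '-f', 'test.toml'],
                            env=forward_knob_overrides(os.environ))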
@@ -718,7 +732,7 @@ namespace SummarizeTest process.Refresh(); if (process.HasExited) return; - long mem = process.PrivateMemorySize64; + long mem = process.PagedMemorySize64; MaxMem = Math.Max(MaxMem, mem); //Console.WriteLine(string.Format("Process used {0} bytes", MaxMem)); Thread.Sleep(1000); @@ -744,16 +758,28 @@ namespace SummarizeTest AppendToSummary(summaryFileName, xout); } - // Parses the valgrind XML file and returns a list of "what" tags for each error. + static string ParseValgrindStack(XElement stackElement) { + string backtrace = ""; + foreach (XElement frame in stackElement.Elements()) { + backtrace += " " + frame.Element("ip").Value.ToLower(); + } + if (backtrace.Length > 0) { + backtrace = "addr2line -e fdbserver.debug -p -C -f -i" + backtrace; + } + + return backtrace; + } + + // Parses the valgrind XML file and returns a list of error elements. // All errors for which the "kind" tag starts with "Leak" are ignored - static string[] ParseValgrindOutput(string valgrindOutputFileName, bool traceToStdout) + static XElement[] ParseValgrindOutput(string valgrindOutputFileName, bool traceToStdout) { if (!traceToStdout) { Console.WriteLine("Reading vXML file: " + valgrindOutputFileName); } - ISet whats = new HashSet(); + IList errors = new List(); XElement xdoc = XDocument.Load(valgrindOutputFileName).Element("valgrindoutput"); foreach(var elem in xdoc.Elements()) { if (elem.Name != "error") @@ -761,9 +787,29 @@ namespace SummarizeTest string kind = elem.Element("kind").Value; if(kind.StartsWith("Leak")) continue; - whats.Add(elem.Element("what").Value); + + XElement errorElement = new XElement("ValgrindError", + new XAttribute("Severity", (int)Magnesium.Severity.SevError)); + + int num = 1; + string suffix = ""; + foreach (XElement sub in elem.Elements()) { + if (sub.Name == "what") { + errorElement.SetAttributeValue("What", sub.Value); + } else if (sub.Name == "auxwhat") { + suffix = "Aux" + num++; + errorElement.SetAttributeValue("What" + suffix, sub.Value); + } else if (sub.Name == "stack") { + errorElement.SetAttributeValue("Backtrace" + suffix, ParseValgrindStack(sub)); + } else if (sub.Name == "origin") { + errorElement.SetAttributeValue("WhatOrigin", sub.Element("what").Value); + errorElement.SetAttributeValue("BacktraceOrigin", ParseValgrindStack(sub.Element("stack"))); + } + } + + errors.Add(errorElement); } - return whats.ToArray(); + return errors.ToArray(); } delegate IEnumerable parseDelegate(System.IO.Stream stream, string file, @@ -788,8 +834,15 @@ namespace SummarizeTest string firstRetryableError = ""; int stderrSeverity = (int)Magnesium.Severity.SevError; + xout.Add(new XAttribute("DeterminismCheck", expectedUnseed != -1 ? "1" : "0")); + xout.Add(new XAttribute("OldBinary", Path.GetFileName(oldBinaryName))); + + if (traceFiles.Length == 0) { + xout.Add(new XElement("NoTraceFilesFound")); + } + Dictionary, Magnesium.Severity> severityMap = new Dictionary, Magnesium.Severity>(); - Dictionary, bool> codeCoverage = new Dictionary, bool>(); + var codeCoverage = new Dictionary, bool>(); foreach (var traceFileName in traceFiles) { @@ -824,9 +877,7 @@ namespace SummarizeTest new XAttribute("RandomSeed", ev.Details.RandomSeed), new XAttribute("SourceVersion", ev.Details.SourceVersion), new XAttribute("Time", ev.Details.ActualTime), - new XAttribute("BuggifyEnabled", ev.Details.BuggifyEnabled), - new XAttribute("DeterminismCheck", expectedUnseed != -1 ? 
"1" : "0"), - new XAttribute("OldBinary", Path.GetFileName(oldBinaryName))); + new XAttribute("BuggifyEnabled", ev.Details.BuggifyEnabled)); testBeginFound = true; if (ev.DDetails.ContainsKey("FaultInjectionEnabled")) xout.Add(new XAttribute("FaultInjectionEnabled", ev.Details.FaultInjectionEnabled)); @@ -902,12 +953,17 @@ namespace SummarizeTest if (ev.Type == "CodeCoverage" && !willRestart) { bool covered = true; - if(ev.DDetails.ContainsKey("Covered")) + if (ev.DDetails.ContainsKey("Covered")) { covered = int.Parse(ev.Details.Covered) != 0; } - var key = new Tuple(ev.Details.File, ev.Details.Line); + var comment = ""; + if (ev.DDetails.ContainsKey("Comment")) + { + comment = ev.Details.Comment; + } + var key = new Tuple(ev.Details.File, ev.Details.Line, comment); if (covered || !codeCoverage.ContainsKey(key)) { codeCoverage[key] = covered; @@ -917,6 +973,10 @@ namespace SummarizeTest { xout.Add(new XElement(ev.Type, new XAttribute("File", ev.Details.File), new XAttribute("Line", ev.Details.Line))); } + if (ev.Type == "RunningUnitTest") + { + xout.Add(new XElement(ev.Type, new XAttribute("Name", ev.Details.Name), new XAttribute("File", ev.Details.File), new XAttribute("Line", ev.Details.Line))); + } if (ev.Type == "TestsExpectedToPass") testCount = int.Parse(ev.Details.Count); if (ev.Type == "TestResults" && ev.Details.Passed == "1") @@ -954,6 +1014,12 @@ namespace SummarizeTest xout.Add(new XElement(externalError, new XAttribute("Severity", (int)Magnesium.Severity.SevError))); } + string joshuaSeed = System.Environment.GetEnvironmentVariable("JOSHUA_SEED"); + + if (joshuaSeed != null) { + xout.Add(new XAttribute("JoshuaSeed", joshuaSeed)); + } + foreach(var kv in codeCoverage) { var element = new XElement("CodeCoverage", new XAttribute("File", kv.Key.Item1), new XAttribute("Line", kv.Key.Item2)); @@ -961,6 +1027,9 @@ namespace SummarizeTest { element.Add(new XAttribute("Covered", "0")); } + if (kv.Key.Item3.Length > 0) { + element.Add(new XAttribute("Comment", kv.Key.Item3)); + } xout.Add(element); } @@ -1046,12 +1115,10 @@ namespace SummarizeTest try { // If there are any errors reported "ok" will be set to false - var whats = ParseValgrindOutput(valgrindOutputFileName, traceToStdout); - foreach (var what in whats) + var valgrindErrors = ParseValgrindOutput(valgrindOutputFileName, traceToStdout); + foreach (var vError in valgrindErrors) { - xout.Add(new XElement("ValgrindError", - new XAttribute("Severity", (int)Magnesium.Severity.SevError), - new XAttribute("What", what))); + xout.Add(vError); ok = false; error = true; } diff --git a/contrib/TestHarness2/.gitignore b/contrib/TestHarness2/.gitignore new file mode 100644 index 0000000000..80682f9552 --- /dev/null +++ b/contrib/TestHarness2/.gitignore @@ -0,0 +1,2 @@ +/tmp/ +/venv diff --git a/contrib/TestHarness2/test_harness/__init__.py b/contrib/TestHarness2/test_harness/__init__.py new file mode 100644 index 0000000000..3cb95520ec --- /dev/null +++ b/contrib/TestHarness2/test_harness/__init__.py @@ -0,0 +1,2 @@ +# Currently this file is left intentionally empty. It's main job for now is to indicate that this directory +# should be used as a module. 
diff --git a/contrib/TestHarness2/test_harness/app.py b/contrib/TestHarness2/test_harness/app.py new file mode 100644 index 0000000000..3e300c6bf4 --- /dev/null +++ b/contrib/TestHarness2/test_harness/app.py @@ -0,0 +1,25 @@ +import argparse +import sys +import traceback + +from test_harness.config import config +from test_harness.run import TestRunner +from test_harness.summarize import SummaryTree + +if __name__ == '__main__': + try: + parser = argparse.ArgumentParser('TestHarness', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + test_runner = TestRunner() + if not test_runner.run(): + exit(1) + except Exception as e: + _, _, exc_traceback = sys.exc_info() + error = SummaryTree('TestHarnessError') + error.attributes['Severity'] = '40' + error.attributes['ErrorMessage'] = str(e) + error.attributes['Trace'] = repr(traceback.format_tb(exc_traceback)) + error.dump(sys.stdout) + exit(1) diff --git a/contrib/TestHarness2/test_harness/config.py b/contrib/TestHarness2/test_harness/config.py new file mode 100644 index 0000000000..191fab629d --- /dev/null +++ b/contrib/TestHarness2/test_harness/config.py @@ -0,0 +1,266 @@ +from __future__ import annotations + +import argparse +import collections +import copy +import os +import random +from enum import Enum +from pathlib import Path +from typing import List, Any, OrderedDict, Dict + + +class BuggifyOptionValue(Enum): + ON = 1 + OFF = 2 + RANDOM = 3 + + +class BuggifyOption: + def __init__(self, val: str | None = None): + self.value = BuggifyOptionValue.RANDOM + if val is not None: + v = val.lower() + if v in ['on', '1', 'true']: + self.value = BuggifyOptionValue.ON + elif v in ['off', '0', 'false']: + self.value = BuggifyOptionValue.OFF + elif v in ['random', 'rnd', 'r']: + pass + else: + assert False, 'Invalid value {} -- use true, false, or random'.format(v) + + +class ConfigValue: + def __init__(self, name: str, **kwargs): + self.name = name + self.value = None + self.kwargs = kwargs + if 'default' in self.kwargs: + self.value = self.kwargs['default'] + + def get_arg_name(self) -> str: + if 'long_name' in self.kwargs: + return self.kwargs['long_name'] + else: + return self.name + + def add_to_args(self, parser: argparse.ArgumentParser): + kwargs = copy.copy(self.kwargs) + long_name = self.name + short_name = None + if 'long_name' in kwargs: + long_name = kwargs['long_name'] + del kwargs['long_name'] + if 'short_name' in kwargs: + short_name = kwargs['short_name'] + del kwargs['short_name'] + if 'action' in kwargs and kwargs['action'] in ['store_true', 'store_false']: + del kwargs['type'] + long_name = long_name.replace('_', '-') + if short_name is None: + # line below is useful for debugging + # print('add_argument(\'--{}\', [{{{}}}])'.format(long_name, ', '.join(['\'{}\': \'{}\''.format(k, v) + # for k, v in kwargs.items()]))) + parser.add_argument('--{}'.format(long_name), **kwargs) + else: + # line below is useful for debugging + # print('add_argument(\'-{}\', \'--{}\', [{{{}}}])'.format(short_name, long_name, + # ', '.join(['\'{}\': \'{}\''.format(k, v) + # for k, v in kwargs.items()]))) + parser.add_argument('-{}'.format(short_name), '--{}'.format(long_name), **kwargs) + + def get_value(self, args: argparse.Namespace) -> tuple[str, Any]: + return self.name, args.__getattribute__(self.get_arg_name()) + + +class Config: + """ + This is the central configuration class for test harness. 
The values in this class are exposed globally through + a global variable test_harness.config.config. This class provides some "magic" to keep test harness flexible. + Each parameter can further be configured using an `_args` member variable which is expected to be a dictionary. + * The value of any variable can be set through the command line. For a variable named `variable_name` we will + by default create a new command line option `--variable-name` (`_` is automatically changed to `-`). This + default can be changed by setting the `'long_name'` property in the `_arg` dict. + * In addition the user can also optionally set a short-name. This can be achieved by setting the `'short_name'` + property in the `_arg` dictionary. + * All additional properties in `_args` are passed to `argparse.add_argument`. + * If the default of a variable is `None` the user should explicitly set the `'type'` property to an appropriate + type. + * In addition to command line flags, all configuration options can also be controlled through environment variables. + By default, `variable-name` can be changed by setting the environment variable `TH_VARIABLE_NAME`. This default + can be changed by setting the `'env_name'` property. + * Test harness comes with multiple executables. Each of these should use the config facility. For this, + `Config.build_arguments` should be called first with the `argparse` parser. Then `Config.extract_args` needs + to be called with the result of `argparse.ArgumentParser.parse_args`. A sample example could look like this: + ``` + parser = argparse.ArgumentParser('TestHarness', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + ``` + * Changing the default value for all executables might not always be desirable. If it should be only changed for + one executable Config.change_default should be used. 
+ """ + def __init__(self): + self.random = random.Random() + self.cluster_file: str | None = None + self.cluster_file_args = {'short_name': 'C', 'type': str, 'help': 'Path to fdb cluster file', 'required': False, + 'env_name': 'JOSHUA_CLUSTER_FILE'} + self.joshua_dir: str | None = None + self.joshua_dir_args = {'type': str, 'help': 'Where to write FDB data to', 'required': False, + 'env_name': 'JOSHUA_APP_DIR'} + self.stats: str | None = None + self.stats_args = {'type': str, 'help': 'A base64 encoded list of statistics (used to reproduce runs)', + 'required': False} + self.random_seed: int | None = None + self.random_seed_args = {'type': int, + 'help': 'Force given seed given to fdbserver -- mostly useful for debugging', + 'required': False} + self.kill_seconds: int = 30 * 60 + self.kill_seconds_args = {'help': 'Timeout for individual test'} + self.buggify_on_ratio: float = 0.8 + self.buggify_on_ratio_args = {'help': 'Probability that buggify is turned on'} + self.write_run_times = False + self.write_run_times_args = {'help': 'Write back probabilities after each test run', + 'action': 'store_true'} + self.unseed_check_ratio: float = 0.05 + self.unseed_check_ratio_args = {'help': 'Probability for doing determinism check'} + self.test_dirs: List[str] = ['slow', 'fast', 'restarting', 'rare', 'noSim'] + self.test_dirs_args: dict = {'nargs': '*', 'help': 'test_directories to look for files in'} + self.trace_format: str = 'json' + self.trace_format_args = {'choices': ['json', 'xml'], 'help': 'What format fdb should produce'} + self.crash_on_error: bool = True + self.crash_on_error_args = {'long_name': 'no_crash', 'action': 'store_false', + 'help': 'Don\'t crash on first error'} + self.max_warnings: int = 10 + self.max_warnings_args = {'short_name': 'W'} + self.max_errors: int = 10 + self.max_errors_args = {'short_name': 'E'} + self.old_binaries_path: Path = Path('/app/deploy/global_data/oldBinaries/') + self.old_binaries_path_args = {'help': 'Path to the directory containing the old fdb binaries'} + self.tls_plugin_path: Path = Path('/app/deploy/runtime/.tls_5_1/FDBLibTLS.so') + self.tls_plugin_path_args = {'help': 'Path to the tls plugin used for binaries < 5.2.0'} + self.disable_kaio: bool = False + self.use_valgrind: bool = False + self.use_valgrind_args = {'action': 'store_true'} + self.buggify = BuggifyOption('random') + self.buggify_args = {'short_name': 'b', 'choices': ['on', 'off', 'random']} + self.pretty_print: bool = False + self.pretty_print_args = {'short_name': 'P', 'action': 'store_true'} + self.clean_up: bool = True + self.clean_up_args = {'long_name': 'no_clean_up', 'action': 'store_false'} + self.run_dir: Path = Path('tmp') + self.joshua_seed: int = random.randint(0, 2 ** 32 - 1) + self.joshua_seed_args = {'short_name': 's', 'help': 'A random seed', 'env_name': 'JOSHUA_SEED'} + self.print_coverage = False + self.print_coverage_args = {'action': 'store_true'} + self.binary = Path('bin') / ('fdbserver.exe' if os.name == 'nt' else 'fdbserver') + self.binary_args = {'help': 'Path to executable'} + self.hit_per_runs_ratio: int = 20000 + self.hit_per_runs_ratio_args = {'help': 'Maximum test runs before each code probe hit at least once'} + self.output_format: str = 'xml' + self.output_format_args = {'short_name': 'O', 'choices': ['json', 'xml'], + 'help': 'What format TestHarness should produce'} + self.include_test_files: str = r'.*' + self.include_test_files_args = {'help': 'Only consider test files whose path match against the given regex'} + self.exclude_test_files: str = r'.^' 
+ self.exclude_test_files_args = {'help': 'Don\'t consider test files whose path match against the given regex'} + self.include_test_classes: str = r'.*' + self.include_test_classes_args = {'help': 'Only consider tests whose names match against the given regex'} + self.exclude_test_names: str = r'.^' + self.exclude_test_names_args = {'help': 'Don\'t consider tests whose names match against the given regex'} + self.details: bool = False + self.details_args = {'help': 'Print detailed results', 'short_name': 'c', 'action': 'store_true'} + self.success: bool = False + self.success_args = {'help': 'Print successful results', 'action': 'store_true'} + self.cov_include_files: str = r'.*' + self.cov_include_files_args = {'help': 'Only consider coverage traces that originated in files matching regex'} + self.cov_exclude_files: str = r'.^' + self.cov_exclude_files_args = {'help': 'Ignore coverage traces that originated in files matching regex'} + self.max_stderr_bytes: int = 10000 + self.write_stats: bool = True + self.read_stats: bool = True + self.reproduce_prefix: str | None = None + self.reproduce_prefix_args = {'type': str, 'required': False, + 'help': 'When printing the results, prepend this string to the command'} + self._env_names: Dict[str, str] = {} + self._config_map = self._build_map() + self._read_env() + self.random.seed(self.joshua_seed, version=2) + + def change_default(self, attr: str, default_val): + assert attr in self._config_map, 'Unknown config attribute {}'.format(attr) + self.__setattr__(attr, default_val) + self._config_map[attr].kwargs['default'] = default_val + + def _get_env_name(self, var_name: str) -> str: + return self._env_names.get(var_name, 'TH_{}'.format(var_name.upper())) + + def dump(self): + for attr in dir(self): + obj = getattr(self, attr) + if attr == 'random' or attr.startswith('_') or callable(obj) or attr.endswith('_args'): + continue + print('config.{}: {} = {}'.format(attr, type(obj), obj)) + + def _build_map(self) -> OrderedDict[str, ConfigValue]: + config_map: OrderedDict[str, ConfigValue] = collections.OrderedDict() + for attr in dir(self): + obj = getattr(self, attr) + if attr == 'random' or attr.startswith('_') or callable(obj): + continue + if attr.endswith('_args'): + name = attr[0:-len('_args')] + assert name in config_map + assert isinstance(obj, dict) + for k, v in obj.items(): + if k == 'env_name': + self._env_names[name] = v + else: + config_map[name].kwargs[k] = v + else: + # attribute_args has to be declared after the attribute + assert attr not in config_map + val_type = type(obj) + kwargs = {'type': val_type, 'default': obj} + config_map[attr] = ConfigValue(attr, **kwargs) + return config_map + + def _read_env(self): + for attr in dir(self): + obj = getattr(self, attr) + if attr == 'random' or attr.startswith('_') or attr.endswith('_args') or callable(obj): + continue + env_name = self._get_env_name(attr) + attr_type = self._config_map[attr].kwargs['type'] + assert type(None) != attr_type + e = os.getenv(env_name) + if e is not None: + # Use the env var to supply the default value, so that if the + # environment variable is set and the corresponding command line + # flag is not, the environment variable has an effect. 
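+ # e.g. setting TH_KILL_SECONDS=1200 changes the default for --kill-seconds, while an explicit command line flag still takes precedence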
+ self._config_map[attr].kwargs['default'] = attr_type(e) + + def build_arguments(self, parser: argparse.ArgumentParser): + for val in self._config_map.values(): + val.add_to_args(parser) + + def extract_args(self, args: argparse.Namespace): + for val in self._config_map.values(): + k, v = val.get_value(args) + if v is not None: + config.__setattr__(k, v) + self.random.seed(self.joshua_seed, version=2) + + +config = Config() + +if __name__ == '__main__': + # test the config setup + parser = argparse.ArgumentParser('TestHarness Config Tester', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + config.dump() diff --git a/contrib/TestHarness2/test_harness/fdb.py b/contrib/TestHarness2/test_harness/fdb.py new file mode 100644 index 0000000000..1e6afa3906 --- /dev/null +++ b/contrib/TestHarness2/test_harness/fdb.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +from typing import OrderedDict, Tuple, List + +import collections +import fdb +import fdb.tuple +import struct + +from test_harness.run import StatFetcher, TestDescription +from test_harness.config import config +from test_harness.summarize import SummaryTree, Coverage + +# Before increasing this, make sure that all Joshua clusters (at Apple and Snowflake) have been upgraded. +# This version needs to be changed if we either need newer features from FDB or the current API version is +# getting retired. +fdb.api_version(630) + + +def str_to_tuple(s: str | None): + if s is None: + return s + return tuple(s.split(',')) + + +fdb_db = None + + +def open_db(cluster_file: str | None): + global fdb_db + if fdb_db is None: + fdb_db = fdb.open(cluster_file) + return fdb_db + + +def chunkify(iterable, sz: int): + res = [] + for item in iterable: + res.append(item) + if len(res) >= sz: + yield res + res = [] + if len(res) > 0: + yield res + + +@fdb.transactional +def write_coverage_chunk(tr, path: Tuple[str, ...], metadata: Tuple[str, ...], + coverage: List[Tuple[Coverage, bool]], initialized: bool) -> bool: + cov_dir = fdb.directory.create_or_open(tr, path) + if not initialized: + metadata_dir = fdb.directory.create_or_open(tr, metadata) + v = tr[metadata_dir['initialized']] + initialized = v.present() + for cov, covered in coverage: + if not initialized or covered: + tr.add(cov_dir.pack((cov.file, cov.line, cov.comment)), struct.pack(' OrderedDict[Coverage, int]: + res = collections.OrderedDict() + cov_dir = fdb.directory.create_or_open(tr, cov_path) + for k, v in tr[cov_dir.range()]: + file, line, comment = cov_dir.unpack(k) + count = struct.unpack(' OrderedDict[Coverage, int]: + db = open_db(cluster_file) + return _read_coverage(db, cov_path) + + +class TestStatistics: + def __init__(self, runtime: int, run_count: int): + self.runtime: int = runtime + self.run_count: int = run_count + + +class Statistics: + def __init__(self, cluster_file: str | None, joshua_dir: Tuple[str, ...]): + self.db = open_db(cluster_file) + self.stats_dir = self.open_stats_dir(self.db, joshua_dir) + self.stats: OrderedDict[str, TestStatistics] = self.read_stats_from_db(self.db) + + @fdb.transactional + def open_stats_dir(self, tr, app_dir: Tuple[str]): + stats_dir = app_dir + ('runtime_stats',) + return fdb.directory.create_or_open(tr, stats_dir) + + @fdb.transactional + def read_stats_from_db(self, tr) -> OrderedDict[str, TestStatistics]: + result = collections.OrderedDict() + for k, v in tr[self.stats_dir.range()]: + test_name = self.stats_dir.unpack(k)[0] + 
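+ # each stored value packs two counters for the test: its accumulated runtime and how often it ran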
runtime, run_count = struct.unpack(' None: + key = self.stats_dir.pack((test_name,)) + tr.add(key, struct.pack(' None: + assert self.db is not None + self._write_runtime(self.db, test_name, time) + + +class FDBStatFetcher(StatFetcher): + def __init__(self, tests: OrderedDict[str, TestDescription], + joshua_dir: Tuple[str] = str_to_tuple(config.joshua_dir)): + super().__init__(tests) + self.statistics = Statistics(config.cluster_file, joshua_dir) + + def read_stats(self): + for k, v in self.statistics.stats.items(): + if k in self.tests.keys(): + self.tests[k].total_runtime = v.runtime + self.tests[k].num_runs = v.run_count + + def add_run_time(self, test_name: str, runtime: int, out: SummaryTree): + self.statistics.write_runtime(test_name, runtime) + super().add_run_time(test_name, runtime, out) diff --git a/contrib/TestHarness2/test_harness/joshua.py b/contrib/TestHarness2/test_harness/joshua.py new file mode 100644 index 0000000000..33c5881dcc --- /dev/null +++ b/contrib/TestHarness2/test_harness/joshua.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +import collections +import io +import sys +import xml.sax +import xml.sax.handler +from pathlib import Path +from typing import List, OrderedDict, Set + +from joshua import joshua_model + +import test_harness.run +from test_harness.config import config +from test_harness.summarize import SummaryTree + + +class ToSummaryTree(xml.sax.handler.ContentHandler): + def __init__(self): + super().__init__() + self.root: SummaryTree | None = None + self.stack: List[SummaryTree] = [] + + def result(self) -> SummaryTree: + assert len(self.stack) == 0 and self.root is not None, 'Parse Error' + return self.root + + def startElement(self, name, attrs): + new_child = SummaryTree(name) + for k, v in attrs.items(): + new_child.attributes[k] = v + self.stack.append(new_child) + + def endElement(self, name): + closed = self.stack.pop() + assert closed.name == name + if len(self.stack) == 0: + self.root = closed + else: + self.stack[-1].children.append(closed) + + +def _print_summary(summary: SummaryTree, commands: Set[str]): + cmd = [] + if config.reproduce_prefix is not None: + cmd.append(config.reproduce_prefix) + cmd.append('fdbserver') + if 'TestFile' in summary.attributes: + file_name = summary.attributes['TestFile'] + role = 'test' if test_harness.run.is_no_sim(Path(file_name)) else 'simulation' + cmd += ['-r', role, '-f', file_name] + else: + cmd += ['-r', 'simulation', '-f', ''] + if 'RandomSeed' in summary.attributes: + cmd += ['-s', summary.attributes['RandomSeed']] + else: + cmd += ['-s', ''] + if 'BuggifyEnabled' in summary.attributes: + arg = 'on' + if summary.attributes['BuggifyEnabled'].lower() in ['0', 'off', 'false']: + arg = 'off' + cmd += ['-b', arg] + else: + cmd += ['b', ''] + cmd += ['--crash', '--trace_format', config.trace_format] + key = ' '.join(cmd) + count = 1 + while key in commands: + key = '{} # {}'.format(' '.join(cmd), count) + count += 1 + # we want the command as the first attribute + attributes = {'Command': ' '.join(cmd)} + for k, v in summary.attributes.items(): + if k == 'Errors': + attributes['ErrorCount'] = v + else: + attributes[k] = v + summary.attributes = attributes + if config.details: + key = str(len(commands)) + str_io = io.StringIO() + summary.dump(str_io, prefix=(' ' if config.pretty_print else '')) + if config.output_format == 'json': + sys.stdout.write('{}"Test{}": {}'.format(' ' if config.pretty_print else '', + key, str_io.getvalue())) + else: + sys.stdout.write(str_io.getvalue()) + if 
config.pretty_print: + sys.stdout.write('\n' if config.output_format == 'xml' else ',\n') + return key + error_count = 0 + warning_count = 0 + small_summary = SummaryTree('Test') + small_summary.attributes = attributes + errors = SummaryTree('Errors') + warnings = SummaryTree('Warnings') + buggifies: OrderedDict[str, List[int]] = collections.OrderedDict() + for child in summary.children: + if 'Severity' in child.attributes and child.attributes['Severity'] == '40' and error_count < config.max_errors: + error_count += 1 + errors.append(child) + if 'Severity' in child.attributes and child.attributes[ + 'Severity'] == '30' and warning_count < config.max_warnings: + warning_count += 1 + warnings.append(child) + if child.name == 'BuggifySection': + file = child.attributes['File'] + line = int(child.attributes['Line']) + buggifies.setdefault(file, []).append(line) + buggifies_elem = SummaryTree('Buggifies') + for file, lines in buggifies.items(): + lines.sort() + if config.output_format == 'json': + buggifies_elem.attributes[file] = ' '.join(str(line) for line in lines) + else: + child = SummaryTree('Buggify') + child.attributes['File'] = file + child.attributes['Lines'] = ' '.join(str(line) for line in lines) + small_summary.append(child) + small_summary.children.append(buggifies_elem) + if len(errors.children) > 0: + small_summary.children.append(errors) + if len(warnings.children) > 0: + small_summary.children.append(warnings) + output = io.StringIO() + small_summary.dump(output, prefix=(' ' if config.pretty_print else '')) + if config.output_format == 'json': + sys.stdout.write('{}"{}": {}'.format(' ' if config.pretty_print else '', key, output.getvalue().strip())) + else: + sys.stdout.write('{}{}'.format(' ' if config.pretty_print else '', output.getvalue().strip())) + sys.stdout.write('\n' if config.output_format == 'xml' else ',\n') + + +def print_errors(ensemble_id: str): + joshua_model.open(config.cluster_file) + properties = joshua_model.get_ensemble_properties(ensemble_id) + compressed = properties["compressed"] if "compressed" in properties else False + for rec in joshua_model.tail_results(ensemble_id, errors_only=(not config.success), compressed=compressed): + if len(rec) == 5: + version_stamp, result_code, host, seed, output = rec + elif len(rec) == 4: + version_stamp, result_code, host, output = rec + seed = None + elif len(rec) == 3: + version_stamp, result_code, output = rec + host = None + seed = None + elif len(rec) == 2: + version_stamp, seed = rec + output = str(joshua_model.fdb.tuple.unpack(seed)[0]) + "\n" + result_code = None + host = None + seed = None + else: + raise Exception("Unknown result format") + lines = output.splitlines() + commands: Set[str] = set() + for line in lines: + summary = ToSummaryTree() + xml.sax.parseString(line, summary) + commands.add(_print_summary(summary.result(), commands)) diff --git a/contrib/TestHarness2/test_harness/results.py b/contrib/TestHarness2/test_harness/results.py new file mode 100644 index 0000000000..486c497d35 --- /dev/null +++ b/contrib/TestHarness2/test_harness/results.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import argparse +import io +import json +import re +import sys +import test_harness.fdb + +from typing import List, Tuple, OrderedDict +from test_harness.summarize import SummaryTree, Coverage +from test_harness.config import config +from xml.sax.saxutils import quoteattr + + +class GlobalStatistics: + def __init__(self): + self.total_probes_hit: int = 0 + self.total_cpu_time: int = 0 + 
self.total_test_runs: int = 0 + self.total_missed_probes: int = 0 + + +class EnsembleResults: + def __init__(self, cluster_file: str | None, ensemble_id: str): + self.global_statistics = GlobalStatistics() + self.fdb_path = ('joshua', 'ensembles', 'results', 'application', ensemble_id) + self.coverage_path = self.fdb_path + ('coverage',) + self.statistics = test_harness.fdb.Statistics(cluster_file, self.fdb_path) + coverage_dict: OrderedDict[Coverage, int] = test_harness.fdb.read_coverage(cluster_file, self.coverage_path) + self.coverage: List[Tuple[Coverage, int]] = [] + self.min_coverage_hit: int | None = None + self.ratio = self.global_statistics.total_test_runs / config.hit_per_runs_ratio + for cov, count in coverage_dict.items(): + if re.search(config.cov_include_files, cov.file) is None: + continue + if re.search(config.cov_exclude_files, cov.file) is not None: + continue + self.global_statistics.total_probes_hit += count + self.coverage.append((cov, count)) + if count <= self.ratio: + self.global_statistics.total_missed_probes += 1 + if self.min_coverage_hit is None or self.min_coverage_hit > count: + self.min_coverage_hit = count + self.coverage.sort(key=lambda x: (x[1], x[0].file, x[0].line)) + self.stats: List[Tuple[str, int, int]] = [] + for k, v in self.statistics.stats.items(): + self.global_statistics.total_test_runs += v.run_count + self.global_statistics.total_cpu_time += v.runtime + self.stats.append((k, v.runtime, v.run_count)) + self.stats.sort(key=lambda x: x[1], reverse=True) + if self.min_coverage_hit is not None: + self.coverage_ok = self.min_coverage_hit > self.ratio + else: + self.coverage_ok = False + + def dump(self, prefix: str): + errors = 0 + out = SummaryTree('EnsembleResults') + out.attributes['TotalRuntime'] = str(self.global_statistics.total_cpu_time) + out.attributes['TotalTestRuns'] = str(self.global_statistics.total_test_runs) + out.attributes['TotalProbesHit'] = str(self.global_statistics.total_probes_hit) + out.attributes['MinProbeHit'] = str(self.min_coverage_hit) + out.attributes['TotalProbes'] = str(len(self.coverage)) + out.attributes['MissedProbes'] = str(self.global_statistics.total_missed_probes) + + for cov, count in self.coverage: + severity = 10 if count > self.ratio else 40 + if severity == 40: + errors += 1 + if (severity == 40 and errors <= config.max_errors) or config.details: + child = SummaryTree('CodeProbe') + child.attributes['Severity'] = str(severity) + child.attributes['File'] = cov.file + child.attributes['Line'] = str(cov.line) + child.attributes['Comment'] = '' if cov.comment is None else cov.comment + child.attributes['HitCount'] = str(count) + out.append(child) + + if config.details: + for k, runtime, run_count in self.stats: + child = SummaryTree('Test') + child.attributes['Name'] = k + child.attributes['Runtime'] = str(runtime) + child.attributes['RunCount'] = str(run_count) + out.append(child) + if errors > 0: + out.attributes['Errors'] = str(errors) + str_io = io.StringIO() + out.dump(str_io, prefix=prefix, new_line=config.pretty_print) + if config.output_format == 'xml': + sys.stdout.write(str_io.getvalue()) + else: + sys.stdout.write('{}"EnsembleResults":{}{}'.format(' ' if config.pretty_print else '', + '\n' if config.pretty_print else ' ', + str_io.getvalue())) + + +def write_header(ensemble_id: str): + if config.output_format == 'json': + if config.pretty_print: + print('{') + print(' "{}": {},\n'.format('ID', json.dumps(ensemble_id.strip()))) + else: + sys.stdout.write('{{{}: {},'.format('ID', 
json.dumps(ensemble_id.strip()))) + elif config.output_format == 'xml': + sys.stdout.write(''.format(quoteattr(ensemble_id.strip()))) + if config.pretty_print: + sys.stdout.write('\n') + else: + assert False, 'unknown output format {}'.format(config.output_format) + + +def write_footer(): + if config.output_format == 'xml': + sys.stdout.write('\n') + elif config.output_format == 'json': + sys.stdout.write('}\n') + else: + assert False, 'unknown output format {}'.format(config.output_format) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('TestHarness Results', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.change_default('pretty_print', True) + config.change_default('max_warnings', 0) + config.build_arguments(parser) + parser.add_argument('ensemble_id', type=str, help='The ensemble to fetch the result for') + args = parser.parse_args() + config.extract_args(args) + config.output_format = args.output_format + write_header(args.ensemble_id) + try: + import test_harness.joshua + test_harness.joshua.print_errors(args.ensemble_id) + except ModuleNotFoundError: + child = SummaryTree('JoshuaNotFound') + child.attributes['Severity'] = '30' + child.attributes['Message'] = 'Could not import Joshua -- set PYTHONPATH to joshua checkout dir' + child.dump(sys.stdout, prefix=(' ' if config.pretty_print else ''), new_line=config.pretty_print) + results = EnsembleResults(config.cluster_file, args.ensemble_id) + results.dump(' ' if config.pretty_print else '') + write_footer() + exit(0 if results.coverage_ok else 1) diff --git a/contrib/TestHarness2/test_harness/run.py b/contrib/TestHarness2/test_harness/run.py new file mode 100644 index 0000000000..2cd24575fb --- /dev/null +++ b/contrib/TestHarness2/test_harness/run.py @@ -0,0 +1,477 @@ +from __future__ import annotations + +import array +import base64 +import collections +import math +import os +import resource +import shutil +import subprocess +import re +import sys +import threading +import time +import uuid + +from functools import total_ordering +from pathlib import Path +from test_harness.version import Version +from test_harness.config import config +from typing import Dict, List, Pattern, OrderedDict + +from test_harness.summarize import Summary, SummaryTree + + +@total_ordering +class TestDescription: + def __init__(self, path: Path, name: str, priority: float): + self.paths: List[Path] = [path] + self.name = name + self.priority: float = priority + # we only measure in seconds. 
Otherwise, keeping determinism will be difficult + self.total_runtime: int = 0 + self.num_runs: int = 0 + + def __lt__(self, other): + if isinstance(other, TestDescription): + return self.name < other.name + else: + return self.name < str(other) + + def __eq__(self, other): + if isinstance(other, TestDescription): + return self.name < other.name + else: + return self.name < str(other.name) + + +class StatFetcher: + def __init__(self, tests: OrderedDict[str, TestDescription]): + self.tests = tests + + def read_stats(self): + pass + + def add_run_time(self, test_name: str, runtime: int, out: SummaryTree): + self.tests[test_name].total_runtime += runtime + + +class TestPicker: + def __init__(self, test_dir: Path): + if not test_dir.exists(): + raise RuntimeError('{} is neither a directory nor a file'.format(test_dir)) + self.include_files_regex = re.compile(config.include_test_files) + self.exclude_files_regex = re.compile(config.exclude_test_files) + self.include_tests_regex = re.compile(config.include_test_classes) + self.exclude_tests_regex = re.compile(config.exclude_test_names) + self.test_dir: Path = test_dir + self.tests: OrderedDict[str, TestDescription] = collections.OrderedDict() + self.restart_test: Pattern = re.compile(r".*-\d+\.(txt|toml)") + self.follow_test: Pattern = re.compile(r".*-[2-9]\d*\.(txt|toml)") + + for subdir in self.test_dir.iterdir(): + if subdir.is_dir() and subdir.name in config.test_dirs: + self.walk_test_dir(subdir) + self.stat_fetcher: StatFetcher + if config.stats is not None or config.joshua_dir is None: + self.stat_fetcher = StatFetcher(self.tests) + else: + from test_harness.fdb import FDBStatFetcher + self.stat_fetcher = FDBStatFetcher(self.tests) + if config.stats is not None: + self.load_stats(config.stats) + else: + self.fetch_stats() + + def add_time(self, test_file: Path, run_time: int, out: SummaryTree) -> None: + # getting the test name is fairly inefficient. 
But since we only have 100s of tests, I won't bother + test_name: str | None = None + test_desc: TestDescription | None = None + for name, test in self.tests.items(): + for p in test.paths: + test_files: List[Path] + if self.restart_test.match(p.name): + test_files = self.list_restart_files(p) + else: + test_files = [p] + for file in test_files: + if file.absolute() == test_file.absolute(): + test_name = name + test_desc = test + break + if test_name is not None: + break + if test_name is not None: + break + assert test_name is not None and test_desc is not None + self.stat_fetcher.add_run_time(test_name, run_time, out) + out.attributes['TotalTestTime'] = str(test_desc.total_runtime) + out.attributes['TestRunCount'] = str(test_desc.num_runs) + + def dump_stats(self) -> str: + res = array.array('I') + for _, spec in self.tests.items(): + res.append(spec.total_runtime) + return base64.standard_b64encode(res.tobytes()).decode('utf-8') + + def fetch_stats(self): + self.stat_fetcher.read_stats() + + def load_stats(self, serialized: str): + times = array.array('I') + times.frombytes(base64.standard_b64decode(serialized)) + assert len(times) == len(self.tests.items()) + for idx, (_, spec) in enumerate(self.tests.items()): + spec.total_runtime = times[idx] + + def parse_txt(self, path: Path): + if self.include_files_regex.search(str(path)) is None or self.exclude_files_regex.search(str(path)) is not None: + return + with path.open('r') as f: + test_name: str | None = None + test_class: str | None = None + priority: float | None = None + for line in f: + line = line.strip() + kv = line.split('=') + if len(kv) != 2: + continue + kv[0] = kv[0].strip() + kv[1] = kv[1].strip(' \r\n\t\'"') + if kv[0] == 'testTitle' and test_name is None: + test_name = kv[1] + if kv[0] == 'testClass' and test_class is None: + test_class = kv[1] + if kv[0] == 'testPriority' and priority is None: + try: + priority = float(kv[1]) + except ValueError: + raise RuntimeError("Can't parse {} -- testPriority in {} should be set to a float".format(kv[1], + path)) + if test_name is not None and test_class is not None and priority is not None: + break + if test_name is None: + return + if test_class is None: + test_class = test_name + if priority is None: + priority = 1.0 + if self.include_tests_regex.search(test_class) is None \ + or self.exclude_tests_regex.search(test_class) is not None: + return + if test_class not in self.tests: + self.tests[test_class] = TestDescription(path, test_class, priority) + else: + self.tests[test_class].paths.append(path) + + def walk_test_dir(self, test: Path): + if test.is_dir(): + for file in test.iterdir(): + self.walk_test_dir(file) + else: + # check whether we're looking at a restart test + if self.follow_test.match(test.name) is not None: + return + if test.suffix == '.txt' or test.suffix == '.toml': + self.parse_txt(test) + + @staticmethod + def list_restart_files(start_file: Path) -> List[Path]: + name = re.sub(r'-\d+.(txt|toml)', '', start_file.name) + res: List[Path] = [] + for test_file in start_file.parent.iterdir(): + if test_file.name.startswith(name): + res.append(test_file) + assert len(res) > 1 + res.sort() + return res + + def choose_test(self) -> List[Path]: + min_runtime: float | None = None + candidates: List[TestDescription] = [] + for _, v in self.tests.items(): + this_time = v.total_runtime * v.priority + if min_runtime is None or this_time < min_runtime: + min_runtime = this_time + candidates = [v] + elif this_time == min_runtime: + candidates.append(v) + candidates.sort() + 
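+ # sort the tied candidates by name so the seeded random choice below stays reproducible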
choice = config.random.randint(0, len(candidates) - 1) + test = candidates[choice] + result = test.paths[config.random.randint(0, len(test.paths) - 1)] + if self.restart_test.match(result.name): + return self.list_restart_files(result) + else: + return [result] + + +class OldBinaries: + def __init__(self): + self.first_file_expr = re.compile(r'.*-1\.(txt|toml)') + self.old_binaries_path: Path = config.old_binaries_path + self.binaries: OrderedDict[Version, Path] = collections.OrderedDict() + if not self.old_binaries_path.exists() or not self.old_binaries_path.is_dir(): + return + exec_pattern = re.compile(r'fdbserver-\d+\.\d+\.\d+(\.exe)?') + for file in self.old_binaries_path.iterdir(): + if not file.is_file() or not os.access(file, os.X_OK): + continue + if exec_pattern.fullmatch(file.name) is not None: + self._add_file(file) + + def _add_file(self, file: Path): + version_str = file.name.split('-')[1] + if version_str.endswith('.exe'): + version_str = version_str[0:-len('.exe')] + ver = Version.parse(version_str) + self.binaries[ver] = file + + def choose_binary(self, test_file: Path) -> Path: + if len(self.binaries) == 0: + return config.binary + max_version = Version.max_version() + min_version = Version.parse('5.0.0') + dirs = test_file.parent.parts + if 'restarting' not in dirs: + return config.binary + version_expr = dirs[-1].split('_') + first_file = self.first_file_expr.match(test_file.name) is not None + if first_file and version_expr[0] == 'to': + # downgrade test -- first binary should be current one + return config.binary + if not first_file and version_expr[0] == 'from': + # upgrade test -- we only return an old version for the first test file + return config.binary + if version_expr[0] == 'from' or version_expr[0] == 'to': + min_version = Version.parse(version_expr[1]) + if len(version_expr) == 4 and version_expr[2] == 'until': + max_version = Version.parse(version_expr[3]) + candidates: List[Path] = [] + for ver, binary in self.binaries.items(): + if min_version <= ver <= max_version: + candidates.append(binary) + if len(candidates) == 0: + return config.binary + return config.random.choice(candidates) + + +def is_restarting_test(test_file: Path): + for p in test_file.parts: + if p == 'restarting': + return True + return False + + +def is_no_sim(test_file: Path): + return test_file.parts[-2] == 'noSim' + + +class ResourceMonitor(threading.Thread): + def __init__(self): + super().__init__() + self.start_time = time.time() + self.end_time: float | None = None + self._stop_monitor = False + self.max_rss = 0 + + def run(self) -> None: + while not self._stop_monitor: + time.sleep(1) + resources = resource.getrusage(resource.RUSAGE_CHILDREN) + self.max_rss = max(resources.ru_maxrss, self.max_rss) + + def stop(self): + self.end_time = time.time() + self._stop_monitor = True + + def time(self): + return self.end_time - self.start_time + + +class TestRun: + def __init__(self, binary: Path, test_file: Path, random_seed: int, uid: uuid.UUID, + restarting: bool = False, test_determinism: bool = False, buggify_enabled: bool = False, + stats: str | None = None, expected_unseed: int | None = None, will_restart: bool = False): + self.binary = binary + self.test_file = test_file + self.random_seed = random_seed + self.uid = uid + self.restarting = restarting + self.test_determinism = test_determinism + self.stats: str | None = stats + self.expected_unseed: int | None = expected_unseed + self.use_valgrind: bool = config.use_valgrind + self.old_binary_path: Path = config.old_binaries_path + 
self.buggify_enabled: bool = buggify_enabled + self.fault_injection_enabled: bool = True + self.trace_format: str | None = config.trace_format + if Version.of_binary(self.binary) < "6.1.0": + self.trace_format = None + self.use_tls_plugin = Version.of_binary(self.binary) < "5.2.0" + self.temp_path = config.run_dir / str(self.uid) + # state for the run + self.retryable_error: bool = False + self.summary: Summary = Summary(binary, uid=self.uid, stats=self.stats, expected_unseed=self.expected_unseed, + will_restart=will_restart) + self.run_time: int = 0 + self.success = self.run() + + def log_test_plan(self, out: SummaryTree): + test_plan: SummaryTree = SummaryTree('TestPlan') + test_plan.attributes['TestUID'] = str(self.uid) + test_plan.attributes['RandomSeed'] = str(self.random_seed) + test_plan.attributes['TestFile'] = str(self.test_file) + test_plan.attributes['Buggify'] = '1' if self.buggify_enabled else '0' + test_plan.attributes['FaultInjectionEnabled'] = '1' if self.fault_injection_enabled else '0' + test_plan.attributes['DeterminismCheck'] = '1' if self.test_determinism else '0' + out.append(test_plan) + + def delete_simdir(self): + shutil.rmtree(self.temp_path / Path('simfdb')) + + def run(self): + command: List[str] = [] + env: Dict[str, str] = os.environ.copy() + valgrind_file: Path | None = None + if self.use_valgrind and self.binary == config.binary: + # Only run the binary under test under valgrind. There's nothing we + # can do about valgrind errors in old binaries anyway, and it makes + # the test take longer. Also old binaries weren't built with + # USE_VALGRIND=ON, and we have seen false positives with valgrind in + # such binaries. + command.append('valgrind') + valgrind_file = self.temp_path / Path('valgrind-{}.xml'.format(self.random_seed)) + dbg_path = os.getenv('FDB_VALGRIND_DBGPATH') + if dbg_path is not None: + command.append('--extra-debuginfo-path={}'.format(dbg_path)) + command += ['--xml=yes', '--xml-file={}'.format(valgrind_file.absolute()), '-q'] + command += [str(self.binary.absolute()), + '-r', 'test' if is_no_sim(self.test_file) else 'simulation', + '-f', str(self.test_file), + '-s', str(self.random_seed)] + if self.trace_format is not None: + command += ['--trace_format', self.trace_format] + if self.use_tls_plugin: + command += ['--tls_plugin', str(config.tls_plugin_path)] + env["FDB_TLS_PLUGIN"] = str(config.tls_plugin_path) + if config.disable_kaio: + command += ['--knob-disable-posix-kernel-aio=1'] + if Version.of_binary(self.binary) >= '7.1.0': + command += ['-fi', 'on' if self.fault_injection_enabled else 'off'] + if self.restarting: + command.append('--restarting') + if self.buggify_enabled: + command += ['-b', 'on'] + if config.crash_on_error: + command.append('--crash') + + self.temp_path.mkdir(parents=True, exist_ok=True) + + # self.log_test_plan(out) + resources = ResourceMonitor() + resources.start() + process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path, + text=True, env=env) + did_kill = False + timeout = 20 * config.kill_seconds if self.use_valgrind else config.kill_seconds + err_out: str + try: + _, err_out = process.communicate(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + _, err_out = process.communicate() + did_kill = True + resources.stop() + resources.join() + # we're rounding times up, otherwise we will prefer running very short tests (<1s) + self.run_time = math.ceil(resources.time()) + self.summary.runtime = resources.time() + self.summary.max_rss = 
resources.max_rss + self.summary.was_killed = did_kill + self.summary.valgrind_out_file = valgrind_file + self.summary.error_out = err_out + self.summary.summarize(self.temp_path, ' '.join(command)) + return self.summary.ok() + + +def decorate_summary(out: SummaryTree, test_file: Path, seed: int, buggify: bool): + """Sometimes a test can crash before ProgramStart is written to the traces. These + tests are then hard to reproduce (they can be reproduced through TestHarness but + require the user to run in the joshua docker container). To account for this we + will write the necessary information into the attributes if it is missing.""" + if 'TestFile' not in out.attributes: + out.attributes['TestFile'] = str(test_file) + if 'RandomSeed' not in out.attributes: + out.attributes['RandomSeed'] = str(seed) + if 'BuggifyEnabled' not in out.attributes: + out.attributes['BuggifyEnabled'] = '1' if buggify else '0' + + +class TestRunner: + def __init__(self): + self.uid = uuid.uuid4() + self.test_path: Path = Path('tests') + self.cluster_file: str | None = None + self.fdb_app_dir: str | None = None + self.binary_chooser = OldBinaries() + self.test_picker = TestPicker(self.test_path) + + def backup_sim_dir(self, seed: int): + temp_dir = config.run_dir / str(self.uid) + src_dir = temp_dir / 'simfdb' + assert src_dir.is_dir() + dest_dir = temp_dir / 'simfdb.{}'.format(seed) + assert not dest_dir.exists() + shutil.copytree(src_dir, dest_dir) + + def restore_sim_dir(self, seed: int): + temp_dir = config.run_dir / str(self.uid) + src_dir = temp_dir / 'simfdb.{}'.format(seed) + assert src_dir.exists() + dest_dir = temp_dir / 'simfdb' + shutil.rmtree(dest_dir) + shutil.move(src_dir, dest_dir) + + def run_tests(self, test_files: List[Path], seed: int, test_picker: TestPicker) -> bool: + result: bool = True + for count, file in enumerate(test_files): + will_restart = count + 1 < len(test_files) + binary = self.binary_chooser.choose_binary(file) + unseed_check = not is_no_sim(file) and config.random.random() < config.unseed_check_ratio + buggify_enabled: bool = config.random.random() < config.buggify_on_ratio + if unseed_check and count != 0: + # for restarting tests we will need to restore the sim2 after the first run + self.backup_sim_dir(seed + count - 1) + run = TestRun(binary, file.absolute(), seed + count, self.uid, restarting=count != 0, + stats=test_picker.dump_stats(), will_restart=will_restart, buggify_enabled=buggify_enabled) + result = result and run.success + test_picker.add_time(test_files[0], run.run_time, run.summary.out) + decorate_summary(run.summary.out, file, seed + count, run.buggify_enabled) + if unseed_check and run.summary.unseed: + run.summary.out.append(run.summary.list_simfdb()) + run.summary.out.dump(sys.stdout) + if not result: + return False + if unseed_check and run.summary.unseed is not None: + if count != 0: + self.restore_sim_dir(seed + count - 1) + run2 = TestRun(binary, file.absolute(), seed + count, self.uid, restarting=count != 0, + stats=test_picker.dump_stats(), expected_unseed=run.summary.unseed, + will_restart=will_restart, buggify_enabled=buggify_enabled) + test_picker.add_time(file, run2.run_time, run.summary.out) + decorate_summary(run2.summary.out, file, seed + count, run.buggify_enabled) + run2.summary.out.dump(sys.stdout) + result = result and run2.success + if not result: + return False + return result + + def run(self) -> bool: + seed = config.random_seed if config.random_seed is not None else config.random.randint(0, 2 ** 32 - 1) + test_files = 
self.test_picker.choose_test() + success = self.run_tests(test_files, seed, self.test_picker) + if config.clean_up: + shutil.rmtree(config.run_dir / str(self.uid)) + return success diff --git a/contrib/TestHarness2/test_harness/summarize.py b/contrib/TestHarness2/test_harness/summarize.py new file mode 100644 index 0000000000..54b2f799b5 --- /dev/null +++ b/contrib/TestHarness2/test_harness/summarize.py @@ -0,0 +1,620 @@ +from __future__ import annotations + +import collections +import inspect +import json +import os +import re +import sys +import traceback +import uuid +import xml.sax +import xml.sax.handler +import xml.sax.saxutils + +from pathlib import Path +from typing import List, Dict, TextIO, Callable, Optional, OrderedDict, Any, Tuple, Iterator, Iterable + +from test_harness.config import config +from test_harness.valgrind import parse_valgrind_output + + +class SummaryTree: + def __init__(self, name: str): + self.name = name + self.children: List[SummaryTree] = [] + self.attributes: Dict[str, str] = {} + + def append(self, element: SummaryTree): + self.children.append(element) + + def to_dict(self, add_name: bool = True) -> Dict[str, Any] | List[Any]: + if len(self.children) > 0 and len(self.attributes) == 0: + children = [] + for child in self.children: + children.append(child.to_dict()) + if add_name: + return {self.name: children} + else: + return children + res: Dict[str, Any] = {} + if add_name: + res['Type'] = self.name + for k, v in self.attributes.items(): + res[k] = v + children = [] + child_keys: Dict[str, int] = {} + for child in self.children: + if child.name in child_keys: + child_keys[child.name] += 1 + else: + child_keys[child.name] = 1 + for child in self.children: + if child_keys[child.name] == 1 and child.name not in self.attributes: + res[child.name] = child.to_dict(add_name=False) + else: + children.append(child.to_dict()) + if len(children) > 0: + res['children'] = children + return res + + def to_json(self, out: TextIO, prefix: str = ''): + res = json.dumps(self.to_dict(), indent=(' ' if config.pretty_print else None)) + for line in res.splitlines(False): + out.write('{}{}\n'.format(prefix, line)) + + def to_xml(self, out: TextIO, prefix: str = ''): + # minidom doesn't support omitting the xml declaration which is a problem for joshua + # However, our xml is very simple and therefore serializing manually is easy enough + attrs = [] + print_width = 120 + try: + print_width, _ = os.get_terminal_size() + except OSError: + pass + for k, v in self.attributes.items(): + attrs.append('{}={}'.format(k, xml.sax.saxutils.quoteattr(v))) + elem = '{}<{}{}'.format(prefix, self.name, ('' if len(attrs) == 0 else ' ')) + out.write(elem) + if config.pretty_print: + curr_line_len = len(elem) + for i in range(len(attrs)): + attr_len = len(attrs[i]) + if i == 0 or attr_len + curr_line_len + 1 <= print_width: + if i != 0: + out.write(' ') + out.write(attrs[i]) + curr_line_len += attr_len + else: + out.write('\n') + out.write(' ' * len(elem)) + out.write(attrs[i]) + curr_line_len = len(elem) + attr_len + else: + out.write(' '.join(attrs)) + if len(self.children) == 0: + out.write('/>') + else: + out.write('>') + for child in self.children: + if config.pretty_print: + out.write('\n') + child.to_xml(out, prefix=(' {}'.format(prefix) if config.pretty_print else prefix)) + if len(self.children) > 0: + out.write('{}{}'.format(('\n' if config.pretty_print else ''), prefix, self.name)) + + def dump(self, out: TextIO, prefix: str = '', new_line: bool = True): + if config.output_format == 
'json': + self.to_json(out, prefix=prefix) + else: + self.to_xml(out, prefix=prefix) + if new_line: + out.write('\n') + + +ParserCallback = Callable[[Dict[str, str]], Optional[str]] + + +class ParseHandler: + def __init__(self, out: SummaryTree): + self.out = out + self.events: OrderedDict[Optional[Tuple[str, Optional[str]]], List[ParserCallback]] = collections.OrderedDict() + + def add_handler(self, attr: Tuple[str, Optional[str]], callback: ParserCallback) -> None: + self.events.setdefault(attr, []).append(callback) + + def _call(self, callback: ParserCallback, attrs: Dict[str, str]) -> str | None: + try: + return callback(attrs) + except Exception as e: + _, _, exc_traceback = sys.exc_info() + child = SummaryTree('NonFatalParseError') + child.attributes['Severity'] = '30' + child.attributes['ErrorMessage'] = str(e) + child.attributes['Trace'] = repr(traceback.format_tb(exc_traceback)) + self.out.append(child) + return None + + def handle(self, attrs: Dict[str, str]): + if None in self.events: + for callback in self.events[None]: + self._call(callback, attrs) + for k, v in attrs.items(): + if (k, None) in self.events: + for callback in self.events[(k, None)]: + remap = self._call(callback, attrs) + if remap is not None: + v = remap + attrs[k] = v + if (k, v) in self.events: + for callback in self.events[(k, v)]: + remap = self._call(callback, attrs) + if remap is not None: + v = remap + attrs[k] = v + + +class Parser: + def parse(self, file: TextIO, handler: ParseHandler) -> None: + pass + + +class XmlParser(Parser, xml.sax.handler.ContentHandler): + def __init__(self): + super().__init__() + self.handler: ParseHandler | None = None + + def parse(self, file: TextIO, handler: ParseHandler) -> None: + xml.sax.parse(file, self) + + def startElement(self, name, attrs) -> None: + attributes: Dict[str, str] = {} + for name in attrs.getNames(): + attributes[name] = attrs.getValue(name) + assert self.handler is not None + self.handler.handle(attributes) + + +class JsonParser(Parser): + def __init__(self): + super().__init__() + + def parse(self, file: TextIO, handler: ParseHandler): + for line in file: + obj = json.loads(line) + handler.handle(obj) + + +class Coverage: + def __init__(self, file: str, line: str | int, comment: str | None = None): + self.file = file + self.line = int(line) + self.comment = comment + + def to_tuple(self) -> Tuple[str, int, str | None]: + return self.file, self.line, self.comment + + def __eq__(self, other) -> bool: + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() == other + elif isinstance(other, Coverage): + return self.to_tuple() == other.to_tuple() + else: + return False + + def __lt__(self, other) -> bool: + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() < other + elif isinstance(other, Coverage): + return self.to_tuple() < other.to_tuple() + else: + return False + + def __le__(self, other) -> bool: + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() <= other + elif isinstance(other, Coverage): + return self.to_tuple() <= other.to_tuple() + else: + return False + + def __gt__(self, other: Coverage) -> bool: + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() > other + elif isinstance(other, Coverage): + return self.to_tuple() > other.to_tuple() + else: + return False + + def __ge__(self, other): + if isinstance(other, tuple) and len(other) == 3: + return self.to_tuple() >= other + elif isinstance(other, Coverage): + return self.to_tuple() >= other.to_tuple() + 
else: + return False + + def __hash__(self): + return hash((self.file, self.line, self.comment)) + + +class TraceFiles: + def __init__(self, path: Path): + self.path: Path = path + self.timestamps: List[int] = [] + self.runs: OrderedDict[int, List[Path]] = collections.OrderedDict() + trace_expr = re.compile(r'trace.*\.(json|xml)') + for file in self.path.iterdir(): + if file.is_file() and trace_expr.match(file.name) is not None: + ts = int(file.name.split('.')[6]) + if ts in self.runs: + self.runs[ts].append(file) + else: + self.timestamps.append(ts) + self.runs[ts] = [file] + self.timestamps.sort(reverse=True) + + def __getitem__(self, idx: int) -> List[Path]: + res = self.runs[self.timestamps[idx]] + res.sort() + return res + + def __len__(self) -> int: + return len(self.runs) + + def items(self) -> Iterator[List[Path]]: + class TraceFilesIterator(Iterable[List[Path]]): + def __init__(self, trace_files: TraceFiles): + self.current = 0 + self.trace_files: TraceFiles = trace_files + + def __iter__(self): + return self + + def __next__(self) -> List[Path]: + if len(self.trace_files) <= self.current: + raise StopIteration + self.current += 1 + return self.trace_files[self.current - 1] + return TraceFilesIterator(self) + + +class Summary: + def __init__(self, binary: Path, runtime: float = 0, max_rss: int | None = None, + was_killed: bool = False, uid: uuid.UUID | None = None, expected_unseed: int | None = None, + exit_code: int = 0, valgrind_out_file: Path | None = None, stats: str | None = None, + error_out: str = None, will_restart: bool = False): + self.binary = binary + self.runtime: float = runtime + self.max_rss: int | None = max_rss + self.was_killed: bool = was_killed + self.expected_unseed: int | None = expected_unseed + self.exit_code: int = exit_code + self.out: SummaryTree = SummaryTree('Test') + self.test_begin_found: bool = False + self.test_end_found: bool = False + self.unseed: int | None = None + self.valgrind_out_file: Path | None = valgrind_out_file + self.severity_map: OrderedDict[tuple[str, int], int] = collections.OrderedDict() + self.error: bool = False + self.errors: int = 0 + self.warnings: int = 0 + self.coverage: OrderedDict[Coverage, bool] = collections.OrderedDict() + self.test_count: int = 0 + self.tests_passed: int = 0 + self.error_out = error_out + self.stderr_severity: str = '40' + self.will_restart: bool = will_restart + self.test_dir: Path | None = None + + if uid is not None: + self.out.attributes['TestUID'] = str(uid) + if stats is not None: + self.out.attributes['Statistics'] = stats + self.out.attributes['JoshuaSeed'] = str(config.joshua_seed) + self.out.attributes['WillRestart'] = '1' if self.will_restart else '0' + + self.handler = ParseHandler(self.out) + self.register_handlers() + + def summarize_files(self, trace_files: List[Path]): + assert len(trace_files) > 0 + for f in trace_files: + self.parse_file(f) + self.done() + + def summarize(self, trace_dir: Path, command: str): + self.test_dir = trace_dir + trace_files = TraceFiles(trace_dir) + if len(trace_files) == 0: + self.error = True + child = SummaryTree('NoTracesFound') + child.attributes['Severity'] = '40' + child.attributes['Path'] = str(trace_dir.absolute()) + child.attributes['Command'] = command + self.out.append(child) + return + self.summarize_files(trace_files[0]) + if config.joshua_dir is not None: + import test_harness.fdb + test_harness.fdb.write_coverage(config.cluster_file, + test_harness.fdb.str_to_tuple(config.joshua_dir) + ('coverage',), + 
test_harness.fdb.str_to_tuple(config.joshua_dir) + ('coverage-metadata',), + self.coverage) + + def list_simfdb(self) -> SummaryTree: + res = SummaryTree('SimFDB') + res.attributes['TestDir'] = str(self.test_dir) + if self.test_dir is None: + return res + simfdb = self.test_dir / Path('simfdb') + if not simfdb.exists(): + res.attributes['NoSimDir'] = "simfdb doesn't exist" + return res + elif not simfdb.is_dir(): + res.attributes['NoSimDir'] = 'simfdb is not a directory' + return res + for file in simfdb.iterdir(): + child = SummaryTree('Directory' if file.is_dir() else 'File') + child.attributes['Name'] = file.name + res.append(child) + return res + + def ok(self): + return not self.error + + def done(self): + if config.print_coverage: + for k, v in self.coverage.items(): + child = SummaryTree('CodeCoverage') + child.attributes['File'] = k.file + child.attributes['Line'] = str(k.line) + if not v: + child.attributes['Covered'] = '0' + if k.comment is not None and len(k.comment): + child.attributes['Comment'] = k.comment + self.out.append(child) + if self.warnings > config.max_warnings: + child = SummaryTree('WarningLimitExceeded') + child.attributes['Severity'] = '30' + child.attributes['WarningCount'] = str(self.warnings) + self.out.append(child) + if self.errors > config.max_errors: + child = SummaryTree('ErrorLimitExceeded') + child.attributes['Severity'] = '40' + child.attributes['ErrorCount'] = str(self.errors) + self.out.append(child) + if self.was_killed: + child = SummaryTree('ExternalTimeout') + child.attributes['Severity'] = '40' + self.out.append(child) + self.error = True + if self.max_rss is not None: + self.out.attributes['PeakMemory'] = str(self.max_rss) + if self.valgrind_out_file is not None: + try: + valgrind_errors = parse_valgrind_output(self.valgrind_out_file) + for valgrind_error in valgrind_errors: + if valgrind_error.kind.startswith('Leak'): + continue + self.error = True + child = SummaryTree('ValgrindError') + child.attributes['Severity'] = '40' + child.attributes['What'] = valgrind_error.what.what + child.attributes['Backtrace'] = valgrind_error.what.backtrace + aux_count = 0 + for aux in valgrind_error.aux: + child.attributes['WhatAux{}'.format(aux_count)] = aux.what + child.attributes['BacktraceAux{}'.format(aux_count)] = aux.backtrace + aux_count += 1 + self.out.append(child) + except Exception as e: + self.error = True + child = SummaryTree('ValgrindParseError') + child.attributes['Severity'] = '40' + child.attributes['ErrorMessage'] = str(e) + _, _, exc_traceback = sys.exc_info() + child.attributes['Trace'] = repr(traceback.format_tb(exc_traceback)) + self.out.append(child) + if not self.test_end_found: + child = SummaryTree('TestUnexpectedlyNotFinished') + child.attributes['Severity'] = '40' + self.out.append(child) + if self.error_out is not None and len(self.error_out) > 0: + lines = self.error_out.splitlines() + stderr_bytes = 0 + for line in lines: + if line.endswith("WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"): + # When running ASAN we expect to see this message. Boost coroutine should be using the correct asan annotations so that it shouldn't produce any false positives. 
+ continue + if line.endswith("Warning: unimplemented fcntl command: 1036"): + # Valgrind produces this warning when F_SET_RW_HINT is used + continue + if self.stderr_severity == '40': + self.error = True + remaining_bytes = config.max_stderr_bytes - stderr_bytes + if remaining_bytes > 0: + out_err = line[0:remaining_bytes] + ('...' if len(line) > remaining_bytes else '') + child = SummaryTree('StdErrOutput') + child.attributes['Severity'] = self.stderr_severity + child.attributes['Output'] = out_err + self.out.append(child) + stderr_bytes += len(line) + if stderr_bytes > config.max_stderr_bytes: + child = SummaryTree('StdErrOutputTruncated') + child.attributes['Severity'] = self.stderr_severity + child.attributes['BytesRemaining'] = str(stderr_bytes - config.max_stderr_bytes) + self.out.append(child) + + self.out.attributes['Ok'] = '1' if self.ok() else '0' + if not self.ok(): + reason = 'Unknown' + if self.error: + reason = 'ProducedErrors' + elif not self.test_end_found: + reason = 'TestDidNotFinish' + elif self.tests_passed == 0: + reason = 'NoTestsPassed' + elif self.test_count != self.tests_passed: + reason = 'Expected {} tests to pass, but only {} did'.format(self.test_count, self.tests_passed) + self.out.attributes['FailReason'] = reason + + def parse_file(self, file: Path): + parser: Parser + if file.suffix == '.json': + parser = JsonParser() + elif file.suffix == '.xml': + parser = XmlParser() + else: + child = SummaryTree('TestHarnessBug') + child.attributes['File'] = __file__ + frame = inspect.currentframe() + if frame is not None: + child.attributes['Line'] = str(inspect.getframeinfo(frame).lineno) + child.attributes['Details'] = 'Unexpected suffix {} for file {}'.format(file.suffix, file.name) + self.error = True + self.out.append(child) + return + with file.open('r') as f: + try: + parser.parse(f, self.handler) + except Exception as e: + child = SummaryTree('SummarizationError') + child.attributes['Severity'] = '40' + child.attributes['ErrorMessage'] = str(e) + self.out.append(child) + + def register_handlers(self): + def remap_event_severity(attrs): + if 'Type' not in attrs or 'Severity' not in attrs: + return None + k = (attrs['Type'], int(attrs['Severity'])) + if k in self.severity_map: + return str(self.severity_map[k]) + + self.handler.add_handler(('Severity', None), remap_event_severity) + + def program_start(attrs: Dict[str, str]): + if self.test_begin_found: + return + self.test_begin_found = True + self.out.attributes['RandomSeed'] = attrs['RandomSeed'] + self.out.attributes['SourceVersion'] = attrs['SourceVersion'] + self.out.attributes['Time'] = attrs['ActualTime'] + self.out.attributes['BuggifyEnabled'] = attrs['BuggifyEnabled'] + self.out.attributes['DeterminismCheck'] = '0' if self.expected_unseed is None else '1' + if self.binary.name != 'fdbserver': + self.out.attributes['OldBinary'] = self.binary.name + if 'FaultInjectionEnabled' in attrs: + self.out.attributes['FaultInjectionEnabled'] = attrs['FaultInjectionEnabled'] + + self.handler.add_handler(('Type', 'ProgramStart'), program_start) + + def set_test_file(attrs: Dict[str, str]): + test_file = Path(attrs['TestFile']) + cwd = Path('.').absolute() + try: + test_file = test_file.relative_to(cwd) + except ValueError: + pass + self.out.attributes['TestFile'] = str(test_file) + + self.handler.add_handler(('Type', 'Simulation'), set_test_file) + self.handler.add_handler(('Type', 'NonSimulationTest'), set_test_file) + + def set_elapsed_time(attrs: Dict[str, str]): + if self.test_end_found: + return + 
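+            # The first ElapsedTime event marks the end of the test: record the unseed
+            # and, when this is a determinism re-run, flag any mismatch against the
+            # expected unseed.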
self.test_end_found = True + self.unseed = int(attrs['RandomUnseed']) + if self.expected_unseed is not None and self.unseed != self.expected_unseed: + severity = 40 if ('UnseedMismatch', 40) not in self.severity_map \ + else self.severity_map[('UnseedMismatch', 40)] + if severity >= 30: + child = SummaryTree('UnseedMismatch') + child.attributes['Unseed'] = str(self.unseed) + child.attributes['ExpectedUnseed'] = str(self.expected_unseed) + child.attributes['Severity'] = str(severity) + if severity >= 40: + self.error = True + self.out.append(child) + self.out.attributes['SimElapsedTime'] = attrs['SimTime'] + self.out.attributes['RealElapsedTime'] = attrs['RealTime'] + if self.unseed is not None: + self.out.attributes['RandomUnseed'] = str(self.unseed) + + self.handler.add_handler(('Type', 'ElapsedTime'), set_elapsed_time) + + def parse_warning(attrs: Dict[str, str]): + self.warnings += 1 + if self.warnings > config.max_warnings: + return + child = SummaryTree(attrs['Type']) + for k, v in attrs.items(): + if k != 'Type': + child.attributes[k] = v + self.out.append(child) + + self.handler.add_handler(('Severity', '30'), parse_warning) + + def parse_error(attrs: Dict[str, str]): + self.errors += 1 + self.error = True + if self.errors > config.max_errors: + return + child = SummaryTree(attrs['Type']) + for k, v in attrs.items(): + child.attributes[k] = v + self.out.append(child) + + self.handler.add_handler(('Severity', '40'), parse_error) + + def coverage(attrs: Dict[str, str]): + covered = True + if 'Covered' in attrs: + covered = int(attrs['Covered']) != 0 + comment = '' + if 'Comment' in attrs: + comment = attrs['Comment'] + c = Coverage(attrs['File'], attrs['Line'], comment) + if covered or c not in self.coverage: + self.coverage[c] = covered + + self.handler.add_handler(('Type', 'CodeCoverage'), coverage) + + def expected_test_pass(attrs: Dict[str, str]): + self.test_count = int(attrs['Count']) + + self.handler.add_handler(('Type', 'TestsExpectedToPass'), expected_test_pass) + + def test_passed(attrs: Dict[str, str]): + if attrs['Passed'] == '1': + self.tests_passed += 1 + + self.handler.add_handler(('Type', 'TestResults'), test_passed) + + def remap_event_severity(attrs: Dict[str, str]): + self.severity_map[(attrs['TargetEvent'], int(attrs['OriginalSeverity']))] = int(attrs['NewSeverity']) + + self.handler.add_handler(('Type', 'RemapEventSeverity'), remap_event_severity) + + def buggify_section(attrs: Dict[str, str]): + if attrs['Type'] == 'FaultInjected' or attrs.get('Activated', '0') == '1': + child = SummaryTree(attrs['Type']) + child.attributes['File'] = attrs['File'] + child.attributes['Line'] = attrs['Line'] + self.out.append(child) + self.handler.add_handler(('Type', 'BuggifySection'), buggify_section) + self.handler.add_handler(('Type', 'FaultInjected'), buggify_section) + + def running_unit_test(attrs: Dict[str, str]): + child = SummaryTree('RunningUnitTest') + child.attributes['Name'] = attrs['Name'] + child.attributes['File'] = attrs['File'] + child.attributes['Line'] = attrs['Line'] + self.handler.add_handler(('Type', 'RunningUnitTest'), running_unit_test) + + def stderr_severity(attrs: Dict[str, str]): + if 'NewSeverity' in attrs: + self.stderr_severity = attrs['NewSeverity'] + self.handler.add_handler(('Type', 'StderrSeverity'), stderr_severity) diff --git a/contrib/TestHarness2/test_harness/test_valgrind_parser.py b/contrib/TestHarness2/test_harness/test_valgrind_parser.py new file mode 100644 index 0000000000..0b36e8e6d5 --- /dev/null +++ 
b/contrib/TestHarness2/test_harness/test_valgrind_parser.py @@ -0,0 +1,16 @@ +import sys + +from test_harness.valgrind import parse_valgrind_output +from pathlib import Path + + +if __name__ == '__main__': + errors = parse_valgrind_output(Path(sys.argv[1])) + for valgrind_error in errors: + print('ValgrindError: what={}, kind={}'.format(valgrind_error.what.what, valgrind_error.kind)) + print('Backtrace: {}'.format(valgrind_error.what.backtrace)) + counter = 0 + for aux in valgrind_error.aux: + print('Aux {}:'.format(counter)) + print(' What: {}'.format(aux.what)) + print(' Backtrace: {}'.format(aux.backtrace)) diff --git a/contrib/TestHarness2/test_harness/timeout.py b/contrib/TestHarness2/test_harness/timeout.py new file mode 100644 index 0000000000..90af7096fd --- /dev/null +++ b/contrib/TestHarness2/test_harness/timeout.py @@ -0,0 +1,60 @@ +import argparse +import re +import sys + +from pathlib import Path +from test_harness.config import config +from test_harness.summarize import Summary, TraceFiles +from typing import Pattern, List + + +def files_matching(path: Path, pattern: Pattern, recurse: bool = True) -> List[Path]: + res: List[Path] = [] + for file in path.iterdir(): + if file.is_file() and pattern.match(file.name) is not None: + res.append(file) + elif file.is_dir() and recurse: + res += files_matching(file, pattern, recurse) + return res + + +def dirs_with_files_matching(path: Path, pattern: Pattern, recurse: bool = True) -> List[Path]: + res: List[Path] = [] + sub_directories: List[Path] = [] + has_file = False + for file in path.iterdir(): + if file.is_file() and pattern.match(file.name) is not None: + has_file = True + elif file.is_dir() and recurse: + sub_directories.append(file) + if has_file: + res.append(path) + if recurse: + for file in sub_directories: + res += dirs_with_files_matching(file, pattern, recurse=True) + res.sort() + return res + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('TestHarness Timeout', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + valgrind_files: List[Path] = [] + if config.use_valgrind: + valgrind_files = files_matching(Path.cwd(), re.compile(r'valgrind.*\.xml')) + + for directory in dirs_with_files_matching(Path.cwd(), re.compile(r'trace.*\.(json|xml)'), recurse=True): + trace_files = TraceFiles(directory) + for files in trace_files.items(): + if config.use_valgrind: + for valgrind_file in valgrind_files: + summary = Summary(Path('bin/fdbserver'), was_killed=True) + summary.valgrind_out_file = valgrind_file + summary.summarize_files(files) + summary.out.dump(sys.stdout) + else: + summary = Summary(Path('bin/fdbserver'), was_killed=True) + summary.summarize_files(files) + summary.out.dump(sys.stdout) diff --git a/contrib/TestHarness2/test_harness/valgrind.py b/contrib/TestHarness2/test_harness/valgrind.py new file mode 100644 index 0000000000..399b47c0cc --- /dev/null +++ b/contrib/TestHarness2/test_harness/valgrind.py @@ -0,0 +1,141 @@ +import enum +import xml +import xml.sax.handler +from pathlib import Path +from typing import List + + +class ValgrindWhat: + def __init__(self): + self.what: str = '' + self.backtrace: str = '' + + +class ValgrindError: + def __init__(self): + self.what: ValgrindWhat = ValgrindWhat() + self.kind: str = '' + self.aux: List[ValgrindWhat] = [] + + +# noinspection PyArgumentList +class ValgrindParseState(enum.Enum): + ROOT = enum.auto() + ERROR = enum.auto() + ERROR_AUX = enum.auto() + KIND = 
enum.auto() + WHAT = enum.auto() + TRACE = enum.auto() + AUX_WHAT = enum.auto() + STACK = enum.auto() + STACK_AUX = enum.auto() + STACK_IP = enum.auto() + STACK_IP_AUX = enum.auto() + + +class ValgrindHandler(xml.sax.handler.ContentHandler): + def __init__(self): + super().__init__() + self.stack: List[ValgrindError] = [] + self.result: List[ValgrindError] = [] + self.state_stack: List[ValgrindParseState] = [] + + def state(self) -> ValgrindParseState: + if len(self.state_stack) == 0: + return ValgrindParseState.ROOT + return self.state_stack[-1] + + @staticmethod + def from_content(content): + # pdb.set_trace() + if isinstance(content, bytes): + return content.decode() + assert isinstance(content, str) + return content + + def characters(self, content): + # pdb.set_trace() + state = self.state() + if len(self.state_stack) == 0: + return + else: + assert len(self.stack) > 0 + if state is ValgrindParseState.KIND: + self.stack[-1].kind += self.from_content(content) + elif state is ValgrindParseState.WHAT: + self.stack[-1].what.what += self.from_content(content) + elif state is ValgrindParseState.AUX_WHAT: + self.stack[-1].aux[-1].what += self.from_content(content) + elif state is ValgrindParseState.STACK_IP: + self.stack[-1].what.backtrace += self.from_content(content) + elif state is ValgrindParseState.STACK_IP_AUX: + self.stack[-1].aux[-1].backtrace += self.from_content(content) + + def startElement(self, name, attrs): + # pdb.set_trace() + if name == 'error': + self.stack.append(ValgrindError()) + self.state_stack.append(ValgrindParseState.ERROR) + if len(self.stack) == 0: + return + if name == 'kind': + self.state_stack.append(ValgrindParseState.KIND) + elif name == 'what': + self.state_stack.append(ValgrindParseState.WHAT) + elif name == 'auxwhat': + assert self.state() in [ValgrindParseState.ERROR, ValgrindParseState.ERROR_AUX] + self.state_stack.pop() + self.state_stack.append(ValgrindParseState.ERROR_AUX) + self.state_stack.append(ValgrindParseState.AUX_WHAT) + self.stack[-1].aux.append(ValgrindWhat()) + elif name == 'stack': + state = self.state() + assert state in [ValgrindParseState.ERROR, ValgrindParseState.ERROR_AUX] + if state == ValgrindParseState.ERROR: + self.state_stack.append(ValgrindParseState.STACK) + else: + self.state_stack.append(ValgrindParseState.STACK_AUX) + elif name == 'ip': + state = self.state() + assert state in [ValgrindParseState.STACK, ValgrindParseState.STACK_AUX] + if state == ValgrindParseState.STACK: + self.state_stack.append(ValgrindParseState.STACK_IP) + if len(self.stack[-1].what.backtrace) == 0: + self.stack[-1].what.backtrace = 'addr2line -e fdbserver.debug -p -C -f -i ' + else: + self.stack[-1].what.backtrace += ' ' + else: + self.state_stack.append(ValgrindParseState.STACK_IP_AUX) + if len(self.stack[-1].aux[-1].backtrace) == 0: + self.stack[-1].aux[-1].backtrace = 'addr2line -e fdbserver.debug -p -C -f -i ' + else: + self.stack[-1].aux[-1].backtrace += ' ' + + def endElement(self, name): + # pdb.set_trace() + if name == 'error': + self.result.append(self.stack.pop()) + self.state_stack.pop() + elif name == 'kind': + assert self.state() == ValgrindParseState.KIND + self.state_stack.pop() + elif name == 'what': + assert self.state() == ValgrindParseState.WHAT + self.state_stack.pop() + elif name == 'auxwhat': + assert self.state() == ValgrindParseState.AUX_WHAT + self.state_stack.pop() + elif name == 'stack': + assert self.state() in [ValgrindParseState.STACK, ValgrindParseState.STACK_AUX] + self.state_stack.pop() + elif name == 'ip': + 
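+            # closing an <ip> frame: drop STACK_IP / STACK_IP_AUX and fall back to the
+            # enclosing stack state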
self.state_stack.pop() + state = self.state() + assert state in [ValgrindParseState.STACK, ValgrindParseState.STACK_AUX] + + +def parse_valgrind_output(valgrind_out_file: Path) -> List[ValgrindError]: + handler = ValgrindHandler() + with valgrind_out_file.open('r') as f: + xml.sax.parse(f, handler) + return handler.result diff --git a/contrib/TestHarness2/test_harness/version.py b/contrib/TestHarness2/test_harness/version.py new file mode 100644 index 0000000000..fe04206a8a --- /dev/null +++ b/contrib/TestHarness2/test_harness/version.py @@ -0,0 +1,66 @@ +from functools import total_ordering +from pathlib import Path +from typing import Tuple + + +@total_ordering +class Version: + def __init__(self): + self.major: int = 0 + self.minor: int = 0 + self.patch: int = 0 + + def version_tuple(self): + return self.major, self.minor, self.patch + + def _compare(self, other) -> int: + lhs: Tuple[int, int, int] = self.version_tuple() + rhs: Tuple[int, int, int] + if isinstance(other, Version): + rhs = other.version_tuple() + else: + rhs = Version.parse(str(other)).version_tuple() + if lhs < rhs: + return -1 + elif lhs > rhs: + return 1 + else: + return 0 + + def __eq__(self, other) -> bool: + return self._compare(other) == 0 + + def __lt__(self, other) -> bool: + return self._compare(other) < 0 + + def __hash__(self): + return hash(self.version_tuple()) + + def __str__(self): + return format('{}.{}.{}'.format(self.major, self.minor, self.patch)) + + @staticmethod + def of_binary(binary: Path): + parts = binary.name.split('-') + if len(parts) != 2: + return Version.max_version() + return Version.parse(parts[1]) + + @staticmethod + def parse(version: str): + version_tuple = version.split('.') + self = Version() + self.major = int(version_tuple[0]) + if len(version_tuple) > 1: + self.minor = int(version_tuple[1]) + if len(version_tuple) > 2: + self.patch = int(version_tuple[2]) + return self + + @staticmethod + def max_version(): + self = Version() + self.major = 2**32 - 1 + self.minor = 2**32 - 1 + self.patch = 2**32 - 1 + return self diff --git a/contrib/crc32/CMakeLists.txt b/contrib/crc32/CMakeLists.txt index f93697c1a9..3fb87e1901 100644 --- a/contrib/crc32/CMakeLists.txt +++ b/contrib/crc32/CMakeLists.txt @@ -1,2 +1,8 @@ add_library(crc32 STATIC crc32.S crc32_wrapper.c crc32c.cpp) +if (CLANG) + # This is necessary for clang since the compiler reports that crc32_align is + # defined but not used. With -Werror, crc32 will not compile. + # TODO: Remove this when the upstream issue is repaired. + target_compile_options(crc32 PUBLIC -Wno-unused-function) +endif () target_include_directories(crc32 PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") diff --git a/contrib/ddsketch_calc.py b/contrib/ddsketch_calc.py new file mode 100644 index 0000000000..b113cb37dc --- /dev/null +++ b/contrib/ddsketch_calc.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# +# ddsketch_calc.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
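As an aside on the Version helper added in contrib/TestHarness2/test_harness/version.py above, here is a minimal usage sketch (assuming the package is importable as test_harness, e.g. with contrib/TestHarness2 on PYTHONPATH; binary names follow the fdbserver-<major>.<minor>.<patch> convention that OldBinaries relies on):

    from pathlib import Path
    from test_harness.version import Version

    assert Version.parse('7.1') == Version.parse('7.1.0')                 # missing parts default to 0
    assert Version.parse('6.3.25') < '7.1.0'                              # strings are re-parsed for comparison
    assert Version.of_binary(Path('fdbserver-6.3.25')) < Version.max_version()
    assert Version.of_binary(Path('fdbserver')) == Version.max_version()  # unparseable names map to max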
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +import math as m + + +# Implements a DDSketch class as desrcibed in: +# https://arxiv.org/pdf/1908.10693.pdf + +# This class has methods that use cubic interpolation to quickly compute log +# and inverse log. The coefficients A,B,C as well as correctingFactor are +# all constants used for interpolating. + +# The implementation for interpolation was originally seen here in: +# https://github.com/DataDog/sketches-java/ +# in the file CubicallyInterpolatedMapping.java + +class DDSketch(object): + A = 6.0 / 35.0 + B = -3.0 / 5.0 + C = 10.0 / 7.0 + EPS = 1e-18 + correctingFactor = 1.00988652862227438516 + offset = 0 + multiplier = 0 + gamma = 0 + + def __init__(self, errorGuarantee): + self.gamma = (1 + errorGuarantee) / (1 - errorGuarantee) + self.multiplier = (self.correctingFactor * m.log(2)) / m.log(self.gamma) + self.offset = self.getIndex(1.0 / self.EPS) + + def fastlog(self, value): + s = np.frexp(value) + e = s[1] + s = s[0] + s = s * 2 - 1 + return ((self.A * s + self.B) * s + self.C) * s + e - 1 + + def reverseLog(self, index): + exponent = m.floor(index) + d0 = self.B * self.B - 3 * self.A * self.C + d1 = 2 * self.B * self.B * self.B - 9 * self.A * self.B * self.C - 27 * self.A * self.A * (index - exponent) + p = np.cbrt((d1 - np.sqrt(d1 * d1 - 4 * d0 * d0 * d0)) / 2) + significandPlusOne = - (self.B + p + d0 / p) / (3 * self.A) + 1 + return np.ldexp(significandPlusOne / 2, exponent + 1) + + def getIndex(self, sample): + return m.ceil(self.fastlog(sample) * self.multiplier) + self.offset + + def getValue(self, idx): + return self.reverseLog((idx - self.offset) / self.multiplier) * 2.0 / (1 + self.gamma) + diff --git a/contrib/ddsketch_compare.py b/contrib/ddsketch_compare.py new file mode 100644 index 0000000000..d3b5f9942f --- /dev/null +++ b/contrib/ddsketch_compare.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# +# ddsketch_compare.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
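A minimal usage sketch for the DDSketch class added in contrib/ddsketch_calc.py above (values are hypothetical; run from contrib/ so the module resolves). The bucket index should round-trip a sample to within roughly the configured relative error:

    import ddsketch_calc as dd

    sketch = dd.DDSketch(0.005)        # 0.5% relative error guarantee
    sample = 1250.0                    # e.g. a latency in microseconds
    idx = sketch.getIndex(sample)      # value -> bucket index
    approx = sketch.getValue(idx)      # bucket index -> representative value
    print(idx, approx, abs(approx - sample) / sample)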
+# + +import argparse +import json +import numpy as np + + +# kullback-leibler divergence (or relative entropy) +def relative_entropy(p, q): + difference = 0.0 + for i in range(len(p)): + if p[i] != 0.0 and q[i] != 0.0: + difference += (p[i] * np.log2(p[i]/q[i])) + return difference + +# jensen-shannon divergence (or symmetric relative entropy) +def relative_entropy_symmetric(dd1, dd2): + # normalize p, q into distribution + sum1 = sum(dd1) + sum2 = sum(dd2) + + p = [dd1[i] / sum1 for i in range(len(dd1))] + q = [dd2[i] / sum2 for i in range(len(dd2))] + m = [0.5 * (p[i] + q[i]) for i in range(len(p))] + + return 0.5 * relative_entropy(p, m) + 0.5 * relative_entropy(q, m) + +# setup cmdline args +parser = argparse.ArgumentParser(description="Compares two DDSketch distributions") +parser.add_argument('--txn1', help='Transaction type for first file', required=True, type=str) +parser.add_argument('--txn2', help='Transaction type for second file', required=True, type=str) +parser.add_argument('--file1', help='Path to first ddsketch json', required=True, type=str) +parser.add_argument('--file2', help="Path to second ddsketch json'", required=True, type=str) +parser.add_argument("--op", help='Operation name', type=str) +args = parser.parse_args() + +f1 = open(args.file1) +f2 = open(args.file2) +data1 = json.load(f1) +data2 = json.load(f2) + +if data1[args.txn1][args.op]["errorGuarantee"] != data2[args.txn2][args.op]["errorGuarantee"]: + print("ERROR: The sketches have different error guarantees and cannot be compared!") + exit() + +b1 = data1[args.txn1][args.op]["buckets"] +b2 = data2[args.txn2][args.op]["buckets"] + +re = relative_entropy_symmetric(b1, b2) +print("The similarity is: ", round(re, 8)) +print("1 means least alike, 0 means most alike") \ No newline at end of file diff --git a/contrib/ddsketch_conversion.py b/contrib/ddsketch_conversion.py new file mode 100644 index 0000000000..5b9825f267 --- /dev/null +++ b/contrib/ddsketch_conversion.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +# +# ddsketch_conversion.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
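A quick, self-contained sanity check of the divergence helpers in contrib/ddsketch_compare.py above. The functions are lightly restated here because the script parses required command-line arguments at import time; the score ranges from 0 (identical distributions) to 1 (disjoint ones):

    import numpy as np

    def relative_entropy(p, q):
        difference = 0.0
        for i in range(len(p)):
            if p[i] != 0.0 and q[i] != 0.0:
                difference += p[i] * np.log2(p[i] / q[i])
        return difference

    def relative_entropy_symmetric(dd1, dd2):
        sum1, sum2 = sum(dd1), sum(dd2)
        p = [x / sum1 for x in dd1]
        q = [x / sum2 for x in dd2]
        m = [0.5 * (p[i] + q[i]) for i in range(len(p))]
        return 0.5 * relative_entropy(p, m) + 0.5 * relative_entropy(q, m)

    print(relative_entropy_symmetric([1, 2, 3], [1, 2, 3]))  # 0.0 -> most alike
    print(relative_entropy_symmetric([4, 0, 0], [0, 0, 4]))  # 1.0 -> least alike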
+# + +import argparse +import ddsketch_calc as dd + + +parser = argparse.ArgumentParser(description="Converts values to DDSketch buckets") +parser.add_argument('-e', '--error_guarantee', help='Error guarantee (default is 0.005)', required=False, type=float) +parser.add_argument('-v', '--value', help="Value", required=False, type=int) +parser.add_argument('-b', '--bucket', help='Bucket index', required=False, type=int) +args = parser.parse_args() + +error = 0.005 + +if args.error_guarantee is not None: + error = args.error_guarantee + +sketch = dd.DDSketch(error) + +if args.value is not None: + print("Bucket index for ", args.value) + print(sketch.getIndex(args.value)) + +if args.bucket is not None: + print("Value for bucket ", args.bucket) + print(sketch.getValue(args.bucket)) \ No newline at end of file diff --git a/contrib/export_graph.py b/contrib/export_graph.py new file mode 100644 index 0000000000..9cd369f7de --- /dev/null +++ b/contrib/export_graph.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# +# export_graph.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import json +import matplotlib.pyplot as plt +import argparse +import ddsketch_calc as dd + +# setup cmdline args +parser = argparse.ArgumentParser(description="Graphs DDSketch distribution") +parser.add_argument('-t', '--txn', help='Transaction type (ex: g8ui)', required=True, type=str) +parser.add_argument('--file', help='Path to ddsketch json', required=True, type=str) +parser.add_argument('--title', help='Title for the graph', required=False, type=str) +parser.add_argument('--savefig', help='Will save the plot to a file if set', type=str) +parser.add_argument('--op', help='Which OP to plot (casing matters)', type=str) +args = parser.parse_args() + + +# Opening JSON file +f = open(args.file) +data = json.load(f) + +# parse json and init sketch +buckets = data[args.t][args.op]["buckets"] +error = data[args.t][args.op]["errorGuarantee"] +sketch = dd.DDSketch(error) + +# trim the tails of the distribution +ls = [i for i, e in enumerate(buckets) if e != 0] +actual_data = buckets[ls[0]:ls[-1]+1] +indices = range(ls[0], ls[-1]+1) +actual_indices = [sketch.getValue(i) for i in indices] + +# configure the x-axis to make more sense +fig, ax = plt.subplots() +ax.ticklabel_format(useOffset=False, style='plain') +plt.plot(actual_indices, actual_data) +plt.xlabel("Latency (in us)") +plt.ylabel("Frequency count") + +plt_title = "Title" +if args.title is not None: + plt_title = args.title +plt.title(plt_title) +plt.xlim([actual_indices[0], actual_indices[-1]]) +if args.savefig is not None: + plt.savefig(args.savefig, format='png') +else: + plt.show() \ No newline at end of file diff --git a/contrib/libb64/CMakeLists.txt b/contrib/libb64/CMakeLists.txt new file mode 100644 index 0000000000..1ef665f079 --- /dev/null +++ b/contrib/libb64/CMakeLists.txt @@ -0,0 +1,2 @@ +add_library(libb64 STATIC 
cdecode.c cencode.c) +target_include_directories(libb64 PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") diff --git a/fdbclient/libb64/cdecode.c b/contrib/libb64/cdecode.c similarity index 56% rename from fdbclient/libb64/cdecode.c rename to contrib/libb64/cdecode.c index f10ecc3dcc..5a833ab689 100644 --- a/fdbclient/libb64/cdecode.c +++ b/contrib/libb64/cdecode.c @@ -5,18 +5,18 @@ This is part of the libb64 project, and has been placed in the public domain. For details, see http://sourceforge.net/projects/libb64 */ -#include "fdbclient/libb64/cdecode.h" +#include "libb64/cdecode.h" -int base64_decode_value(char value_in) { - static const char decoding[] = { 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -2, -1, - -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, - 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 }; - static const char decoding_size = sizeof(decoding); +int base64_decode_value(int value_in) { + static const int decoding[] = { 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -2, -1, + -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 }; + static const int decoding_size = sizeof(decoding) / sizeof(decoding[0]); value_in -= 43; - if (value_in < 0 || value_in > decoding_size) + if (value_in < 0 || value_in >= decoding_size) return -1; - return decoding[(int)value_in]; + return decoding[value_in]; } void base64_init_decodestate(base64_decodestate* state_in) { @@ -27,7 +27,7 @@ void base64_init_decodestate(base64_decodestate* state_in) { int base64_decode_block(const char* code_in, const int length_in, char* plaintext_out, base64_decodestate* state_in) { const char* codechar = code_in; char* plainchar = plaintext_out; - char fragment; + int fragment = 0; *plainchar = state_in->plainchar; @@ -40,9 +40,9 @@ int base64_decode_block(const char* code_in, const int length_in, char* plaintex state_in->plainchar = *plainchar; return plainchar - plaintext_out; } - fragment = (char)base64_decode_value(*codechar++); + fragment = base64_decode_value(*codechar++); } while (fragment < 0); - *plainchar = (fragment & 0x03f) << 2; + *plainchar = (char)((fragment & 0x03f) << 2); case step_b: do { if (codechar == code_in + length_in) { @@ -50,10 +50,10 @@ int base64_decode_block(const char* code_in, const int length_in, char* plaintex state_in->plainchar = *plainchar; return plainchar - plaintext_out; } - fragment = (char)base64_decode_value(*codechar++); + fragment = base64_decode_value(*codechar++); } while (fragment < 0); - *plainchar++ |= (fragment & 0x030) >> 4; - *plainchar = (fragment & 0x00f) << 4; + *plainchar++ |= (char)((fragment & 0x030) >> 4); + *plainchar = (char)((fragment & 0x00f) << 4); case step_c: do { if (codechar == code_in + length_in) { @@ -61,10 +61,10 @@ int base64_decode_block(const char* code_in, const int length_in, char* plaintex state_in->plainchar = *plainchar; return plainchar - plaintext_out; } - fragment = (char)base64_decode_value(*codechar++); + fragment = base64_decode_value(*codechar++); } while (fragment < 0); - *plainchar++ |= (fragment & 0x03c) >> 2; - *plainchar = (fragment & 0x003) << 6; + *plainchar++ |= (char)((fragment & 0x03c) >> 2); + *plainchar = (char)((fragment & 0x003) << 6); case step_d: do { 
if (codechar == code_in + length_in) { @@ -72,9 +72,9 @@ int base64_decode_block(const char* code_in, const int length_in, char* plaintex state_in->plainchar = *plainchar; return plainchar - plaintext_out; } - fragment = (char)base64_decode_value(*codechar++); + fragment = base64_decode_value(*codechar++); } while (fragment < 0); - *plainchar++ |= (fragment & 0x03f); + *plainchar++ |= (char)((fragment & 0x03f)); } } /* control should not reach here */ diff --git a/fdbclient/libb64/cencode.c b/contrib/libb64/cencode.c similarity index 98% rename from fdbclient/libb64/cencode.c rename to contrib/libb64/cencode.c index 7999b47d61..85e679c7c8 100644 --- a/fdbclient/libb64/cencode.c +++ b/contrib/libb64/cencode.c @@ -5,7 +5,7 @@ This is part of the libb64 project, and has been placed in the public domain. For details, see http://sourceforge.net/projects/libb64 */ -#include "fdbclient/libb64/cencode.h" +#include "libb64/cencode.h" const int CHARS_PER_LINE = 72; diff --git a/fdbclient/include/fdbclient/libb64/cdecode.h b/contrib/libb64/include/libb64/cdecode.h similarity index 93% rename from fdbclient/include/fdbclient/libb64/cdecode.h rename to contrib/libb64/include/libb64/cdecode.h index 26d5873f22..04655b3a95 100644 --- a/fdbclient/include/fdbclient/libb64/cdecode.h +++ b/contrib/libb64/include/libb64/cdecode.h @@ -17,7 +17,7 @@ typedef struct { void base64_init_decodestate(base64_decodestate* state_in); -int base64_decode_value(char value_in); +int base64_decode_value(int value_in); int base64_decode_block(const char* code_in, const int length_in, char* plaintext_out, base64_decodestate* state_in); diff --git a/fdbclient/include/fdbclient/libb64/cencode.h b/contrib/libb64/include/libb64/cencode.h similarity index 100% rename from fdbclient/include/fdbclient/libb64/cencode.h rename to contrib/libb64/include/libb64/cencode.h diff --git a/fdbclient/include/fdbclient/libb64/decode.h b/contrib/libb64/include/libb64/decode.h similarity index 98% rename from fdbclient/include/fdbclient/libb64/decode.h rename to contrib/libb64/include/libb64/decode.h index 84a3998d7a..696e3f1a4d 100644 --- a/fdbclient/include/fdbclient/libb64/decode.h +++ b/contrib/libb64/include/libb64/decode.h @@ -9,6 +9,7 @@ For details, see http://sourceforge.net/projects/libb64 #define BASE64_DECODE_H #include +#include "libb64/encode.h" namespace base64 { extern "C" { diff --git a/fdbclient/include/fdbclient/libb64/encode.h b/contrib/libb64/include/libb64/encode.h similarity index 100% rename from fdbclient/include/fdbclient/libb64/encode.h rename to contrib/libb64/include/libb64/encode.h diff --git a/contrib/md5/CMakeLists.txt b/contrib/md5/CMakeLists.txt new file mode 100644 index 0000000000..317065b5c1 --- /dev/null +++ b/contrib/md5/CMakeLists.txt @@ -0,0 +1,2 @@ +add_library(md5 STATIC md5.c) +target_include_directories(md5 PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") diff --git a/fdbclient/include/fdbclient/md5/md5.h b/contrib/md5/include/md5/md5.h similarity index 100% rename from fdbclient/include/fdbclient/md5/md5.h rename to contrib/md5/include/md5/md5.h diff --git a/fdbclient/md5/md5.c b/contrib/md5/md5.c similarity index 100% rename from fdbclient/md5/md5.c rename to contrib/md5/md5.c diff --git a/contrib/observability_splunk_dashboard/details.xml b/contrib/observability_splunk_dashboard/details.xml new file mode 100644 index 0000000000..70ff15883b --- /dev/null +++ b/contrib/observability_splunk_dashboard/details.xml @@ -0,0 +1,431 @@ +
+ + Details for FoundationDB Cluster +
+ + + * + + + + * + + + + + -60m@m + now + + + + + Default + 5 seconds + 1 minute + 10 minutes + 1 hour + 1 day + bins=100 + bins=100 + + + + All + Storage Server + Transaction Log + Proxy + Resolver + Master + Cluster Controller + Log Router + Data Distributor + Ratekeeper + Tester + + + + + * + + + + * + +
+ + + + Storage Queue Size + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?<InputRate>.*) (?<InputRoughness>.*) (?<InputCounter>.*)" | rex field=BytesDurable "(?<DurableRate>.*) (?<DurableRoughness>.*) (?<DurableCounter>.*)" | eval QueueSize=InputCounter-DurableCounter | timechart $Span$ avg(QueueSize) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Storage Input Rate + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?<InputRate>.*) (?<InputRoughness>.*) (?<InputCounter>.*)" | timechart $Span$ avg(InputRate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Storage Bytes Queried + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesQueried "(?<Rate>.*) (?<Roughness>.*) (?<Counter>.*)" | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Average Process CPU by Role (capped at 2; beware kernel bug) + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ avg(Cpu) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Max Process CPU by Role (capped at 2; beware kernel bug) + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ max(Cpu) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Disk Busyness + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=ProcessMetrics TrackLatestType=Original | eval DiskBusyPercentage=(Elapsed-DiskIdleSeconds)/Elapsed | timechart $Span$ avg(DiskBusyPercentage) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Max Run Loop Busyness by Role (for <=6.1, S2Pri1) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics NOT TrackLatestType=Rolled | eval Busyness=if(isnull(PriorityStarvedBelow1), if(isnull(PriorityBusy1), S2Pri1, PriorityBusy1/Elapsed), PriorityStarvedBelow1/Elapsed) | timechart $Span$ max(Busyness) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Max Run Loop Busyness by Priority (6.2+ only) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics TrackLatestType=Original | foreach PriorityBusy* [eval Busyness<<MATCHSTR>>=PriorityBusy<<MATCHSTR>>/Elapsed] | timechart $Span$ max(Busyness*) + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + TLog Queue Size + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval QueueSize=SharedBytesInput-SharedBytesDurable | timechart $Span$ avg(QueueSize) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Connection Timeouts (counted on both sides of connection) + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) $Roles$ host=$Host$ | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr) | rex field=WithAddr "(?<OtherAddr>[^:]*:[^:]*).*" | eval Machine=Machine+","+OtherAddr | makemv delim="," Machine | search Machine=$Machine$ | eval Count=1+SuppressedEventCount | timechart 
sum(Count) by Machine useother=f + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Pairwise Connection Timeouts Between Datacenters + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) host=* Machine=* NOT TrackLatestType=Rolled +| eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr) +| rex field=host "(?<Datacenter>..).*" +| eval Datacenter=if(isnotnull(pie_work_unit), pie_work_unit, Datacenter) +| rex field=WithAddr "(?<OtherIP>[^:]*):.*" +| join OtherIP + [search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled + | rex field=Machine "(?<OtherIP>[^:]*):.*" + | rex field=host "(?<OtherDatacenter>..).*" + | eval OtherDatacenter=if(isnotnull(pie_work_unit), pie_work_unit, OtherDatacenter)] +| eval DC1=if(Datacenter>OtherDatacenter, Datacenter, OtherDatacenter), DC2=if(Datacenter>OtherDatacenter, OtherDatacenter, Datacenter) +| eval Connection=DC1+" <-> " + DC2 +| eval Count=1+SuppressedEventCount +| timechart count by Connection + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Pairwise Connection Timeouts Between Known Server Processes (Sorted by Count, descending) + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut OR Type=ProcessMetrics) $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr), Reason=if(Type=="ConnectionTimedOut", "Timed out trying to connect", "Established connection timed out") | rex field=Machine "(?<IP>[^:]*):.*" | rex field=host "(?<Datacenter>..).*" | rex field=WithAddr "(?<OtherIP>[^:]*):.*" | eventstats values(Roles) as Roles by IP | join OtherIP [search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled | rex field=Machine "(?<OtherIP>[^:]*):.*" | rex field=host "(?<OtherDatacenter>..).*" | stats values(Roles) as OtherRoles by OtherIP, OtherDatacenter | eval OtherRoles="("+mvjoin(OtherRoles,",")+")"] | eval Roles="("+mvjoin(Roles,",")+")" | eval IP=Datacenter+": "+IP+" "+Roles, OtherIP=OtherDatacenter+": "+OtherIP+" "+OtherRoles | eval Addr1=if(IP>OtherIP, IP, OtherIP), Addr2=if(IP>OtherIP, OtherIP, IP) | eval Connection=Addr1+" <-> " + Addr2 | eval Count=1+SuppressedEventCount | stats sum(Count) as Count, values(Reason) as Reasons by Connection | sort -Count + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
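FDB trace counters such as `BytesInput` and `BytesDurable` are logged as a space-separated `rate roughness counter` triple, which is why the Storage Queue Size panel above splits them with `rex` and subtracts the durable counter from the input counter. The following is a minimal Python sketch of that arithmetic, for illustration only; the dict-shaped event is an assumption, and only the field names and the QueueSize formula come from the query above.

```python
# Illustrative sketch only: mirrors the rex/eval arithmetic of the
# "Storage Queue Size" panel. The dict-shaped event is an assumption;
# the field names and the QueueSize formula come from the query above.
def parse_counter(field: str):
    """FDB counter fields are logged as 'rate roughness counter'."""
    rate, roughness, counter = (float(x) for x in field.split())
    return rate, roughness, counter

def storage_queue_size(event: dict) -> float:
    """QueueSize = InputCounter - DurableCounter (bytes not yet durable)."""
    _, _, input_counter = parse_counter(event["BytesInput"])
    _, _, durable_counter = parse_counter(event["BytesDurable"])
    return input_counter - durable_counter

# Example: input counter 9.0e6, durable counter 8.5e6 -> queue of 5.0e5 bytes.
print(storage_queue_size({"BytesInput": "120000 0.5 9000000",
                          "BytesDurable": "118000 0.5 8500000"}))  # 500000.0
```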
+ + + + Lazy Deletion Rate (making space available for reuse) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=LazyDeletePages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time) as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Vacuuming Rate (shrinking file) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=VacuumedPages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time) as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Roles + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | makemv delim="," Roles | mvexpand Roles | timechart $Span$ distinct_count(Machine) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + + Slow Tasks (Sorted by Duration, Descending) + + index=$Index$ LogGroup=$LogGroup$ Type=SlowTask $Roles$ host=$Host$ Machine=$Machine$ | sort -Duration | table _time, Duration, Machine, TaskID, Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + +
+
+ + + Event Counts (Sorted by Severity and Count, Descending) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | stats count as Count by Type, Severity | sort -Severity, -Count + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+ + + Errors + + index=$Index$ LogGroup=$LogGroup$ Severity=40 $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | table _time, Type, Machine, Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + +
+
+
+ + + + Recoveries (Ignores Filters) + + index=$Index$ LogGroup=$LogGroup$ Type=MasterRecoveryState TrackLatestType=Original (StatusCode=0 OR StatusCode=11) | eval RecoveryResetInterval=10 | sort _time | streamstats earliest(_time) as RecoveryStart, count as EventCount reset_after="(StatusCode=11)" | where StatusCode=11 | eval EventCount=if(EventCount==1, 2, EventCount), RecoveryStart=if(RecoveryStart==_time, _time-RecoveryDuration, RecoveryStart) | sort -_time | streamstats current=f global=f window=1 first(RecoveryStart) as NextRecoveryStart | eval RecoverySpan=NextRecoveryStart-_time, FailedRecoveries=EventCount-2, SuccessfulRecoveries=1 | eval AvailableSeconds=if(RecoverySpan<RecoveryResetInterval, RecoverySpan, 0) | sort _time | streamstats earliest(RecoveryStart) as RecoveryStart, sum(FailedRecoveries) as FailedRecoveryCount, sum(SuccessfulRecoveries) as SuccessfulRecoveryCount, sum(AvailableSeconds) as AvailableSeconds reset_after="(NOT RecoverySpan < RecoveryResetInterval)" | where NOT RecoverySpan < RecoveryResetInterval | eval Duration=_time-RecoveryStart, StartTime=strftime(RecoveryStart, "%F %X.%Q"), ShortLivedRecoveryCount=SuccessfulRecoveryCount-1 | table StartTime, Duration, FailedRecoveryCount, ShortLivedRecoveryCount, AvailableSeconds | sort -StartTime + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
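The Recoveries panel above rebuilds recovery episodes from `MasterRecoveryState` events: `StatusCode=0` (reading_coordinated_state) marks the start of an attempt, `StatusCode=11` (fully_recovered) marks success, and the number of failed attempts falls out of the event count. Below is a simplified Python sketch of that grouping; it deliberately omits the 10-second `RecoveryResetInterval` folding of short-lived recoveries that the panel also performs.

```python
# Simplified sketch of the episode grouping in the "Recoveries" panel above.
# Input: (time, status_code) pairs from MasterRecoveryState, sorted by time.
# StatusCode 0 = reading_coordinated_state (attempt start), 11 = fully_recovered.
# The panel's folding of short-lived recoveries (RecoveryResetInterval) is omitted.
def recovery_episodes(events):
    episodes, attempts, start = [], 0, None
    for t, code in events:
        if code == 0:
            attempts += 1
            if start is None:
                start = t            # first attempt of this episode
        elif code == 11 and start is not None:
            episodes.append({
                "start": start,
                "duration": t - start,
                "failed_attempts": attempts - 1,  # all but the last attempt failed
            })
            attempts, start = 0, None
    return episodes

print(recovery_episodes([(100.0, 0), (103.0, 0), (105.5, 11)]))
# [{'start': 100.0, 'duration': 5.5, 'failed_attempts': 1}]
```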
+ + + Process (Re)starts + + index=$Index$ LogGroup=$LogGroup$ Type=ProgramStart TrackLatestType=Original $Roles$ host=$Host$ Machine=$Machine$ | table _time, Machine | sort -_time + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+ + + Failure Detection (Machine Filter Only) + + index=$Index$ LogGroup=$LogGroup$ Type=FailureDetectionStatus System=$Machine$ | sort _time | eval Failed=if(Status=="Failed", 1, 0) | streamstats current=t global=f window=2 first(Failed) as PrevFailed by System | where PrevFailed=1 OR Failed=1 | eval Failed=PrevFailed + "," + Failed | makemv delim="," Failed | mvexpand Failed | timechart $Span$ max(Failed) by System + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + +
+ + + + Storage Server Space Usage (Sorted by Available Space Percentage, Ascending) + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBStored=BytesStored/1e9, Overhead=KvstoreBytesUsed/BytesStored, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeSpacePercent) as FreeSpacePercent, latest(GBStored) as GBStored, latest(GBUsed) as GBUsed, latest(Overhead) as OverheadFactor, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
+ + + + TLog Server Space Usage (Sorted by Available Space Percentage, Ascending) + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics host=* Machine=* TrackLatestType=Original Roles=TL | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeDiskSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeDiskSpacePercent) as FreeDiskSpacePercent, latest(GBUsed) as GBUsed, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
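The two space-usage tables above are plain ratios over the latest `StorageMetrics`/`TLogMetrics` sample per machine. A worked example of the formulas, with made-up numbers:

```python
# Worked example of the space-usage ratios above (numbers are made up).
kvstore_bytes_total     = 1_000_000_000_000   # 1 TB of disk behind the KV store
kvstore_bytes_available =   250_000_000_000
kvstore_bytes_free      =   300_000_000_000
kvstore_bytes_used      =   700_000_000_000
bytes_stored            =   350_000_000_000   # logical KV bytes on this storage server

available_space_percent = kvstore_bytes_available / kvstore_bytes_total   # 0.25
free_space_percent      = kvstore_bytes_free / kvstore_bytes_total        # 0.30
gb_used                 = kvstore_bytes_used / 1e9                        # 700.0
overhead_factor         = kvstore_bytes_used / bytes_stored               # 2.0 disk bytes per logical byte
```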
+ + + + Data Movement by Type (Log Scale, Ignores Filters) + + index=$Index$ LogGroup=$LogGroup$ Type=MovingData TrackLatestType=Original | timechart avg(Priority*) as * + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Storage Server Max Bytes Stored by Host + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval GBStored=BytesStored/1e9 | timechart max(GBStored) by host limit=100 + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Master Failed Clients + + index=$Index$ LogGroup=$LogGroup$ Type=WaitFailureClient +| stats count by FailedEndpoint + $TimeRange.earliest$ + $TimeRange.latest$ + + +
+
+
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/performance_overview.xml b/contrib/observability_splunk_dashboard/performance_overview.xml new file mode 100644 index 0000000000..0719e2bbab --- /dev/null +++ b/contrib/observability_splunk_dashboard/performance_overview.xml @@ -0,0 +1,323 @@ +
+ +
+ + + * + + + + + + + + + -60m@m + now + + + + + Normal + Batch + + + + + 60s + +
+ + + Transaction Rate measured on Proxies + + Sum in $ChartBinSizeToken$ seconds + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " TxnThrottled +| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), TxnThrottledRate=mvindex(TxnThrottled, 0) +| timechart span=$ChartBinSizeToken$ sum(TxnRequestInRate) as StartedTxnBatchRate, sum(TxnRequestOutRate) as FinishedTxnBatchRate, sum(TxnStartInRate) as StartedTxnRate, sum(TxnStartOutRate) as FinishedTxnRate, sum(TxnThrottledRate) as ThrottledTxnRate + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Read Rate measured on Storage Servers + + Average in $ChartBinSizeToken$ seconds + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" +| rex field=BytesQueried "(?<RRate>.*) (?<RRoughness>.*) (?<RCounter>.*)" +| rex field=RowsQueried "(?<KRate>.*) (?<KRoughness>.*) (?<KCounter>.*)" +| rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" +| rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" +| timechart span=$ChartBinSizeToken$ avg(RRate) as BytesReadPerSecond, avg(KRate) as RowsReadPerSecond, avg(FRate) as DDReadPerSecond + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Write Rate measured on Proxies + + 1min Average + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " MutationBytes +| makemv delim=" " Mutations +| eval MutationBytesRate=mvindex(MutationBytes, 0), MutationsRate=mvindex(Mutations,0) +| bucket span=5s _time +| stats sum(MutationBytesRate) as MutationBytes, sum(MutationsRate) as Mutations by _time +|eval MutationMB=MutationBytes/1024/1024, MutationsK=Mutations/1000 +| timechart span=$ChartBinSizeToken$ avg(MutationMB) as MutationMB, avg(MutationsK) as MutationsK + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Write Rate measured on Storage Servers + + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" +| rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" +| rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" +| timechart span=$ChartBinSizeToken$ avg(WRate) as BytesPerSecond, avg(FRate) as DDBytesWrittenPerSecond + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + GRV Latency measured on all Proxies + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=GRVLatencyMetrics AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Commit Latency measured on all Proxies + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=CommitLatencyMetrics AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Read Latency measured on all Storage Servers + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=ReadLatencyMetrics AND 
TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + RateKeeper: ReleasedTPS vs LimitTPS + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time ReleasedTPS TPSLimit +| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + RateKeeper: Throttling Reason + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time Reason + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + + RateKeeper: Throttling Server + + Ratekeeper: Limit Reason: ReasonServerID (Most recent 10 records) + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate AND TrackLatestType="Original" +| streamstats count as numOfEvents +| where numOfEvents < 10 +| eval DateTime=strftime(Time, "%Y-%m-%dT%H:%M:%S") +| table DateTime, ReasonServerID + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + +
+
+
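The Transaction Rate and Write Rate panels above lean on the same counter-triple encoding: `mvindex(TxnStartIn, 0)` (or `mvindex(MutationBytes, 0)`) is the per-second rate reported by one proxy, and summing those rates across proxies gives the cluster-wide rate. A small Python sketch of that aggregation, using hypothetical per-proxy samples:

```python
# Sketch of the cluster-wide rate aggregation in the transaction/write-rate
# panels above: element 0 of each counter triple is the per-second rate on
# one proxy; summing across proxies gives the cluster rate.
# The sample events below are hypothetical.
def rate(field: str) -> float:
    """First element of the 'rate roughness counter' triple."""
    return float(field.split()[0])

proxy_events = [
    {"Machine": "10.0.0.1:4500", "TxnStartIn": "1200 0.2 86400000"},
    {"Machine": "10.0.0.2:4500", "TxnStartIn": "1350 0.3 91200000"},
    {"Machine": "10.0.0.3:4500", "TxnStartIn": "1100 0.1 79300000"},
]

cluster_started_txn_rate = sum(rate(e["TxnStartIn"]) for e in proxy_events)
print(cluster_started_txn_rate)  # 3650.0 transactions started per second
```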
+ + + Disk Overhead = Disk Usage / Logical KV Size + + Y-axis is capped at 10 + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type=StorageMetrics OR Type=DDTrackerStats) TrackLatestType=Original +| bucket _time span=5s +| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes, avg(TotalSizeBytes) as LogicalKVBytes by _time +| eval overhead=StorageDiskUsedBytes/LogicalKVBytes +| timechart avg(overhead) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + KV Data Size + + + index=$Index$ LogGroup=$LogGroup$ +Roles=*DD* host=* Machine=* Type=DDTrackerStats TrackLatestType=Original +| eval TotalKVGB=TotalSizeBytes/1024/1024/1024, SystemKVGB=SystemSizeBytes/1024/1024/1024 +|timechart avg(TotalKVGB), avg(SystemKVGB), avg(Shards) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Disk Usage + + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* Type=StorageMetrics TrackLatestType=Original +| bucket _time span=5s +| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes by _time +|eval StorageDiskTotalMB = StorageDiskTotalBytes/1024/1024, StorageDiskUsedMB=StorageDiskUsedBytes/1024/1024 +| timechart avg(StorageDiskTotalMB) as StorageDiskTotalMB, avg(StorageDiskUsedMB) as StorageDiskUsedMB + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Cluster Roles + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics TrackLatestType="Original" +| rex field=host "(?<HostDC>..).*-..(?<HostConfig>..).*" +| eval HostDC=if(isnotnull(pie_work_unit), pie_work_unit, HostDC) +| makemv delim="," Roles +| stats dc(Machine) as MachineCount by Roles, HostDC +| stats list(HostDC), list(MachineCount) by Roles +| sort Roles + $TimeSpan.earliest$ + $TimeSpan.latest$ + + +
+
+
+ + + Storage Engine + + + index=$Index$ LogGroup=$LogGroup$ Type=Role Origination=Recruited As=StorageServer | table StorageEngine, OriginalDateTime, DateTime |head 2 + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + +
+
+ + Cluster Generations + + Indicates FDB recoveries + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics | timechart max(Generation) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + +
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/ratekeeper.xml b/contrib/observability_splunk_dashboard/ratekeeper.xml new file mode 100644 index 0000000000..c4a31a8fbc --- /dev/null +++ b/contrib/observability_splunk_dashboard/ratekeeper.xml @@ -0,0 +1,928 @@ +
+ +
+ + + * + + + + + + + + + -60m@m + now + + + + + Normal + Batch + + + + + 30s + + + + Yes + No + + + + + MasterServer + MasterProxyServer + StorageServer + TLog + Resolver + GrvProxyServer + CommitProxyServer + + + + MasterServer + MasterProxyServer + Resolver + TLog + StorageServer + GrvProxyServer + CommitProxyServer + + + + MasterServer + MasterProxyServer + Resolver + TLog + StorageServer + GrvProxyServer + CommitProxyServer + +
+ + + Aggregated Storage Server Bandwidth + + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" + | rex field=BytesQueried "(?<RRate>.*) (?<RRoughness>.*) (?<RCounter>.*)" + | rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" + | rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" + | bin span=5s _time + | stats sum(RRate) as ReadSum, sum(WRate) as WriteSum, sum(FRate) as FetchedKeyRate by _time + | eval ReadSpeedMB=ReadSum/1024/1024, WriteSpeedMB=WriteSum/1024/1024, FetchedKeyRateMB=FetchedKeyRate/1024/1024 + |timechart avg(ReadSpeedMB), avg(WriteSpeedMB), avg(FetchedKeyRateMB) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Aggregated Proxy Bandwidth + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " MutationBytes +| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), MutationBytesRate=mvindex(MutationBytes, 0) +| bin span=60s _time +| stats avg(TxnRequestInRate) as TxnRequestInRatePerHost, avg(TxnRequestOutRate) as TxnRequestOutRatePerHost, avg(TxnStartInRate) as TxnStartInRatePerHost, avg(TxnStartOutRate) as TxnStartOutRatePerHost, avg(MutationBytesRate) as MutationBytesRatePerHost by Machine,_time +| eval WriteThroughputKB=sum(MutationBytesRatePerHost)/1000 +| timechart span=1m sum(TxnRequestInRatePerHost), sum(TxnRequestOutRatePerHost), sum(TxnStartInRatePerHost), sum(TxnStartOutRatePerHost), sum(WriteThroughputKB) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 1: Overview - GRV Arrivals and Leaves per Second Seen by Proxies + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| eval TxnRequestIn=mvindex(TxnRequestIn, 0), TxnRequestOut=mvindex(TxnRequestOut, 0), TxnStartIn=mvindex(TxnStartIn, 0), TxnStartOut=mvindex(TxnStartOut, 0) +| timechart span=30s avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) by Machine + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + Chart 2: RKOverview - Input ReleasedTPS and Output TPSLimit + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time ReleasedTPS TPSLimit +| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 3: RKOverview - RKLimitReason + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time Reason + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + + + Chart 4: Don't Process Transactions - RkSSListFetchTimeout (TpsLimit = 0) + + + index=$Index$ LogGroup=$LogGroup$ +Type="RkSSListFetchTimeout" +| timechart span=1s count + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 5: Don't Process Transactions - RkTlogMinFreeSpaceZero (TpsLimit = 0) + + + index=$Index$ LogGroup=$LogGroup$ +Type="RkTlogMinFreeSpaceZero" +| timechart span=1s count + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 6: Don't Process Transactions - ProxyGRVThresholdExceeded + + + 
index=$Index$ LogGroup=$LogGroup$ (Type="ProxyGRVThresholdExceeded*") AND TrackLatestType="Original" +| timechart span=1s count by Type + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 7: RKLimitReasonCandidate - LimitingStorageServerDurabilityLag (MVCCVersionInMemory) + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerDurabilityLag) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 8: RKLimitReasonCandidate - LimitingStorageServerVersionLag (TLogVer-SSVer) + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerVersionLag) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 9: RKLimitReasonCandidate - LimitingStorageServerQueue + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerQueue) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 10: Runtime Monitoring - StorageServer MVCCVersionInMemory (storage_server_durability_lag) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval NonDurableVersions=Version-DurableVersion +| timechart span=$ChartBinSizeToken$ limit=0 avg(NonDurableVersions) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 11: Runtime Monitoring - StorageServer LocalRate (higher MVCCVersionInMemory -> lower LocalRate) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" +| timechart limit=0 avg(LocalRate) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 12: Runtime Monitoring - StorageServer ReadsRejected (lower LocalRate -> higher probability of rejecting read)) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" +| timechart limit=0 avg(ReadsRejected) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 13: Runtime Monitoring - Version Lag between StorageServer and Tlog (storage_server_readable_behind) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval SSFallBehindVersions=VersionLag +| timechart span=$ChartBinSizeToken$ limit=0 avg(SSFallBehindVersions) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 14: Runtime Monitoring - StorageServerBytes (storage_server_write_queue_size) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| makemv delim=" " BytesInput | makemv delim=" " BytesDurable | makemv delim=" " BytesFetched | makemv delim=" " MutationBytes +| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesFetched=mvindex(BytesFetched, 2), MutationBytes=mvindex(MutationBytes, 2), BytesInMemoryQueue=BytesInput-BytesDurable +| timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 15: Runtime Monitoring - StorageServer KVStore Free Space Ratio (storage_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal +| 
timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 16: Runtime Monitoring - TLog Queue Free Space Ratio (log_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval QueueBytesFreeRatio=QueueDiskBytesFree/QueueDiskBytesTotal +| timechart span=$ChartBinSizeToken$ limit=0 avg(QueueBytesFreeRatio) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 17: Runtime Monitoring - TLog KVStore Free Space Ratio (log_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal +| timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 18: Runtime Monitoring - TLogBytes (log_server_write_queue) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| makemv delim=" " BytesInput +| makemv delim=" " BytesDurable +| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesInMemoryQueue=BytesInput-BytesDurable | timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 19: Runtime Monitoring - Proxy Throughput + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ limit=0 avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) avg(TxnStartBatch) avg(TxnStartErrors) avg(TxnCommitIn) avg(TxnCommitVersionAssigned) avg(TxnCommitResolving) avg(TxnCommitResolved) avg(TxnCommitOut) avg(TxnCommitOutSuccess) avg(TxnCommitErrors) avg(TxnThrottled) avg(TxnConflicts) avg(CommitBatchIn) avg(CommitBatchOut) avg(TxnRejectedForQueuedTooLong) avg(Mutations) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 20: Runtime Monitoring - Proxy Queue Length + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" | timechart span=$ChartBinSizeToken$ limit=0 avg(*QueueSize*) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 21: Runtime Monitoring - TLog UnpoppedVersion + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval UnpoppedVersion=PersistentDataDurableVersion-QueuePoppedVersion +| timechart span=$ChartBinSizeToken$ limit=0 avg(UnpoppedVersion) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 22: Runtime Monitoring - Storage Server Disk (AIODiskStall) + + + index=$Index$ LogGroup=$LogGroup$ Type="ProcessMetrics" +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As="StorageServer" + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ limit=0 avg(AIODiskStall) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 23: Runtime Monitoring - StorageServer Query Queue Length + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| makemv QueryQueue | eval QueryQueue=mvindex(QueryQueue, 1) | table _time QueryQueue Machine +| timechart span=$ChartBinSizeToken$ 
limit=0 avg(QueryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 24: Transaction Trace Stats - GRV Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (*ProxyServer.masterProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After) +| table Time Type ID Location Machine Roles +| append + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before) + | rename ID as ParentID + | table Time Type ParentID Location Machine Roles + | join ParentID + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID" + | rename ID as ParentID + | rename To as ID + | table ParentID ID] + | table Time Type ID Location Machine Roles] +| table Time Type ID Location Machine Roles +| sort 0 Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=TBegin +| bin bins=20 span=$StatsGRVSpanToken$ TimeSpan +| chart limit=0 count by TimeSpan $GRVByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 25: Transaction Trace Stats - GetValue Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + (storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After) +| table Machine Location Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Time Order +| stats list(*) by ID +| rename list(*) as * +| table Machine Location Time Roles ID Type +| eval count = mvcount(Location) +| search count>2 +| eval TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin +| table _time ID TimeSpan Machine Location Time +| bin bins=20 span=$StatsReadSpanToken$ TimeSpan +| chart limit=0 count by TimeSpan $GetValueByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 26: Transaction Trace Stats - Commit Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + Machine + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR *ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution) +| table Time Type ID Location Machine Roles +| sort 0 Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| 
rename list(*) as * +| eval Count=mvcount(Location) +| search Count>=2 +| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=TBegin +| table _time TimeSpan Machine +| bin bins=20 span=$StatsCommitSpanToken$ TimeSpan +| chart limit=0 count by TimeSpan $CommitByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 27: Transaction Tracing - GRV Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (*ProxyServer.*ProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After) +| table Time Type ID Location Machine Roles +| append + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before) + | rename ID as ParentID + | table Time Type ParentID Location Machine Roles + | join ParentID + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID" + | rename ID as ParentID + | rename To as ID + | table ParentID ID] + | table Time Type ID Location Machine Roles] +| table Time Type ID Location Machine Roles +| eval Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location="MasterProxyServer.masterProxyServerCore.Broadcast", 2, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6) +| table Time Order Type ID Location Machine Roles +| sort 0 Order Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), TimeInQueue = T2-T1, TimeGetVersionFromProxies = if(mvcount==4, T3-T2, -0.0000001), TimeConfirmLivenessFromTLogs = if(mvcount==4, T4-T3, T3-T2), TimeSpan=if(mvcount==4,T4-T1,T3-T1), _time=T1 +| table _time TimeSpan TimeInQueue TimeGetVersionFromProxies TimeConfirmLivenessFromTLogs Machine +| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeInQueue), avg(TimeGetVersionFromProxies), avg(TimeConfirmLivenessFromTLogs) $GRVLatencyByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 28: Transaction Tracing - GetValue Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After) +| table Machine Location Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Time Order +| stats list(*) by ID +| rename list(*) as * +| table Machine Location Time Roles ID Type +| eval count = mvcount(Location) +| search count>2 +| eval
TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin +| table _time TimeSpan +| timechart span=30s limit=0 avg(TimeSpan) $GetValueLatencyByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 29: Transaction Tracing - Commit Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR *ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution) +| table Time Type ID Location Machine Roles +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16) +| table Time Order Type ID Location Machine Roles +| sort 0 Time Order +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval Count=mvcount(Location) +| search Count=7 +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), T5=mvindex(Time, 4), T6=mvindex(Time, 5), T7=mvindex(Time, 6), TimeSpan=T7-T1, TimeResolution=T4-T3, TimePostResolution=T5-T4, TimeProcessingMutation=T6-T5, TimeTLogPush=T7-T6, _time=T1 +| table _time TimeSpan TimeResolution TimePostResolution TimeProcessingMutation TimeTLogPush Machine +| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeResolution), avg(TimePostResolution), avg(TimeProcessingMutation), avg(TimeTLogPush) $CommitByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 30: Transaction Tracing - Commit - TLogPush and Resolver Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + Step + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (Resolver.resolveBatch.Before OR Resolver.resolveBatch.AfterQueueSizeCheck OR Resolver.resolveBatch.AfterOrderer OR Resolver.resolveBatch.After OR TLog.tLogCommit.BeforeWaitForVersion OR TLog.tLogCommit.Before OR TLog.tLogCommit.AfterTLogCommit OR TLog.tLogCommit.After) +| table Time Type ID Location Machine Roles +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location=="MasterProxyServer.batcher", 1, Location=="MasterProxyServer.commitBatch.Before", 2, Location=="MasterProxyServer.commitBatch.GettingCommitVersion", 3, Location=="MasterProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 
5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location=="MasterProxyServer.commitBatch.AfterResolution", 8.5, Location=="MasterProxyServer.commitBatch.ProcessingMutations", 9, Location=="MasterProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location=="MasterProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16) +| table Time Order Type ID Location Machine Roles +| sort 0 Time Order +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval Count=mvcount(Location), Step=case(Count=4 and (mvindex(Location, 0) like "TLog%"), "TimeTLogCommit", Count=4 and (mvindex(Location, 0) like "Resolver%"), "TimeResolver", Count=10, "TimeSpan"), BeginTime=mvindex(Time, 0), EndTime=mvindex(Time, -1), Duration=EndTime-BeginTime, _time=BeginTime +| search Count=4 +| eval Machinei=mvindex(Machine, 0), MachineStep = Step."-".Machinei +| table _time Step Duration Machinei Location Machine MachineStep +| timechart span=$ChartBinSizeToken$ limit=0 avg(Duration) by $TLogResolverByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 31: Machine Performance - CPU Utilization (CPU Time divided by Elapsed) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory Elapsed +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| eval Utilization=CPUSeconds/Elapsed +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 32: Machine Performance - Memory Utilization (ResidentMemory divided by Memory) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| eval Utilization = ResidentMemory/Memory +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 33: Machine Performance - Disk Utilization ((DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table 
Machine] +| eval Utilization = (DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 34: Machine Performance - Network (Mbps Received and Mbps Sent) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ avg(MbpsReceived) avg(MbpsSent) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 35: Machine Performance - Disk (Reads Count and Writes Count) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ avg(DiskReadsCount) avg(DiskWritesCount) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 36: Network Performance - Timeout + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($SourcePerfConnectionToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($DestinationPerfConnectionToken$)) + | dedup ID + | rename Machine as PeerAddr] +| eval Connection=Machine."-".PeerAddr +| timechart useother=0 span=$ChartBinSizeToken$ count $TimeoutByConnectionToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 37: Network Performance - PingLatency + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=PingLatency) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($SourcePerfConnectionToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($DestinationPerfConnectionToken$)) + | dedup ID + | rename Machine as PeerAddr] +| eval Connection=Machine."-".PeerAddr +| timechart useother=0 span=$ChartBinSizeToken$ avg(MeanLatency) avg(MaxLatency) $PingLatencyByConnectionToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + +
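Charts 24-30 above all follow the same recipe: collect the `TransactionDebug`/`CommitDebug` events that share an ID, order them along the request path, and subtract the first timestamp from the last to get an end-to-end latency. A minimal Python sketch of that reduction, assuming events arrive as `(id, location, time)` tuples (the sample IDs and timestamps are invented):

```python
# Minimal sketch of the latency reconstruction behind Charts 24-30 above:
# group debug events by transaction ID, then take last time minus first time.
# The sample events are invented; only the grouping idea comes from the charts.
from collections import defaultdict

def latency_by_id(events):
    """events: iterable of (txn_id, location, time) tuples."""
    times = defaultdict(list)
    for txn_id, _location, t in events:
        times[txn_id].append(t)
    # Require at least two trace points per ID, as the charts do (Count >= 2).
    return {txn_id: max(ts) - min(ts) for txn_id, ts in times.items() if len(ts) >= 2}

print(latency_by_id([
    ("0xabc", "MasterProxyServer.commitBatch.Before", 10.001),
    ("0xabc", "TLog.tLogCommit.After",               10.018),
    ("0xabc", "NativeAPI.commit.After",              10.021),
]))  # {'0xabc': ~0.020} i.e. roughly 20 ms of commit latency
```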
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/recovery.xml b/contrib/observability_splunk_dashboard/recovery.xml new file mode 100644 index 0000000000..6ba6b9a63b --- /dev/null +++ b/contrib/observability_splunk_dashboard/recovery.xml @@ -0,0 +1,873 @@ +
+ +
+ + + Table 1: Find long recovery (Input Index and LogGroup and Select a time span). + + + * + + + + + + + + + -0s + now + + + + + index=$IndexForOverview$ LogGroup=$LogGroupForOverview$ + ((Type="MasterRecoveryState" AND (Status="reading_coordinated_state" OR Status="fully_recovered" OR Status="accepting_commits")) OR (Type="Role" AND As="MasterServer" AND ("Transition"="Begin" OR "Transition"="End")) OR Type="MasterTerminated") AND (NOT TrackLatestType="Rolled") | eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID Machine Type Transition As Status DateTime Time ErrorDescription LogGroup +| search NOT ErrorDescription="Success" +| eval EventType=case(Transition="Begin" AND As="MasterServer" AND Type="Role", "MasterStart", Type="MasterRecoveryState" AND Status="fully_recovered", "FullRecovery", Type="MasterRecoveryState" AND Status="reading_coordinated_state", "StartRecoveryAttempt", Transition="End" AND As="MasterServer" AND Type="Role", "MasterTerminated", Type="MasterTerminated", "MasterTerminated", Type="MasterRecoveryState" AND Status="accepting_commits", "AcceptingCommits") +| table ID Machine EventType DateTime Time ErrorDescription LogGroup +| fillnull value="-" +| sort -Time +| eval ifMasterTerminatedEvent=if(EventType="MasterTerminated", 1, 0) +| stats list(*) by ID Machine ifMasterTerminatedEvent +| rename list(*) as * +| table ID Machine EventType DateTime Time ErrorDescription LogGroup +| sort -Time +| eval LastTime=mvindex(Time, 0), FirstTime=mvindex(Time, -1), Duration=LastTime-FirstTime +| table ID Machine Duration EventType DateTime Time ErrorDescription LogGroup + $time_token_for_recoveryhistorytable.earliest$ + $time_token_for_recoveryhistorytable.latest$ + + + + +
+
+
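Table 1 above boils each master's trace stream down to a handful of event types (MasterStart, StartRecoveryAttempt, AcceptingCommits, FullRecovery, MasterTerminated) and reports the span between the earliest and latest of them as the recovery duration. A Python sketch of that classification and duration computation, assuming events come in as small dicts (the field names follow the query above, and the mapping mirrors its case() expression):

```python
# Sketch of Table 1's classification and duration logic. Events are assumed to
# be dicts with Type / Status / As / Transition / Time, matching the fields the
# query above reads; the classification mirrors its case() expression.
def classify(event: dict):
    t = event.get("Type")
    if t == "Role" and event.get("As") == "MasterServer":
        return "MasterStart" if event.get("Transition") == "Begin" else "MasterTerminated"
    if t == "MasterTerminated":
        return "MasterTerminated"
    if t == "MasterRecoveryState":
        return {"reading_coordinated_state": "StartRecoveryAttempt",
                "accepting_commits": "AcceptingCommits",
                "fully_recovered": "FullRecovery"}.get(event.get("Status"))
    return None

def recovery_duration(events):
    """Span between the earliest and latest classified event of one master."""
    times = [e["Time"] for e in events if classify(e)]
    return max(times) - min(times) if times else 0.0

sample = [
    {"Type": "Role", "As": "MasterServer", "Transition": "Begin", "Time": 200.0},
    {"Type": "MasterRecoveryState", "Status": "reading_coordinated_state", "Time": 200.5},
    {"Type": "MasterRecoveryState", "Status": "fully_recovered", "Time": 207.0},
]
print(classify(sample[1]), recovery_duration(sample))  # StartRecoveryAttempt 7.0
```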
+ + + Table 2: Select timespan containing the long recovery and see all recovery attempts in the time span (The input Index and LogGroup and Timespan are for all following tables and charts) + + + * + + + + + + + + -0s@s + now + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="MasterRecoveryState" OR (Type="MasterTerminated") OR (Type="Role" AND As="MasterServer" AND "Transition"="End") OR Type="RecoveryInternal" OR Type="ProxyReplies" OR Type="CommitProxyReplies" OR Type="ResolverReplies" OR Type="MasterRecruitedInitialStorageServers") AND (NOT TrackLatestType="Rolled") +| rename ID as MasterID +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table MasterID Machine Status Step Type DateTime Time StatusCode MyRecoveryCount ErrorDescription Reason ErrorCode +| fillnull value="-" ErrorDescription Reason ErrorCode +| eval Status=case(Type=="MasterRecoveryState", Status, Type=="Role", "RoleEnd", Type=="MasterTerminated", "MasterTerminated", Type=="RecoveryInternal", Status."/".Step, Type=="ProxyReplies" OR Type=="CommitProxyReplies", "initializing_transaction_servers/ProxyReplies", Type="ResolverReplies", "initializing_transaction_servers/ResolverReplies", Type=="MasterRecruitedInitialStorageServers", "initializing_transaction_servers/MasterRecruitedInitialStorageServers"), StatusCode=case(Type=="ProxyReplies" OR Type=="CommitProxyReplies" OR Type=="ResolverReplies" OR Type=="MasterRecruitedInitialStorageServers", "8", Type!="ProxyReplies" AND Type!="CommitProxyReplies" AND Type!="ResolverReplies" AND Type!="MasterRecruitedInitialStorageServers", StatusCode) +| fillnull value="-" StatusCode +| sort 0 -Time -StatusCode +| stats list(*) by MasterID Machine +| rename list(*) as * +| eval FirstTime=mvindex(Time, -1), LastTime=mvindex(Time, 0), Duration=LastTime-FirstTime +| table MasterID Machine MyRecoveryCount Duration ErrorDescription Reason ErrorCode StatusCode Status DateTime Time +| sort -MyRecoveryCount +| fillnull value="-" MyRecoveryCount + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+
+ + + Table 3: Why was recovery triggered? Based on the WaitFailureClient event: machine A detects machine B's failure. The first column is the time of the WaitFailureClient event; columns 2-5 describe A, and columns 6-7 describe B. + + index=$Index$ LogGroup=$LogGroup$ + Type="WaitFailureClient" +| table Type Time Machine FailedEndpoint +| replace *:tls with * in FailedEndpoint +| join Machine type=left + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND Transition="End" + | eval EndTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | rename As as Role + | table ID EndTime Machine Role] +| join FailedEndpoint type=left + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" + | stats latest(*) by ID | rename latest(*) as * + | rename Machine as FailedEndpoint + | eval FailedEndpointLatestRoleEventInfo=As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(*) by FailedEndpoint + | rename list(*) as * + | table FailedEndpoint FailedEndpointLatestRoleEventInfo] +| eval FailureDetectedTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| makemv delim=" " FailedEndpointLatestRoleEventInfo +| table FailureDetectedTime Machine ID Role EndTime FailedEndpoint FailedEndpointLatestRoleEventInfo + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + +
+
+
+ + + Table 4: New Recruitment Configuration (using MasterRecoveredConfig event) + + + index=$Index$ LogGroup=$LogGroup$ + Type="MasterRecoveredConfig" AND TrackLatestType="Original" +| eval Configuration=replace(Conf, "&quot;", "\"") +| rename Configuration as _raw + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + Table 5: Data Centers (using ProcessMetrics event) + + + index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics +| dedup DCID +| rename DCID as DataCenterID +| table DataCenterID pie_work_unit +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+ + Table 6: New Role (using Role event joined by ProcessMetrics event) + + + index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ((As="ClusterController") OR (As="MasterServer") OR (As="TLog") OR (As="Resolver") OR (As="MasterProxyServer") OR (As="CommitProxyServer") OR (As="GrvProxyServer") OR (As="LogRouter")) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh")) +| eventstats count by ID +| rename As as Role +| search count=1 AND Transition="Begin" +| table ID Role Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Role Machine DataCenter +| fillnull value="null" DataCenter +| stats count by Role DataCenter + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 7: Role Details + + + MasterServer + TLog + Resolver + MasterProxyServer (for <7.0) + LogRouter + CommitProxyServer (for 7.0+) + GrvProxyServer (for 7.0+) + As=" + " + OR + + + + Begin + End + Begin->End + count=1 AND Transition="Begin" + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($RolesToken$) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh")) +| eventstats count by ID +| rename As as Role +| search $RoleDetailTableWhichRoleToken$ +| table ID Role Machine Time +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Role Machine DataCenter Time +| fillnull value="null" DataCenter +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID Role Machine DataCenter DateTime +| sort 0 -DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 8: CC Recruitment SevWarn OR SevError (use events in clusterRecruitFromConfiguration and clusterRecruitRemoteFromConfiguration) + + + index=$Index$ LogGroup=$LogGroup$ + Type="RecruitFromConfigurationNotAvailable" OR Type="RecruitFromConfigurationRetry" OR Type="RecruitFromConfigurationError" OR Type="RecruitRemoteFromConfigurationNotAvailable" OR Type="RecruitRemoteFromConfigurationRetry" OR Type="RecruitRemoteFromConfigurationError" + | eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)"), GoodRecruitmentTimeReady=case(Type=="RecruitFromConfigurationNotAvailable" OR Type=="RecruitRemoteFromConfigurationNotAvailable", "True", Type=="RecruitFromConfigurationRetry" OR Type=="RecruitRemoteFromConfigurationRetry", GoodRecruitmentTimeReady, Type=="RecruitFromConfigurationError" OR Type=="RecruitRemoteFromConfigurationError", "-") + | table Type GoodRecruitmentTimeReady Time DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + +
+
+
+ + + Table 9: RecoveryCount of the selected TLog (in Table 11) + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogStart") OR (LogId=$row.TLogID$ AND Type="TLogPersistentStateRestore") +| eval ID=if(Type="TLogStart", ID, LogId), DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID RecoveryCount Type DateTime | fillnull value="Not found. The FDB version may be too old." + -7d@h + now + + + +
+
+ + Table 10: Which roles the selected TLog (in Table 11) talks to + + + index=$Index$ LogGroup=$LogGroup$ + ((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled") +| sort -Time +| eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." ".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."Null") +| stats list(*) by TLogID +| rename list(*) As * +| table TLogID TLogEvents +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| table TLogID TLogEvents +| mvexpand TLogEvents +| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), MasterID=mvindex(temp,2) +| fields - temp - TLogEvents +| sort 0 -Time +| search NOT MasterID="NULL" +| dedup MasterID +| rename MasterID as ID +| join type=left ID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role") + | sort 0 -Time + | dedup ID + | table ID Machine As] +| table ID Machine As | fillnull value="null" Machine As + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 11: TLog Events (Collecting all TLogs that produce interesting events during the time span) + + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR + ((Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2")) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled") AND $SeeLogEventDetailTableToken$ +| sort -Time +| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."null", (Type="TLogReady"), Time." ".Type." "."null", (Type="TLogStart"), Time." ".Type." "."null", (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."null") +| stats list(TLogEvents) by TLogID +| rename list(TLogEvents) As TLogEvents +| eval EarliestEvent=mvindex(TLogEvents, -1) , LatestEvent=mvindex(TLogEvents, 0) +| table TLogID TLogEvents EarliestEvent LatestEvent +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND As="TLog") + | sort 0 -Time + | dedup ID + | rename ID as TLogID + | table TLogID host LogGroup Machine] +| table TLogID Machine LogGroup host EarliestEvent LatestEvent +| fillnull value="null" Machine host LogGroup +| eval temp=split(LatestEvent," "), LatestTime=mvindex(temp,0), LatestEvent=mvindex(temp,1), temp2=split(EarliestEvent," "), EarliestTime=mvindex(temp2,0), EarliestEvent=mvindex(temp2,1), Duration=LatestTime-EarliestTime +| table TLogID Machine EarliestTime Duration LogGroup host +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| fillnull value="null" DataCenter +| table TLogID Machine DataCenter EarliestTime Duration host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + ((Type="TLogRejoining") OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow")) OR ((Type="TLogLockStarted" OR Type="TLogLocked")) OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh")) AND (NOT TrackLatestType="Rolled") + | sort -Time + | eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." ".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." 
"."Null") + | stats list(*) by TLogID + | rename list(*) As * + | table TLogID TLogEvents + | eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) + | search ignore=0 + | sort TLogID + | table TLogID TLogEvents + | mvexpand TLogEvents + | eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), RoleID=mvindex(temp,2) + | fields - temp - TLogEvents + | sort 0 -Time + | search NOT RoleID="NULL" + | table TLogID RoleID MasterMachine + | stats list(*) by TLogID + | rename list(*) as * + | streamstats count + | mvexpand RoleID + | dedup count RoleID + | fields - count + | stats count by TLogID + | rename count as Roles + | table TLogID Roles] +| table TLogID Machine DataCenter Roles EarliestTime Duration host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR + ((Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled")) + | sort -Time + | eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=if(Type="Role", Type.Transition, Type) + | sort 0 TLogEvents + | stats list(TLogEvents) by TLogID + | rename list(TLogEvents) As TLogEvents + | table TLogID TLogEvents + | eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) + | search ignore=0 + | mvcombine delim=" " TLogEvents + | table TLogID TLogEvents] +| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestTime host LogGroup +| eval EarliestDateTime=strftime(EarliestTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TLogStart") OR (Type="TLogPersistentStateRestore") + | eval TLogID=if(Type="TLogStart", ID, LogId) + | table TLogID RecoveryCount] +| table TLogID RecoveryCount Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup +| fillnull value="TLog too old, click and see details" RecoveryCount + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + $click.value$ + +
+
+ + Table 12: Event Details (Including rejoining events) of the selected TLog (in Table 11) + + + index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover" AND LogId=$row.TLogID$) OR (Type="TLogReady" AND ID=$row.TLogID$) OR (Type="TLogStart" AND ID=$row.TLogID$) OR + ((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled") +| sort -Time +| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."-"." "."-", (Type="TLogReady"), Time." ".Type." "."-"." "."-", (Type="TLogStart"), Time." ".Type." "."-"." "."-", (Type="TLogRejoining"), Time." ".Type." ".Master." "."-", (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."-", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."-"." "."-", (Type="Role" AND As="TLog" AND Transition="Begin" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." ".Origination, (Type="Role" AND As="TLog" AND Transition="End" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." "."-") +| stats list(*) by TLogID +| rename list(*) As * +| table TLogID TLogEvents +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ (Type="Role" AND As="TLog" AND ID=$row.TLogID$) + | dedup ID + | rename ID as TLogID + | table TLogID Machine] +| table TLogID Machine TLogEvents +| fillnull value="-" Machine +| mvexpand TLogEvents +| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), ToID=mvindex(temp,2), Origination= mvindex(temp,3) +| fields - temp - TLogEvents +| join type=left + [ search index=$Index$ LogGroup=$LogGroup$ (Type="Role") + | dedup ID + | rename ID as ToID + | rename As as ToRole + | rename Machine as ToMachine + | table ToID ToRole ToMachine] +| sort 0 -Time +| fillnull value="-" ToRole ToMachine +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TLogID Machine Event DateTime ToID ToRole ToMachine Time DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+
+ + + Table 13: All Tags of the selected TLog (in Table 11) that have been popped by SSes (using TLogPoppedTag event) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogPoppedTag") +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| rename ID as TLogID +| rename Tags as UnpoppedRecoveredTagCount +| rename Tag as TagPopped +| rename DurableKCVer as DurableKnownCommittedVersion +| search TagPopped!="-1:2" +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt +| sort 0 -UnpoppedRecoveredTagCount +| join TagPopped type=left + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") + | stats latest(*) by Machine + | rename latest(*) as * + | rename Tag as TagPopped + | table TagPopped ID Machine] +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| rename ID as SSID +| rename Machine as SSMachine +| rename DataCenter as SSDataCenter +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped SSID SSMachine SSDataCenter DurableKnownCommittedVersion RecoveredAt +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+ + Table 14: All Tags of the selected TLog (in Table 11) to be popped by SSes (using TLogReady event) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogReady") +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| rename ID as TLogID +| table TLogID Type AllTags Locality +| makemv delim="," AllTags +| mvexpand AllTags +| rename AllTags as Tag | sort 0 Tag +| join Tag type=left + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") + | stats latest(*) by Machine + | rename latest(*) as * + | table Tag ID Machine] +| table TLogID Tag ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| fillnull value="-" +| table TLogID Tag ID Machine DataCenter +| rename ID as SSID | rename Machine as SSMachine | rename DataCenter as SSDataCenter +| search Tag!="-1:2" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 15: The Tags of the selected TLog (in Table 11) that are not popped by SSes (using set diff tags in Table 13 and Table 14) (if result contains "...", the result of Table 15 is wrong) + + + | set diff + [ search index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogReady") + | table AllTags + | makemv delim="," AllTags + | mvexpand AllTags + | rename AllTags as Tag + | table Tag] + [ search index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogPoppedTag") + | table Tag] + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+ + Table 16: All Current Storage Servers (assume each machine has at most one SS) + + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") AND $TriggerSSTableToken$ +| stats latest(*) by Machine +| rename latest(*) as * +| table Tag ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Machine DataCenter Tag +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ((As="StorageServer")) AND (NOT TrackLatestType="Rolled")) + | stats latest(*) by Machine + | rename latest(*) as * + | rename As as Role + | table ID Role Machine + | join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] + | table ID Role Machine DataCenter + | fillnull value="null" DataCenter] +| sort 0 DataCenter +| table Tag ID Machine DataCenter | sort 0 Tag + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Chart 1: Timeout/TimedOut event distribution grouped by source (Machine) + + + 5s + + + + TLog + MasterServer + MasterProxyServer (for version < 7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for ver 7+) + As=" + " + OR + + + + TLog + MasterServer + MasterProxyServer (for version <7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for version 7+) + As=" + " + OR + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | dedup ID + | rename Machine as PeerAddr] +| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by Machine + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Chart 2: Timeout/TimedOut event distribution grouped by destination (PeerAddr) + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | dedup ID + | rename Machine as PeerAddr] +| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by PeerAddr + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Table 17: Check Type=ConnectionTimedOut OR Type=ConnectionTimeout events between transaction roles in the recovery (including the role that refresh/begin/end in the timespan) + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| stats count as TotalTimeouts by Machine PeerAddr +| table Machine PeerAddr TotalTimeouts +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | stats latest(*) by ID + | rename latest(*) as * + | eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(Role) AS MachineRoleLatestEvent BY Machine + ] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | stats latest(*) by ID + | rename latest(*) as * + | eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(Role) AS PeerRoleLatestEvent BY Machine + | rename Machine AS PeerAddr + ] +| table Machine PeerAddr TotalTimeouts MachineRoleLatestEvent PeerRoleLatestEvent + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 18: Proxy 0 + + + index=$Index$ LogGroup=$LogGroup$ + (Type="ProxyReplies" OR Type="CommitProxyReplies") AND FirstProxy="True" +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table WorkerID LogGroup FirstProxy Time DateTime +| sort 0 -Time +| join type=left WorkerID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND As="Worker" AND Transition="Refresh" + | dedup ID + | rename ID as WorkerID + | stats list(*) by WorkerID + | rename list(*) as * + | table WorkerID Machine Roles] +| table WorkerID Machine Roles LogGroup FirstProxy Time DateTime +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND (As="MasterProxyServer" OR As="CommitProxyServer") AND Transition="Refresh" + | dedup ID + | rename ID as ProxyID + | table Machine ProxyID] +| table ProxyID Machine LogGroup FirstProxy + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 19: Latest Role Events on the input Machine (Input Machine, like 172.27.113.121:4500) + + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND Machine=$SearchMachineToken$ +| stats latest(*) by ID Transition +| rename latest(*) as * +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table DateTime Machine ID Transition As Roles LogGroup Error ErrorDescription Reason +| sort 0 -DateTime +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Chart 3: severity>=20 event distribution (including roles that refresh/begin/end in the timespan) + + + * + + + + TLog + MasterServer + MasterProxyServer (for version <7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for version 7+) + As=" + " + OR + + + + EventType + Machine + Severity + Type + + + + 5s + + + + index=$Index$ LogGroup=$LogGroup$ + Severity>10 AND $BadEvents$ +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND ($BadEventRoleToken$) + | dedup ID | table Machine] +| table Machine Type Severity _time +| timechart useother=0 span=$BadEventChartTimeSpanToken$ count by $BadEventChartBy$ + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Table 20: Check severity>20 events of roles in the recovery (including the role that refresh/begin/end in the timespan) + + + index=$Index$ LogGroup=$LogGroup$ + Severity>10 +| stats count by Machine Type +| rename count as Count +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND ($BadEventRoleToken$) + | dedup ID + | eval Role=As."-".ID + | stats list(Role) by Machine + | rename list(Role) as Roles + | table Machine Roles] +| table Type Count Roles Machine +| sort -Count + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/transaction_latency.xml b/contrib/observability_splunk_dashboard/transaction_latency.xml new file mode 100644 index 0000000000..99b551f2c9 --- /dev/null +++ b/contrib/observability_splunk_dashboard/transaction_latency.xml @@ -0,0 +1,247 @@ +
+ + Design for ClusterController issued transactions. +
+ + + + + + + * + + + + * + + + + + @d + now + + +
+ + + All Transactions (Currently, this table also does not cover getrange operation and the operation which not do commit). + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ ID=$transactionID$ + (Type="TransactionAttachID" OR Type="GetValueAttachID" OR Type="CommitAttachID") +| eval To=case(Type=="TransactionAttachID", "0"."-".To, Type="GetValueAttachID", "1"."-".To, Type=="CommitAttachID", "2"."-".To) +| stats list(To) by ID +| rename list(To) as ToList +| table ID ToList +| eval Count = mvcount(ToList) +| search Count=3 +| eval To0=mvindex(ToList,0), To1=mvindex(ToList,1), To2=mvindex(ToList,2), To0=split(To0,"-"), To1=split(To1,"-"), To2=split(To2,"-"), GrvID=case(mvindex(To0, 0)=="0", mvindex(To0, 1), mvindex(To1, 0)=="0", mvindex(To1, 1), mvindex(To2, 0)=="0", mvindex(To2, 1)), ReadID=case(mvindex(To0, 0)=="1", mvindex(To0, 1), mvindex(To1, 0)=="1", mvindex(To1, 1), mvindex(To2, 0)=="1", mvindex(To2, 1)), CommitID=case(mvindex(To0, 0)=="2", mvindex(To0, 1), mvindex(To1, 0)=="2", mvindex(To1, 1), mvindex(To2, 0)=="2", mvindex(To2, 1)) +| table ID GrvID ReadID CommitID +| join GrvID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.Before") + | rename ID as GrvID + | rename Time as BeginTime + | table GrvID BeginTime + ] +| join GrvID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.After") + | rename ID as GrvID + | rename Time as GRVDoneTime + | table GrvID GRVDoneTime + ] +| join ReadID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="GetValueDebug" AND Location="NativeAPI.getValue.After") + | rename ID as ReadID + | rename Time as ReadDoneTime + | table ReadID ReadDoneTime + ] +| join CommitID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="CommitDebug" AND Location="NativeAPI.commit.After") + | rename ID as CommitID + | rename Time as CommitDoneTime + | table CommitID CommitDoneTime + ] +| rename ID as TransactionID +| eval BeginToGRVDone = GRVDoneTime-BeginTime, GRVDoneToReadDone = ReadDoneTime-GRVDoneTime, ReadDoneToCommitDone = CommitDoneTime-ReadDoneTime, Duration=CommitDoneTime-BeginTime, BeginTimeScope=BeginTime-1, EndTimeScope=CommitDoneTime+1, BeginDateTime=strftime(BeginTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TransactionID Duration BeginDateTime BeginToGRVDone GRVDoneToReadDone ReadDoneToCommitDone Duration GrvID ReadID CommitID BeginTimeScope EndTimeScope | sort -Duration + $time_token.earliest$ + $time_token.latest$ + + + + $row.BeginTimeScope$ + $row.EndTimeScope$ + $row.ReadID$ + $row.GrvID$ + $row.CommitID$ + +
+
+
+ + + Step1: GRV + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (NOT MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion) +AND (ID=$GrvID$ OR ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="TransactionAttachID" AND ID=$GrvID$ + | return $To]) +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time - MinTime, Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location=="MasterProxyServer.masterProxyServerCore.Broadcast", 2, Location=="GrvProxyServer.transactionStarter.AskLiveCommittedVersionFromMaster", 2.1, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location=="MasterServer.serveLiveCommittedVersion.GetRawCommittedVersion", 4, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6) +| table Time Delta Order Type ID Location Machine Roles +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+ + Step1: (Only for FDB v6.3): GRV --- Get Committed Version (MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion Events) + + only for FDB 6.3 + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND Location="MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion" + AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="TransactionAttachID" AND ID=$GrvID$ + | return $To] +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time - MinTime +| sort 0 -Time +| table Machine Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+
+ + + Step2: GetValue + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ Type="GetValueDebug" AND ID=$ReadID$ +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| table Machine Location Delta Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $time_token.earliest$ + $time_token.latest$ + + +
+
+
+ + + Step3: Commit + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (ID=$CommitID$ OR ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To]) + +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| table Machine Location Delta Time Roles ID Type +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLogServer.tLogCommit.BeforeWaitForVersion", 11, Location=="TLogServer.tLogCommit.Before", 12, Location=="TLogServer.tLogCommit.AfterTLogCommit", 13, Location=="TLogServer.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16) +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+
+ + + Step3: Commit --- Resolver + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + (Location="Resolver*") +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To] + | rename To as ID + | table ID] +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| eval Order=case(Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8) +| sort 0 Time Order +| stats list(*) by Type ID Machine Roles +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration +| table Machine Roles Duration Location Delta Time +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table Machine DataCenter Roles Duration Location Delta Time + $time_token.earliest$ + $time_token.latest$ + + +
+
+
+ + + Step3: Commit --- Commit to TLogs (CommitDebug Events), grouped by Machine and sorted by Duration + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + (Location="TLog*") +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To] + | rename To as ID + | table ID] +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| sort 0 Time +| stats list(*) by Type ID Machine Roles +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration +| table Machine Roles Duration Location Delta Time + $BeginTime$ + $EndTime$ + + + +
+
+
+
\ No newline at end of file diff --git a/contrib/pkg_tester/test_fdb_pkgs.py b/contrib/pkg_tester/test_fdb_pkgs.py index 08ccd35aa6..178f84d93c 100644 --- a/contrib/pkg_tester/test_fdb_pkgs.py +++ b/contrib/pkg_tester/test_fdb_pkgs.py @@ -165,7 +165,6 @@ def centos_image_with_fdb_helper(versioned: bool) -> Iterator[Optional[Image]]: container = Container("centos:7", initd=True) for rpm in rpms: container.copy_to(rpm, "/opt") - container.run(["bash", "-c", "yum update -y"]) container.run( ["bash", "-c", "yum install -y prelink"] ) # this is for testing libfdb_c execstack permissions @@ -327,7 +326,7 @@ def test_execstack_permissions_libfdb_c(linux_container: Container, snapshot): [ "bash", "-c", - "execstack -q $(ldconfig -p | grep libfdb_c | awk '{print $(NF)}')", + "execstack -q $(ldconfig -p | grep libfdb_c.so | awk '{print $(NF)}')", ] ) diff --git a/contrib/sqlite/sqlite3.amalgamation.c b/contrib/sqlite/sqlite3.amalgamation.c index 2b0058be91..2992e521e2 100644 --- a/contrib/sqlite/sqlite3.amalgamation.c +++ b/contrib/sqlite/sqlite3.amalgamation.c @@ -87009,7 +87009,7 @@ SQLITE_PRIVATE WhereInfo *sqlite3WhereBegin( } sqlite3_query_plan[nQPlan] = 0; nQPlan = 0; -#endif /* SQLITE_TEST // Testing and debugging use only */ +#endif /* SQLITE_TEST // Testing and debugging use only */"); /* Record the continuation address in the WhereInfo structure. Then ** clean up and return. diff --git a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py index e704cacb72..79534596b5 100644 --- a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py +++ b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py @@ -49,7 +49,7 @@ PROTOCOL_VERSION_6_1 = 0x0FDB00B061060001 PROTOCOL_VERSION_6_2 = 0x0FDB00B062010001 PROTOCOL_VERSION_6_3 = 0x0FDB00B063010001 PROTOCOL_VERSION_7_0 = 0x0FDB00B070010001 -PROTOCOL_VERSION_7_1 = 0x0FDB00B071010001 +PROTOCOL_VERSION_7_1 = 0x0FDB00B071010000 PROTOCOL_VERSION_7_2 = 0x0FDB00B072000000 supported_protocol_versions = frozenset([PROTOCOL_VERSION_5_2, PROTOCOL_VERSION_6_0, PROTOCOL_VERSION_6_1, PROTOCOL_VERSION_6_2, PROTOCOL_VERSION_6_3, PROTOCOL_VERSION_7_0, @@ -194,7 +194,7 @@ class BaseInfo(object): if protocol_version >= PROTOCOL_VERSION_6_3: self.dc_id = bb.get_bytes_with_length() if protocol_version >= PROTOCOL_VERSION_7_1: - if bb.get_bytes(1): + if bb.get_bool(): self.tenant = bb.get_bytes_with_length() class GetVersionInfo(BaseInfo): @@ -244,6 +244,11 @@ class CommitInfo(BaseInfo): self.read_snapshot_version = bb.get_long() if protocol_version >= PROTOCOL_VERSION_6_3: self.report_conflicting_keys = bb.get_bool() + + if protocol_version >= PROTOCOL_VERSION_7_1: + lock_aware = bb.get_bool() + if bb.get_bool(): + spanId = bb.get_bytes(16) class ErrorGetInfo(BaseInfo): @@ -279,6 +284,12 @@ class ErrorCommitInfo(BaseInfo): if protocol_version >= PROTOCOL_VERSION_6_3: self.report_conflicting_keys = bb.get_bool() + if protocol_version >= PROTOCOL_VERSION_7_1: + lock_aware = bb.get_bool() + if bb.get_bool(): + spanId = bb.get_bytes(16) + + class UnsupportedProtocolVersionError(Exception): def __init__(self, protocol_version): super().__init__("Unsupported protocol version 0x%0.2X" % protocol_version) diff --git a/contrib/tsan.suppressions b/contrib/tsan.suppressions new file mode 100644 index 0000000000..2078f7e8c6 --- /dev/null +++ b/contrib/tsan.suppressions @@ -0,0 +1,5 @@ +# ThreadSanitizer suppressions file for FDB +# 
https://github.com/google/sanitizers/wiki/ThreadSanitizerSuppressions
+
+# FDB signal handler is not async-signal safe
+signal:crashHandler
diff --git a/design/LoadBalancing/LoadBalancing.md b/design/LoadBalancing/LoadBalancing.md
new file mode 100644
index 0000000000..5100437825
--- /dev/null
+++ b/design/LoadBalancing/LoadBalancing.md
@@ -0,0 +1,227 @@
+# Load Balancing in FoundationDB
+
+## Introduction
+
+FoundationDB is a distributed key-value database. A FoundationDB cluster is constituted by one or more processes over one or more physical machines, where each process is a *worker* and takes certain *role*s, such as coordinator, proxy, TLog, storage server, etc., in the system.
+
+The interprocess communication (IPC) between the processes is supported by the [`flow`](https://github.com/apple/foundationdb/tree/main/flow) infrastructure. In the `flow` context, each process exposes one or more *interface*s. Each interface accepts a given type of *request* and *replies* with `Void`, the requested data, or an error. The interfaces and the corresponding request/reply pairs form the IPC protocol of FoundationDB.
+
+In many cases, the same request can be processed by multiple processes, e.g. all commit proxies can accept commit requests, and multiple storage server processes can provide values for a given key in double/triple redundancy mode. A load balancer (LB) can be used to distribute the requests over the possible interfaces, preventing one or a few processes from getting overloaded. The interface candidates are also referred to as *alternative*s. The LB is also able to react when one or more interfaces are (temporarily) unavailable, by retrying or re-routing the request to other candidates.
+
+Two LBs are provided in FoundationDB: `basicLoadBalance` and `loadBalance`, both defined in [`LoadBalance.actor.h`](https://github.com/apple/foundationdb/blob/main/fdbrpc/include/fdbrpc/LoadBalance.actor.h). `basicLoadBalance` is a simple load balancer in which each interface is equally likely to be chosen, while `loadBalance` accepts a model object that provides [datacenter](https://apple.github.io/foundationdb/configuration.html#configuring-regions) (DC) aware balancing algorithms, allowing requests to be sent to interfaces in the same DC.
+
+In the following sections, the two LBs are discussed in detail.
+
+## `basicLoadBalance`
+
+`basicLoadBalance` implements a simple load balancing algorithm. It applies to
+
+* Commit proxy interface
+* GetReadVersion proxy interface
+* ConfigFollower interface
+
+Here, the interfaces are assumed to be always *fresh*, i.e. the list of servers is fixed.
+
+```mermaid
+graph LR
+    H0{Has alternatives?}
+    H1[Pick an alternative]
+    H2[Backoff]
+    H3[Request]
+    H4([Reply])
+    H5([Error])
+    H6([Never])
+    H((Start)) --> H0
+    H0 --No--> H6
+    H0 --Yes--> H1
+    H1 --No healthy alternatives--> H2 --Retry--> H1
+    H1 --Has alternative--> H3 --Success--> H4
+    H3 --Exception--> H5
+    H3 --Broken Promise --> H2
+```
+
+### Alternative pick algorithm
+
+In `basicLoadBalance`, a *best* alternative is picked and used at the beginning. At this stage, the best alternative is picked randomly among all alternatives. If the best alternative does not work, the other interfaces are tried iteratively, see [here](#picking-an-alternative-in-basic-load-balancing-algorithm).
+
+## `loadBalance`
+
+`loadBalance` provides a more sophisticated implementation of load balancing.
In addition to basic load balancing, it also provides a variety of features:
+
+* Support for Test Storage Server ([TSS](https://github.com/apple/foundationdb/blob/main/documentation/sphinx/source/tss.rst))
+* Datacenter-aware alternative selection
+* Recording the latency and penalty from interfaces, and [prioritizing the interfaces based on previously stored data](#with-queuemodel)
+* Handling timeouts and SS exceptions with retries
+
+Currently it is used for
+
+* Storage Server interface
+* BlobWorker interface
+
+
+
+```mermaid
+graph LR
+    H((Start))
+    H0{Has alternatives?}
+    H1[Choose initial candidates]
+    H4([Never])
+    H5[pick an alternative]
+    H6[Send request]
+    H7[Wait for available alternative]
+    H8([Response])
+    H9([All alternatives failed])
+
+    H --> H0 --No--> H4
+    H0 --Yes--> H1
+    H1 --> H5
+    H5 --Has alternative--> H6
+    H5 --No alternative-->H7
+    H6 --Success--> H8
+    H6 --Failure--> H5
+    H7 --At least one alternative--> H5
+    H7 --> H9
+```
+
+Note:
+
+* The response could be either a reply, or an `Error`, e.g. `process_behind` or `request_maybe_delivered`.
+
+### Choose initial candidates
+
+Two initial candidates will be picked before the requests start. They will be selected as the first two alternatives for the load balancer. If both of them fail, other alternatives are used in a round-robin way.
+
+#### No `QueueModel`
+
+If no `QueueModel` is provided, the initial candidates are picked randomly. The first candidate, or the *best* alternative, will be one in the same DC, if possible.
+
+#### With `QueueModel`
+
+`QueueModel` holds information about each candidate related to future versions, latency and penalty.
+
+* If the storage server is returning a future version error, it is marked as not available until a certain time.
+* Penalty is reported by the storage server in each response (see `storageserver.actor.cpp:StorageServer::getPenalty`). It is determined by the write queue length and the durability lag.
+
+If a `QueueModel` exists, the candidates will be picked based on the penalty. Workers with high penalties will be avoided when picking the first two candidates.
+
+### Pick an alternative
+
+The alternatives are chosen in a round-robin way when the first two candidates fail. If all alternatives fail, a flag is set, and if the next request fails with `process_behind`, the caller will receive the `process_behind` error.
+
+### Send requests to workers
+
+Here it is assumed that there is at least one alternative available. If no alternative is available, the LB will wait.
+
+```mermaid
+graph LR
+    H((start))
+    H0{Is first request}
+    H1[Send first request]
+    H2([Response])
+    H3[Pick up next alternative]
+    H4[Send additional request]
+
+    H --> H3
+    H3 -->H0
+    H0 --Yes--> H1
+    H1 --Success--> H2
+    H1 --Timeout--> H3
+    H0 --No--> H4
+    H4 --First request succeed--> H2
+    H4 --Second request succeed--> H2
+    H4 --Additional request failed--> H3
+```
+
+The first request has a timeout option. If the LB is not able to retrieve the response within the timeout, more requests will be sent to secondary and other available interfaces. If the first request fails, it is reset and the next request will be considered as the first request. Certain types of errors can also be returned as a response, e.g. `request_maybe_delivered` or `process_behind`, which may not trigger a load-balancer retry.
+
+### Wait for available alternative
+
+When no alternatives are available, the load balancer may wait until at least one interface is up.
+ +```mermaid +graph LR + H0((start)) + H1{Is first request in-flight} + H2[Wait for the first request] + H3([Response]) + H4([Retry]) + H5[Wait for alternatives] + H6([all_alternatives_failed]) + + H0 --> H1 + H1 --Yes--> H2 + H1 --No--> H5 + H5 --Timeout-->H6 + H5 --Success-->H4 + H2 --Success-->H3 + H2 --Failed-->H4 +``` + +Note that "Wait for alternatives" will only timeout if the alternatives are always not fresh, i.e. this only happens when accessing storage servers. LB will throw `all_alternatives_failed` when timeout in this case. + +#### Requests + +Original requests in `loadBalancer` are wrapped by `LoadBalance.actor.h:RequestData`. It provides the following additional operations besides the original `flow` request: + +* TSS support if `QueueModel` is available +* Translate some errors into `maybe_delivered`, `process_behind` or retries +* Update the `QueueModel` information including latency, penalty, etc. + +## Appendix + +### Picking an alternative in basic load balancing algorithm + +The following script simulates the alternative picking up algorithm. The chosen alternatives will be printed out one-by-one. The `loadBalance` function uses a similar approach, though the interfaces in the same DC are used firstly. + +```python +#! /usr/bin/env python3 + +import random +import time + + +class Alternatives: + + def __init__(self, num_alternatives): + self._size = num_alternatives + + def size(self): + return self._size + + def get_best(self): + return random.randint(0, self._size - 1) + + +# Entry +NUM_ALTERNATIVES = 10 +alts = Alternatives(NUM_ALTERNATIVES) + +best_alt = alts.get_best() +next_alt = random.randint(0, alts.size() - 2) +if next_alt >= best_alt: + next_alt += 1 +start_alt = next_alt +start_distance = (best_alt + alts.size() - start_alt) % alts.size() +use_alt = None + +print("best_alt = {}".format(best_alt)) +print("start_alt = {}".format(start_alt)) +print("start_distance = {}".format(start_distance)) + +while True: + for alt_num in range(0, alts.size()): + use_alt = next_alt + if next_alt == start_alt: + print(" Going back to the start_alt") + use_alt = best_alt + elif (next_alt + alts.size() - start_alt) % alts.size() <= start_distance: + print(" Entering start_distance") + use_alt = (next_alt + alts.size() - 1) % alts.size() + + print("Attempting alt: {}".format(use_alt)) + + # Next loop + next_alt = (next_alt + 1) % alts.size() + time.sleep(.2) +``` + diff --git a/design/LoadBalancing/LoadBalancing.pdf b/design/LoadBalancing/LoadBalancing.pdf new file mode 100644 index 0000000000..a0ef91dbd7 Binary files /dev/null and b/design/LoadBalancing/LoadBalancing.pdf differ diff --git a/design/backup-dataFormat.md b/design/backup-dataFormat.md index 73942e41ef..f6f9a0338c 100644 --- a/design/backup-dataFormat.md +++ b/design/backup-dataFormat.md @@ -54,7 +54,7 @@ NOTE: All blocks except for the final block will have one last value which will The code related to how a range file is written is in the `struct RangeFileWriter` in `namespace fileBackup`. -The code that decodes a range block is in `ACTOR Future>> decodeRangeFileBlock(Reference file, int64_t offset, int len)`. +The code that decodes a range block is in `ACTOR Future>> decodeRangeFileBlock(Reference file, int64_t offset, int len, Database cx)`. 
### Data format in a log file diff --git a/design/data-distributor-internals.md b/design/data-distributor-internals.md index 4e67534dfa..ccaba537b6 100644 --- a/design/data-distributor-internals.md +++ b/design/data-distributor-internals.md @@ -20,7 +20,7 @@ Data distribution manages the lifetime of storage servers, decides which storage **RelocateShard (`struct RelocateShard`)**: A `RelocateShard` records the key range that need to be moved among servers and the data movement’s priority. DD always move shards with higher priorities first. -**Data distribution queue (`struct DDQueueData`)**: It receives shards to be relocated (i.e., RelocateShards), decides which shard should be moved to which server team, prioritizes the data movement based on relocate shard’s priority, and controls the progress of data movement based on servers’ workload. +**Data distribution queue (`struct DDQueue`)**: It receives shards to be relocated (i.e., RelocateShards), decides which shard should be moved to which server team, prioritizes the data movement based on relocate shard’s priority, and controls the progress of data movement based on servers’ workload. **Special keys in the system keyspace**: DD saves its state in the system keyspace to recover from failure and to ensure every process (e.g., commit proxies, tLogs and storage servers) has a consistent view of which storage server is responsible for which key range. @@ -69,10 +69,11 @@ When a data distribution role is created, it recovers the states of the previous ### When to move keys? Keys can be moved from a server to another for several reasons: -(1) DD moves keys from overutilized servers to underutilized servers, where a server’s utilization is defined as the server’s disk usage; -(2) DD splits or merges shards in order to rebalance the disk usage of servers; -(3) DD removes redundant teams when the team number is larger than the desired number; -(4) DD repairs the replication factor by duplicate shards from a server to another when servers in a team fail. +(1) DD moves keys from disk-overutilized servers to disk-underutilized servers, where a server’s disk-utilization is defined as the server’s disk space usage; +(2) DD moves keys from read-busy servers to read-cold servers if read-aware data distribution is enabled; +(3) DD splits or merges shards in order to rebalance the disk usage of servers; +(4) DD removes redundant teams when the team number is larger than the desired number; +(5) DD repairs the replication factor by duplicate shards from a server to another when servers in a team fail. Actors are created to monitor the reasons of key movement: (1) `MountainChopper` and `ValleyFiller` actors periodically measure a random server team’s utilization and rebalance the server’s keys among other servers; @@ -93,3 +94,84 @@ The data movement from one server (called source server) to another (called dest (2) The destination server will issue transactions to read the shard range and write the key-value pairs back. The key-value will be routed to the destination server and saved in the server’s storage engine; (3) DD removes the source server from the shard’s ownership by modifying the system keyspace; (4) DD removes the shard’s information owned by the source server from the server’s team information (i.e., *shardsAffectedByTeamFailure*). 
+
+# Read-aware Data Distribution
+
+## Motivation
+Before FDB 7.2, when the data distributor wants to rebalance shards, it only considers write bandwidth when choosing the source and destination teams, and the moved shard is chosen randomly. There are several cases where uneven read distribution from users causes a small subset of servers to be busy with read requests. This motivates making the data distributor consider read busyness in order to minimize read load unevenness.
+
+## When does read rebalance happen
+The data distributor periodically checks whether read rebalancing is needed. The conditions for rebalancing are:
+* the **worst CPU usage of the source team >= 0.15**, which means the source team is somewhat busy;
+* the number of ongoing relocations is less than the parallelism budget. `queuedRelocation[ priority ] < countLimit (default 50)`;
+* the source team is not throttled from being a data movement source team. `( now() - The last time the source team was selected ) * time volume (default 20) > read sample interval (2 min default)`;
+* the read load difference between the source team and the destination team is larger than 30% of the source team load.
+
+## Metrics definition
+* READ_LOAD = ceil(READ_BYTES_PER_KSECOND / PAGE_SIZE)
+* READ_IMBALANCE = ( MAX READ_LOAD / AVG READ_LOAD )
+* MOVE_SCORE = READ_DENSITY = READ_BYTES_PER_KSECOND / SHARD_BYTE
+
+The aim of the read-aware data distributor is to minimize READ_IMBALANCE while not harming the disk utilization balance.
+
+## Which shard to move
+Basically, the MountainChopper handles read-hot shard distribution with the following steps (a small illustrative sketch follows this list):
+1. The MountainChopper chooses **the source team** with the largest READ_LOAD that satisfies the hard constraints, then checks whether rebalance is needed;
+    * Hard constraints:
+        * Team is healthy
+        * The time since this team was last chosen as a source team is larger than (READ_SAMPLE_INTERVAL / MOVEMENT_PER_SAMPLE)
+        * The worst CPU usage of the source team >= 0.15
+2. Choose the destination team for the move
+    * Hard constraints:
+        * Team is healthy
+        * The team’s available space is larger than the median free space
+    * Goals
+        * The destination team has the least LOAD in a random team set while satisfying the hard constraints;
+3. Select K shards on the source team for which
+    a. `LOAD(shard) < (LOAD(src) - LOAD(dest)) * READ_REBALANCE_MAX_SHARD_FRAC`;
+    b. `LOAD(shard) > AVG(SourceShardLoad)`;
+    c. the `MOVE_SCORE` is among the top K;
+
+    We use 3.a and 3.b to set an eligible shard bandwidth range for read rebalance moves. If the upper bound is too large, the hot shard is simply shifted to another team without evening out the read load. If the upper bound is too small, we just move some cold shards to other servers, which is also not helpful. The default value of READ_REBALANCE_MAX_SHARD_FRAC is 0.2 (up to 0.5), which was decided based on skewed workload tests.
+4. Issue a relocation request to move a random shard from the top-K set. If the maximum limit of read-balance movement is reached, give up this relocation.
+
+Note: The ValleyFiller chooses a source team from a random set with the largest LOAD, and a destination team with the least LOAD.
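To make the metrics and the shard-selection filter above concrete, here is a small illustrative sketch in Python. It is not the actual `DDQueue` implementation; the `Shard` structure, the helper names, and the `PAGE_SIZE` constant are assumptions made purely for illustration.

```python
import math
from dataclasses import dataclass
from typing import List

PAGE_SIZE = 4096  # assumed normalization constant for READ_LOAD


@dataclass
class Shard:
    read_bytes_per_ksecond: float
    shard_bytes: float


def read_load(read_bytes_per_ksecond: float) -> float:
    # READ_LOAD = ceil(READ_BYTES_PER_KSECOND / PAGE_SIZE)
    return math.ceil(read_bytes_per_ksecond / PAGE_SIZE)


def read_imbalance(team_loads: List[float]) -> float:
    # READ_IMBALANCE = MAX READ_LOAD / AVG READ_LOAD
    return max(team_loads) / (sum(team_loads) / len(team_loads))


def move_score(shard: Shard) -> float:
    # MOVE_SCORE = READ_DENSITY = READ_BYTES_PER_KSECOND / SHARD_BYTE
    return shard.read_bytes_per_ksecond / shard.shard_bytes


def top_k_candidates(shards: List[Shard], src_load: float, dest_load: float,
                     k: int, max_shard_frac: float = 0.2) -> List[Shard]:
    """Rules 3.a-3.c: keep shards whose load is below a fraction of the
    source/destination gap (3.a) and above the source average (3.b),
    then take the top K by MOVE_SCORE (3.c)."""
    loads = [read_load(s.read_bytes_per_ksecond) for s in shards]
    avg_load = sum(loads) / len(loads)
    upper_bound = (src_load - dest_load) * max_shard_frac
    eligible = [s for s, l in zip(shards, loads)
                if l < upper_bound and l > avg_load]
    return sorted(eligible, key=move_score, reverse=True)[:k]
```

The relocation request would then move a random shard from the returned set, mirroring step 4 above.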
+
+## Performance Test and Summary
+### Metrics to measure
+1. StorageMetrics trace events report “FinishedQueries”, which counts how many read operations the storage server has finished. The rate of FinishedQueries is what we measure first. The better the load balance is, the more similar the FinishedQueries rates are across all storage servers.
+CPU utilization. This metric is positively correlated with the “FinishedQueries” rate. An even “FinishedQueries” rate generally means even CPU utilization in the read-only scenario.
+2. Data movement size. We want to achieve load balance with as little movement as possible;
+3. StandardDeviation(FinishedQueries). It indicates how much the read load differs across storage servers.
+
+### Typical Test Setup
+120GB data, key=32B, value=200B; single replica; 8 SS (20%) serve 80% of the reads; 8 SS serve 60% of the writes; 4 servers are both read- and write-hot; TPS=100000, 7 reads/txn + 1 write/txn.
+
+### Test Result Summary and Recommendation
+* With intersecting sets of read-hot and write-hot servers, read-aware DD evens out the read + write load on the double-hot (both read- and write-hot) servers, which means the converged write load is similar to that of the disk-rebalance-only algorithm.
+* Read-aware DD will balance the read workload under the read-skew scenario. Starting from an imbalance of `STD(FinishedQueries per minute)=16k`, the best result it can achieve is `STD(FinishedQueries per minute) = 2k`.
+* The typical movement size under a read-skew scenario is 100M ~ 600M under the default knob values `READ_REBALANCE_MAX_SHARD_FRAC=0.2, READ_REBALANCE_SRC_PARALLELISM = 20`. Increasing those knobs may accelerate the convergence speed, with the risk of data movement churn, which overwhelms the destination and over-cools the source.
+* The upper bound of `READ_REBALANCE_MAX_SHARD_FRAC` is 0.5. Any value larger than 0.5 can result in hot server switching.
+* When a deeper diagnosis of the read-aware DD is needed, the `BgDDMountainChopper_New` and `BgDDValleyFiller_New` trace events are where to go.
+
+## Data Distribution Diagnosis Q&A
+* Why hasn't read-aware DD been triggered when there's a read imbalance?
+    * Check the `SkipReason` field of the `BgDDMountainChopper_New` and `BgDDValleyFiller_New` events.
+* Read-aware DD was triggered, and some data movement happened, but it doesn't help the read balance. Why?
+    * Need to figure out which servers were selected as the source and destination. The information is in the `DestTeam` and `SourceTeam` fields of the `BgDDMountainChopper*` and `BgDDValleyFiller*` events.
+    * Also, the `DDQueueServerCounter` event tells how many times a server has been a source or destination (defined in
+    ```c++
+    enum CountType : uint8_t { ProposedSource = 0, QueuedSource, LaunchedSource, LaunchedDest };
+    ```
+    ) for different relocation reasons (`Other`, `RebalanceDisk` and so on) in different phases within `DD_QUEUE_COUNTER_REFRESH_INTERVAL` (default 60) seconds. For example,
+    ```xml
+    
+    ```
+    `RebalanceReadPQSD="2 0 0 5"` means server `0000000000000004` has been proposed as a source for read balancing twice, but those relocations have not been queued or launched yet. This server has also been a launched destination for read balancing 5 times in the past 1 min. Note that the field will be skipped if all 4 numbers are 0. To avoid spammy traces, if the knob `DD_QUEUE_COUNTER_SUMMARIZE = true` is enabled, the event `DDQueueServerCounterTooMany` will summarize the unreported servers involved in launched relocations (i.e. their `LaunchedSource` or `LaunchedDest` counts are non-zero):
+    ```xml
+    
+    ```
+* How to track the lifecycle of a relocation attempt for balancing?
+    * First find the `TraceId` fields in `BgDDMountainChopper*` and `BgDDValleyFiller*`, which indicate a relocation was triggered.
+    * (Only when enabled) Find the `QueuedRelocation` event with the same `BeginPair` and `EndPair` as the original `TraceId`. This means the relocation request was queued.
+    * Find the `RelocateShard` event whose `BeginPair` and `EndPair` fields are the same as the `TraceId`.
This event means the relocation is ongoing. diff --git a/design/dynamic-knobs.md b/design/dynamic-knobs.md new file mode 100644 index 0000000000..00fe39e725 --- /dev/null +++ b/design/dynamic-knobs.md @@ -0,0 +1,420 @@ +# Dynamic Knobs + +This document is largely adapted from original design documents by Markus +Pilman and Trevor Clinkenbeard. + +## Background + +FoundationDB parameters control the behavior of the database, including whether +certain features are available and the value of internal constants. Parameters +will be referred to as knobs for the remainder of this document. Currently, +these knobs are configured through arguments passed to `fdbserver` processes, +often controlled by `fdbmonitor`. This has a number of problems: + +1. Updating knobs involves updating `foundationdb.conf` files on each host in a + cluster. This has a lot of overhead and typically requires external tooling + for large scale changes. +2. All knob changes require a process restart. +3. We can't easily track the history of knob changes. + +## Overview + +The dynamic knobs project creates a strictly serializable quorum-based +configuration database stored on the coordinators. Each `fdbserver` process +specifies a configuration path and applies knob overrides from the +configuration database for its specified classes. + +### Caveats + +The configuration database explicitly does not support the following: + +1. A high load. The update rate, while not specified, should be relatively low. +2. A large amount of data. The database is meant to be relatively small (under + one megabyte). Data is not sharded and every coordinator stores a complete + copy. +3. Concurrent writes. At most one write can succeed at a time, and clients must + retry their failed writes. + +## Design + +### Configuration Path + +Each `fdbserver` process can now include a `--config_path` argument specifying +its configuration path. A configuration path is a hierarchical list of +configuration classes specifying which knob overrides the `fdbserver` process +should apply from the configuration database. For example: + +```bash +$ fdbserver --config_path classA/classB/classC ... +``` + +Knob overrides follow descending priority: + +1. Manually specified command line knobs. +2. Individual configuration class overrides. + * Subdirectories override parent directories. For example, if the + configuration path is `az-1/storage/gp3`, the `gp3` configuration takes + priority over the `storage` configuration, which takes priority over the + `az-1` configuration. +3. Global configuration knobs. +4. Default knob values. 
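As a rough illustration of this precedence order (the full worked example follows in the next section), the sketch below resolves a single knob value in plain Python. This is not the actual `fdbserver` implementation; the dictionary shapes and the use of `None` for the global configuration class are assumptions made for illustration.

```python
from typing import Dict, List, Optional


def resolve_knob(knob: str,
                 manual_knobs: Dict[str, str],
                 config_db: Dict[Optional[str], Dict[str, str]],
                 config_path: List[str],
                 defaults: Dict[str, str]) -> Optional[str]:
    # 1. Manually specified command line knobs always win.
    if knob in manual_knobs:
        return manual_knobs[knob]
    # 2. Configuration class overrides; later (more specific) path entries
    #    override earlier (more general) ones.
    for config_class in reversed(config_path):
        if knob in config_db.get(config_class, {}):
            return config_db[config_class][knob]
    # 3. Global configuration knobs (modeled here as the None class).
    if knob in config_db.get(None, {}):
        return config_db[None][knob]
    # 4. Fall back to the compiled-in default.
    return defaults.get(knob)

# With the data from the example in the next section and a configuration
# path of az-1/storage/gp3, this returns "350" for compaction_interval.
```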
+ +#### Example + +For example, imagine an `fdbserver` process run as follows: + +```bash +$ fdbserver --datadir /mnt/fdb/storage/4500 --logdir /var/log/foundationdb --public_address auto:4500 --config_path az-1/storage/gp3 --knob_disable_asserts false +``` + +And the configuration database contains: + +| ConfigClass | KnobName | KnobValue | +|-------------|---------------------|-----------| +| az-2 | page_cache_4k | 8e9 | +| storage | min_trace_severity | 20 | +| az-1 | compaction_interval | 280 | +| storage | compaction_interval | 350 | +| az-1 | disable_asserts | true | +| \ | max_metric_size | 5000 | +| gp3 | max_metric_size | 1000 | + +The final configuration for the process will be: + +| KnobName | KnobValue | Explanation | +|---------------------|-------------|-------------| +| page_cache_4k | \ | The configuration database knob override for `az-2` is ignored, so the compiled default is used | +| min_trace_severity | 20 | Because the `storage` configuration class is part of the process’s configuration path, the corresponding knob override is applied from the configuration database | +| compaction_interval | 350 | The `storage` knob override takes precedence over the `az-1` knob override | +| disable_asserts | false | This knob is manually overridden, so all other overrides are ignored | +| max_metric_size | 1000 | Knob overrides for specific configuration classes take precedence over global knob overrides, so the global override is ignored | + +### Clients + +Clients can write to the configuration database using transactions. +Configuration database transactions are differentiated from regular +transactions through specification of the `USE_CONFIG_DATABASE` database +option. + +In configuration transactions, the client uses the tuple layer to interact with +the configuration database. Keys are tuples of size two, where the first item +is the configuration class being written, and the second item is the knob name. +The value should be specified as a string. It will be converted to the +appropriate type based on the declared type of the knob being set. + +Below is a sample Python script to write to the configuration database. + +```python +import fdb + +fdb.api_version(720) + +@fdb.transactional +def set_knob(tr, knob_name, knob_value, config_class, description): + tr['\xff\xff/description'] = description + tr[fdb.tuple.pack((config_class, knob_name,))] = knob_value + +# This function performs two knob changes transactionally. +@fdb.transactional +def set_multiple_knobs(tr): + tr['\xff\xff/description'] = 'description' + tr[fdb.tuple.pack((None, 'min_trace_severity',))] = '10' + tr[fdb.tuple.pack(('az-1', 'min_trace_severity',))] = '20' + +db = fdb.open() +db.options.set_use_config_database() + +set_knob(db, 'min_trace_severity', '10', None, 'description') +set_knob(db, 'min_trace_severity', '20', 'az-1', 'description') +``` + +### Disable the Configuration Database + +The configuration database includes both client and server changes and is +enabled by default. Thus, to disable the configuration database, changes must +be made to both. + +#### Server + +The configuration database can be disabled by specifying the ``fdbserver`` +command line option ``--no-config-db``. Note that this option must be specified +for *every* ``fdbserver`` process. + +#### Client + +The only client change from the configuration database is as part of the change +coordinators command. 
The change coordinators command is not considered +successful until the configuration database is readable on the new +coordinators. This will cause the change coordinators command to hang if run +against a database with dynamic knobs disabled. To disable the client side +configuration database liveness check, specify the ``--no-config-db`` flag when +changing coordinators. For example: + +``` +fdbcli> coordinators auto --no-config-db +``` + +## Status + +The current state of the configuration database is output as part of `status +json`. The configuration path for each process can be determined from the +``command_line`` key associated with each process. + +Sample from ``status json``: + +``` +"configuration_database" : { + "commits" : [ + { + "description" : "set some knobs", + "timestamp" : 1659570000, + "version" : 1 + }, + { + "description" : "make some other changes", + "timestamp" : 1659570000, + "version" : 2 + } + ], + "last_compacted_version" : 0, + "most_recent_version" : 2, + "mutations" : [ + { + "config_class" : "", + "knob_name" : "min_trace_severity", + "knob_value" : "int:5", + "type" : "set", + "version" : 1 + }, + { + "config_class" : "", + "knob_name" : "compaction_interval", + "knob_value" : "double:30.000000", + "type" : "set", + "version" : 1 + }, + { + "config_class" : "az-1", + "knob_name" : "compaction_interval", + "knob_value" : "double:60.000000", + "type" : "set", + "version" : 1 + }, + { + "config_class" : "", + "knob_name" : "compaction_interval", + "type" : "clear", + "version" : 2 + }, + { + "config_class" : "", + "knob_name" : "update_node_timeout", + "knob_value" : "double:4.000000", + "type" : "set", + "version" : 2 + } + ], + "snapshot" : { + "" : { + "min_trace_severity" : "int:5", + "update_node_timeout" : "double:4.000000" + }, + "az-1" : { + "compaction_interval" : "double:60.000000" + } + } +} +``` + +After compaction, ``status json`` would show: + +``` +"configuration_database" : { + "commits" : [ + ], + "last_compacted_version" : 2, + "most_recent_version" : 2, + "mutations" : [ + ], + "snapshot" : { + "" : { + "min_trace_severity" : "int:5", + "update_node_timeout" : "double:4.000000" + }, + "az-1" : { + "compaction_interval" : "double:60.000000" + } + } +} +``` + +## Detailed Implementation + +The configuration database is implemented as a replicated state machine living +on the coordinators. This allows configuration database transactions to +continue to function in the event of a catastrophic loss of the transaction +subsystem. + +To commit a transaction, clients run the two phase Paxos protocol. First, the +client asks for a live version from a quorum of coordinators. When a +coordinator receives a request for its live version, it increments its local +live version by one and returns it to the client. Then, the client submits its +writes at the live version it received in the previous step. A coordinator will +accept the commit if it is still on the same live version. If a majority of +coordinators accept the commit, it is considered committed. + +### Coordinator + +Each coordinator runs a ``ConfigNode`` which serves as a replica storing one +full copy of the configuration database. Coordinators never communicate with +other coordinators while processing configuration database transactions. +Instead, the client runs the transaction and determines when it has quorum +agreement. + +Coordinators serve the following ``ConfigTransactionInterface`` to allow +clients to read from and write to the configuration database. 
+ +#### ``ConfigTransactionInterface`` +| Request | Request fields | Reply fields | Explanation | +|------------------|----------------------------------------------------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------| +| GetGeneration | (coordinatorsHash) | (generation) or (coordinators_changed error) | Get a new read version. This read version is used for all future requests in the transaction | +| Get | (configuration class, knob name, coordinatorsHash, generation) | (knob value or empty) or (coordinators_changed error) or (transaction_too_old error) | Returns the current value stored at the specified configuration class and knob name, or empty if no value exists | +| GetConfigClasses | (coordinatorsHash, generation) | (configuration classes) or (coordinators_changed error) or (transaction_too_old error) | Returns a list of all configuration classes stored in the configuration database | +| GetKnobs | (configuration class, coordinatorsHash, generation) | (knob names) or (coordinators_changed error) or (transaction_too_old error) | Returns a list of all knob names stored for the provided configuration class | +| Commit | (mutation list, coordinatorsHash, generation) | ack or (coordinators_changed error) or (commit_unknown_result error) or (not_committed error) | Commit mutations set by the transaction | + +Coordinators also serve the following ``ConfigFollowerInterface`` to provide +access to (and modification of) their current state. Most interaction through +this interface is done by the cluster controller through its +``IConfigConsumer`` implementation living on the ``ConfigBroadcaster``. + +#### ``ConfigFollowerInterface`` +| Request | Request fields | Reply fields | Explanation | +|-----------------------|----------------------------------------------------------------------|-----------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------| +| GetChanges | (lastSeenVersion, mostRecentVersion) | (mutation list, version) or (version_already_compacted error) or (process_behind error) | Request changes since the last seen version, receive a new most recent version, as well as recent mutations | +| GetSnapshotAndChanges | (mostRecentVersion) | (snapshot, snapshotVersion, changes) | Request the full configuration database, in the form of a base snapshot and changes to apply on top of the snapshot | +| Compact | (version) | ack | Compact mutations up to the provided version | +| Rollforward | (rollbackTo, lastKnownCommitted, target, changes, specialZeroQuorum) | ack or (version_already_compacted error) or (transaction_too_old error) | Rollback/rollforward mutations on a node to catch it up with the majority | +| GetCommittedVersion | () | (registered, lastCompacted, lastLive, lastCommitted) | Request version information from a ``ConfigNode`` | +| Lock | (coordinatorsHash) | ack | Lock a ``ConfigNode`` to prevent it from serving requests during a coordinator change | + +### Cluster Controller + +The cluster controller runs a singleton ``ConfigBroadcaster`` which is +responsible for periodically polling the ``ConfigNode``s for updates, then +broadcasting these updates to workers through the ``ConfigBroadcastInterface``. 
+When workers join the cluster, they register themselves and their +``ConfigBroadcastInterface`` with the broadcaster. The broadcaster then pushes +new updates to registered workers. + +The ``ConfigBroadcastInterface`` is also used by ``ConfigNode``s to register +with the ``ConfigBroadcaster``. ``ConfigNode``s need to register with the +broadcaster because the broadcaster decides when the ``ConfigNode`` may begin +serving requests, based on global information about the status of other +``ConfigNode``s. For example, if a system with three ``ConfigNode``s suffers a +fault where one ``ConfigNode`` loses data, the faulty ``ConfigNode`` should +not be allowed to begin serving requests again until it has been rolled forward +and is up to date with the latest state of the configuration database. + +#### ``ConfigBroadcastInterface`` + +| Request | Request fields | Reply fields | Explanation | +|------------|------------------------------------------------------------|-------------------------------|---------------------------------------------------------------------------------------------| +| Snapshot | (snapshot, version, restartDelay) | ack | A snapshot of the configuration database sent by the broadcaster to workers | +| Changes | (changes, mostRecentVersion, restartDelay) | ack | A list of changes up to and including mostRecentVersion, sent by the broadcaster to workers | +| Registered | () | (registered, lastSeenVersion) | Sent by the broadcaster to new ``ConfigNode``s to determine their registration status | +| Ready | (snapshot, snapshotVersion, liveVersion, coordinatorsHash) | ack | Sent by the broadcaster to new ``ConfigNode``s to allow them to start serving requests | + +### Worker + +Each worker runs a ``LocalConfiguration`` instance which receives and applies +knob updates from the ``ConfigBroadcaster``. The local configuration maintains +a durable ``KeyValueStoreMemory`` containing the following: + +* The latest known configuration version +* The most recently used configuration path +* All knob overrides corresponding to the configuration path at the latest known version + +Once a worker starts, it will: + +* Apply manually set knobs +* Read its local configuration file + * If the stored configuration path does not match the configuration path + specified on the command line, delete the local configuration file + * Otherwise, apply knob updates from the local configuration file. Manually + specified knobs will not be overridden + * Register with the broadcaster to receive new updates for its configuration + classes + * Persist these updates when received and restart if necessary + +### Knob Atomicity + +All knobs are classified as either atomic or non-atomic. Atomic knobs require a +process restart when changed, while non-atomic knobs do not. + +### Compaction + +``ConfigNode``s store individual mutations in order to be able to update other, +out of date ``ConfigNode``s without needing to send a full snapshot. Each +configuration database commit also contains additional metadata such as a +timestamp and a text description of the changes being made. To keep the size of +the configuration database manageable, a compaction process runs periodically +(defaulting to every five minutes) which compacts individual mutations into a +simplified snapshot of key-value pairs. Compaction is controlled by the +``ConfigBroadcaster``, using information it periodically requests from +``ConfigNode``s. Compaction will only compact up to the minimum known version +across *all* ``ConfigNode``s. 
This means that if one ``ConfigNode`` is +permanently partitioned from the ``ConfigBroadcaster`` or from clients, no +compaction will ever take place. + +### Rollback / Rollforward + +It is necessary to be able to roll ``ConfigNode``s backward and forward with +respect to their committed versions due to the nature of quorum logic and +unreliable networks. + +Consider a case where a client commit gets persisted durably on one out of +three ``ConfigNode``s (assume commit messages to the other two nodes are lost). +Since the value is not committed on a majority of ``ConfigNode``s, it cannot be +considered committed. But it is also incorrect to have the value persist on one +out of three nodes as future commits are made. In this case, the most common +result is that the ``ConfigNode`` will be rolled back when the next commit from +a different client is made, and then rolled forward to contain the data from +the commit. ``PaxosConfigConsumer`` contains logic to recognize ``ConfigNode`` +minorities and update them to match the quorum. + +### Changing Coordinators + +Since the configuration database lives on the coordinators and the +[coordinators can be +changed](https://apple.github.io/foundationdb/configuration.html#configuration-changing-coordination-servers), +it is necessary to copy the configuration database from the old to the new +coordinators during such an event. A coordinator change performs the following +steps with regard to the configuration database: + +1. Write ``\xff/coordinatorsKey`` with the new coordinators string. The key + ``\xff/previousCoordinators`` contains the current (old) set of + coordinators. +2. Lock the old ``ConfigNode``s so they can no longer serve client requests. +3. Start a recovery, causing a new cluster controller (and therefore + ``ConfigBroadcaster``) to be selected. +4. Read ``\xff/previousCoordinators`` on the ``ConfigBroadcaster`` and, if + present, read an up-to-date snapshot of the configuration database on the + old coordinators. +5. Determine if each registering ``ConfigNode`` needs an up-to-date snapshot of + the configuration database sent to it, based on its reported version and the + snapshot version of the database received from the old coordinators. + * Some new coordinators which were also coordinators in the previous + configuration may not need a snapshot. +6. Send ready requests to new ``ConfigNode``s, including an up-to-date snapshot + if necessary. This allows the new coordinators to begin serving + configuration database requests from clients. + +## Testing + +The ``ConfigDatabaseUnitTests`` class unit tests a number of different +configuration database dimensions. + +The ``ConfigIncrement`` workload tests contention between clients attempting to +write to the configuration database, paired with machine failure and +coordinator changes. diff --git a/design/global-tag-throttling.md b/design/global-tag-throttling.md new file mode 100644 index 0000000000..ad7750a3cc --- /dev/null +++ b/design/global-tag-throttling.md @@ -0,0 +1,131 @@ +## Global Tag Throttling + +When the `GLOBAL_TAG_THROTTLING` knob is enabled, the ratekeeper will use the [transaction tagging feature](https://apple.github.io/foundationdb/transaction-tagging.html) to throttle tags according to the global tag throttling algorithm. This page describes the implementation of this algorithm. + +### Tag Quotas +The global tag throttler bases throttling decisions on "quotas" provided by clients through the tag quota API. 
Each tag quota has two components: + +* Reserved quota +* Total quota + +The global tag throttler cannot throttle tags to a throughput below the reserved quota, and it cannot allow throughput to exceed the total quota. + +### Cost +Internally, the units for these quotas are "page costs", computed as follows. The "page cost" of a read operation is computed as: + +``` +readCost = ceiling(bytesRead / CLIENT_KNOBS->READ_COST_BYTE_FACTOR); +``` + +The "page cost" of a write operation is computed as: + +``` +writeCost = SERVER_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * ceiling(bytesWritten / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR); +``` + +Here `bytesWritten` includes cleared bytes. The size of range clears is estimated at commit time. + +### Tuple Layer +Tag quotas are stored inside the system keyspace (with prefix `\xff/tagQuota/`). They are stored using the tuple layer, in a tuple of the form `(reservedQuota, totalQuota)`. There is currently no custom code in the bindings for manipulating these system keys. However, in any language for which bindings are available, it is possible to use the tuple layer to manipulate tag quotas. + +### fdbcli +The easiest way for an external client to interact with tag quotas is through `fdbcli`. To get the quota (in bytes/second) of a particular tag, run the following command: + +``` +fdbcli> quota get <tag> [reserved_throughput|total_throughput] +``` + +To set the quota through `fdbcli`, run: + +``` +fdbcli> quota set <tag> [reserved_throughput|total_throughput] <value> +``` + +Note that the quotas are specified in terms of bytes/second, and internally converted to page costs: + +``` +page_cost_quota = ceiling(byte_quota / CLIENT_KNOBS->READ_COST_BYTE_FACTOR) +``` + +### Limit Calculation +The transaction budget that ratekeeper calculates and distributes to clients (via GRV proxies) for each tag is calculated based on several intermediate rate calculations, outlined in this section. + +* Reserved Rate: Based on reserved quota and the average transaction cost, a reserved TPS rate is computed for each tag. + +* Desired Rate: Based on total quota and the average transaction cost, a desired TPS rate is computed for each tag. + +* Limiting Rate: When a storage server is near saturation, tags contributing notably to the workload on this storage server will receive a limiting TPS rate, computed to relieve the workload on the storage server. + +* Target Rate: The target rate is the cluster-wide rate enforced by the global tag throttler. This rate is computed as: + +``` +targetTps = max(reservedTps, min(desiredTps, limitingTps)); +``` + +* Per-Client Rate: While the target rate represents the cluster-wide desired throughput according to the global tag throttler, this budget must be shared across potentially many clients. Therefore, based on observed throughput from various clients, each client will receive an equal budget based on a per-client, per-tag rate computed by the global tag throttler. This per-client rate is ultimately what is sent to clients to enforce throttling. + +## Implementation + +### Stat collection +The rates and costs of all transactions must be visible to the global tag throttler. Whenever a client tags a transaction, sampling is performed to determine whether to attach the tag to messages sent to storage servers and commit proxies. + +For read operations that are sampled (with probability `CLIENT_KNOBS->READ_TAG_SAMPLE_RATE`), read costs are aggregated on storage servers using the `TransactionTagCounter` class. 
This class tracks the busyness of the top-k tags affecting the storage server with read load (here `k` is determined by `SERVER_KNOBS->SS_THROTTLE_TAGS_TRACKED`). Storage servers periodically send per-tag read cost statistics to the ratekeeper through `StorageQueuingMetricsReply` messages. + +For write operations that are sampled (with probability `COMMIT_SAMPLE_COST`), write costs are aggregated on commit proxies in the `ProxyCommitData::ssTrTagCommitCost` object. Per-storage, per-tag write cost statistics are periodically sent from commit proxies to the ratekeeper through `ReportCommitCostEstimationRequest` messages. + +The ratekeeper tracks per-storage, per-tag cost statistics in the `GlobalTagThrottlerImpl::throughput` object. + +The ratekeeper must also track the rate of transactions performed with each tag. Each GRV proxy aggregates a per-tag counter of transactions started (without sampling). These are sent to the ratekeeper through `GetRateInfoRequest` messages. The global tag throttler then tracks per-tag transaction rates in the `GlobalTagThrottlerImpl::tagStatistics` object. + +### Average Cost Calculation +Quotas are expressed in terms of cost, but because throttling is enforced at the beginning of transactions, budgets need to be calculated in terms of transactions per second. To make this conversion, it is necessary to track the average cost of transactions (per-tag, and per-tag on a particular storage server). + +Both cost and transaction counters are smoothed using the `Smoother` class to provide stability over time. The "smoothing interval" can be modified through `SERVER_KNOBS->GLOBAL_TAG_THROTTLING_FOLDING_TIME`. + +### Reserved Rate Calculation +The global tag throttler periodically reads reserved quotas from the system keyspace. Using these reserved quotas and the average cost of transactions with the given tag, a reserved TPS rate is computed. Read and write rates are aggregated as follows: + +``` +reservedTps = max(reservedReadTps, reservedWriteTps); +``` + +### Desired Rate Calculation +Similar to reserved rate calculation, the total quota is read from the system keyspace. Then, using the average cost of transactions with the given tag, a desired TPS rate is computed. Read and write rates are aggregated as follows: + +``` +desiredTps = min(desiredReadTps, desiredWriteTps); +``` + +### Limiting Rate Calculation +In addition to tag busyness statistics, the `StorageQueuingMetricsReply` messages sent from storage servers to the ratekeeper also contain metrics on the health of storage servers. The ratekeeper uses these metrics as part of its calculation of a global transaction rate (independent of tag throttling). + +The global tag throttler also makes use of these metrics to compute a "throttling ratio" for each storage server. This throttling ratio is computed in `StorageQueueInfo::getThrottlingRatio`. The global tag throttler uses the throttling ratio for each tracked storage server to compute a "limiting transaction rate" for each combination of storage server and tag. + +In the "healthy" case where no metrics are near saturation, the throttling ratio will be an empty `Optional`, indicating that the storage server is not near saturation. If, on the other hand, the metrics indicate approaching saturation, the throttling ratio will be a number between 0 and 2 indicating the ratio of current throughput the storage server can serve. 
In this case, the global tag throttler looks at the current cost being served by the storage server, multiplies it by the throttling ratio, and computes a limiting cost for the storage server. Among all tags using significant resources on this storage server, this limiting cost is divided up according to the relative total quotas allocated to these tags. Next, a transaction limit is determined for each tag, based on how much the average transaction for the given tag affects the given storage server. + +These per-tag, per-storage limiting transaction rates are aggregated to compute per-tag limiting transaction rates: + +``` +limitingTps(tag) = min{limitingTps(tag, storage) : all storage servers} +``` + +If the throttling ratio is empty for all storage servers affected by a tag, then the per-tag, per-storage limiting TPS rate is also empty. In this case the target rate for this tag is simply the desired rate. + +If an individual zone is unhealthy, it may cause the throttling ratio for storage servers in that zone to shoot up. This should not be misinterpreted as a workload issue that requires active throttling. Therefore, the zone with the worst throttling ratios is ignored when computing the limiting transaction rate for a tag (similar to the calculation of the global transaction limit in `Ratekeeper::updateRate`). + +### Client Rate Calculation +The smoothed per-client rate for each tag is tracked within `GlobalTagThrottlerImpl::PerTagStatistics`. Once a target rate has been computed, this is passed to `GlobalTagThrottlerImpl::PerTagStatistics::updateAndGetPerClientRate` which adjusts the per-client rate. The per-client rate is meant to limit the busiest clients, so that at equilibrium, the per-client rate will remain constant and the sum of throughput from all clients will match the target rate. + +## Testing +The `GlobalTagThrottling.toml` test provides a simple end-to-end test using the global tag throttler. Quotas are set using the internal tag quota API in the `GlobalTagThrottling` workload. This is run in parallel with the `ReadWrite` workload, which tags transactions. The number of `transaction_tag_throttled` errors is reported, along with the throughput, which should be roughly predictable based on the quota parameters chosen. + +In addition to this end-to-end test, there is a suite of unit tests with the `/GlobalTagThrottler/` prefix. These tests run in a mock environment, with mock storage servers providing simulated storage queue statistics and tag busyness reports. Mock clients simulate workload on these mock storage servers, and get throttling feedback directly from a global tag throttler which is monitoring the mock storage servers. + +In each test, the `GlobalTagThrottlerTesting::monitor` function is used to periodically check whether or not a desired equilibrium state has been reached. If the desired state is reached and maintained for a sufficient period of time, the test passes. If the unit test is unable to reach this desired equilibrium state before a timeout, the test will fail. Commonly, the desired state is for the global tag throttler to report a client rate sufficiently close to the desired rate specified as an input to the `GlobalTagThrottlerTesting::rateIsNear` function. + +## Visibility + +### Tracing +On the ratekeeper, every `SERVER_KNOBS->TAG_THROTTLE_PUSH_INTERVAL` seconds, `GlobalTagThrottler::getClientRates` is called. At the end of the rate calculation for each tag, a trace event of type `GlobalTagThrottler_GotClientRate` is produced. 
This trace event reports the relevant inputs that went in to the rate calculation, and can be used for debugging. + +On storage servers, every `SERVER_KNOBS->TAG_MEASUREMENT_INTERVAL` seconds, there are `BusyReadTag` events for every tag that has sufficient read cost to be reported to the ratekeeper. Both cost and fractional busyness are reported. diff --git a/design/special-key-space.md b/design/special-key-space.md index 7cdcfe460d..be104915fe 100644 --- a/design/special-key-space.md +++ b/design/special-key-space.md @@ -32,10 +32,10 @@ public: explicit SKRExampleImpl(KeyRangeRef kr): SpecialKeyRangeReadImpl(kr) { // Our implementation is quite simple here, the key-value pairs are formatted as: // \xff\xff/example/ : - CountryToCapitalCity[LiteralStringRef("USA")] = LiteralStringRef("Washington, D.C."); - CountryToCapitalCity[LiteralStringRef("UK")] = LiteralStringRef("London"); - CountryToCapitalCity[LiteralStringRef("Japan")] = LiteralStringRef("Tokyo"); - CountryToCapitalCity[LiteralStringRef("China")] = LiteralStringRef("Beijing"); + CountryToCapitalCity["USA"_sr] = "Washington, D.C."_sr; + CountryToCapitalCity["UK"_sr] = "London"_sr; + CountryToCapitalCity["Japan"_sr] = "Tokyo"_sr; + CountryToCapitalCity["China"_sr] = "Beijing"_sr; } // Implement the getRange interface Future getRange(ReadYourWritesTransaction* ryw, @@ -58,7 +58,7 @@ private: }; // Instantiate the function object // In development, you should have a function object pointer in DatabaseContext(DatabaseContext.h) and initialize in DatabaseContext's constructor(NativeAPI.actor.cpp) -const KeyRangeRef exampleRange(LiteralStringRef("\xff\xff/example/"), LiteralStringRef("\xff\xff/example/\xff")); +const KeyRangeRef exampleRange("\xff\xff/example/"_sr, "\xff\xff/example/\xff"_sr); SKRExampleImpl exampleImpl(exampleRange); // Assuming the database handler is `cx`, register to special-key-space // In development, you should register all function objects in the constructor of DatabaseContext(NativeAPI.actor.cpp) @@ -67,16 +67,16 @@ cx->specialKeySpace->registerKeyRange(exampleRange, &exampleImpl); state ReadYourWritesTransaction tr(cx); // get Optional res1 = wait(tr.get("\xff\xff/example/Japan")); -ASSERT(res1.present() && res.getValue() == LiteralStringRef("Tokyo")); +ASSERT(res1.present() && res.getValue() == "Tokyo"_sr); // getRange // Note: for getRange(key1, key2), both key1 and key2 should prefixed with \xff\xff // something like getRange("normal_key", "\xff\xff/...") is not supported yet -RangeResult res2 = wait(tr.getRange(LiteralStringRef("\xff\xff/example/U"), LiteralStringRef("\xff\xff/example/U\xff"))); +RangeResult res2 = wait(tr.getRange("\xff\xff/example/U"_sr, "\xff\xff/example/U\xff"_sr)); // res2 should contain USA and UK ASSERT( res2.size() == 2 && - res2[0].value == LiteralStringRef("London") && - res2[1].value == LiteralStringRef("Washington, D.C.") + res2[0].value == "London"_sr && + res2[1].value == "Washington, D.C."_sr ); ``` diff --git a/documentation/sphinx/conf.py b/documentation/sphinx/conf.py index 031c7d6f67..04ed43d87b 100644 --- a/documentation/sphinx/conf.py +++ b/documentation/sphinx/conf.py @@ -69,7 +69,7 @@ release = root.find(".//{http://schemas.microsoft.com/developer/msbuild/2003}Ver # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
-language = None +language = 'en' # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: @@ -185,7 +185,7 @@ html_show_copyright = True htmlhelp_basename = 'FoundationDB' # Disable permalinks -html_add_permalinks = "" +html_permalinks = False # -- Options for LaTeX output -------------------------------------------------- diff --git a/documentation/sphinx/extensions/rubydomain.py b/documentation/sphinx/extensions/rubydomain.py index 1e5fb0bce4..e6c849cc41 100755 --- a/documentation/sphinx/extensions/rubydomain.py +++ b/documentation/sphinx/extensions/rubydomain.py @@ -42,7 +42,7 @@ from docutils.parsers.rst import directives, Directive from sphinx import addnodes from sphinx.roles import XRefRole -from sphinx.locale import l_, _ +from sphinx.locale import _ from sphinx.domains import Domain, ObjType, Index from sphinx.directives import ObjectDescription from sphinx.util.nodes import make_refnode @@ -83,18 +83,18 @@ class RubyObject(ObjectDescription): } doc_field_types = [ - TypedField('parameter', label=l_('Parameters'), + TypedField('parameter', label=_('Parameters'), names=('param', 'parameter', 'arg', 'argument'), typerolename='obj', typenames=('paramtype', 'type')), - TypedField('variable', label=l_('Variables'), rolename='obj', + TypedField('variable', label=_('Variables'), rolename='obj', names=('var', 'ivar', 'cvar'), typerolename='obj', typenames=('vartype',)), - GroupedField('exceptions', label=l_('Raises'), rolename='exc', + GroupedField('exceptions', label=_('Raises'), rolename='exc', names=('raises', 'raise', 'exception', 'except'), can_collapse=True), - Field('returnvalue', label=l_('Returns'), has_arg=False, + Field('returnvalue', label=_('Returns'), has_arg=False, names=('returns', 'return')), - Field('returntype', label=l_('Return type'), has_arg=False, + Field('returntype', label=_('Return type'), has_arg=False, names=('rtype',)), ] @@ -493,8 +493,8 @@ class RubyModuleIndex(Index): """ name = 'modindex' - localname = l_('Ruby Module Index') - shortname = l_('modules') + localname = _('Ruby Module Index') + shortname = _('modules') def generate(self, docnames=None): content = {} @@ -561,17 +561,17 @@ class RubyDomain(Domain): name = 'rb' label = 'Ruby' object_types = { - 'function': ObjType(l_('function'), 'func', 'obj'), - 'global': ObjType(l_('global variable'), 'global', 'obj'), - 'method': ObjType(l_('method'), 'meth', 'obj'), - 'class': ObjType(l_('class'), 'class', 'obj'), - 'exception': ObjType(l_('exception'), 'exc', 'obj'), - 'classmethod': ObjType(l_('class method'), 'meth', 'obj'), - 'attr_reader': ObjType(l_('attribute'), 'attr', 'obj'), - 'attr_writer': ObjType(l_('attribute'), 'attr', 'obj'), - 'attr_accessor': ObjType(l_('attribute'), 'attr', 'obj'), - 'const': ObjType(l_('const'), 'const', 'obj'), - 'module': ObjType(l_('module'), 'mod', 'obj'), + 'function': ObjType(_('function'), 'func', 'obj'), + 'global': ObjType(_('global variable'), 'global', 'obj'), + 'method': ObjType(_('method'), 'meth', 'obj'), + 'class': ObjType(_('class'), 'class', 'obj'), + 'exception': ObjType(_('exception'), 'exc', 'obj'), + 'classmethod': ObjType(_('class method'), 'meth', 'obj'), + 'attr_reader': ObjType(_('attribute'), 'attr', 'obj'), + 'attr_writer': ObjType(_('attribute'), 'attr', 'obj'), + 'attr_accessor': ObjType(_('attribute'), 'attr', 'obj'), + 'const': ObjType(_('const'), 'const', 'obj'), + 'module': ObjType(_('module'), 'mod', 'obj'), } directives = { diff --git a/documentation/sphinx/requirements.txt 
b/documentation/sphinx/requirements.txt index 06e23ea6d3..8e33b564f2 100644 --- a/documentation/sphinx/requirements.txt +++ b/documentation/sphinx/requirements.txt @@ -1,6 +1,6 @@ --index-url https://pypi.python.org/simple -setuptools>=20.10.0,<=57.4.0 -sphinx==1.5.6 -sphinx-bootstrap-theme==0.4.8 -docutils==0.16 -Jinja2==3.0.3 +setuptools==65.3.0 +sphinx==5.1.1 +sphinx-bootstrap-theme==0.8.1 +docutils==0.19 +Jinja2==3.1.2 diff --git a/documentation/sphinx/source/administration.rst b/documentation/sphinx/source/administration.rst index 424d9a6f2a..018ffd66c5 100644 --- a/documentation/sphinx/source/administration.rst +++ b/documentation/sphinx/source/administration.rst @@ -12,6 +12,7 @@ Administration configuration moving-a-cluster tls + authorization This document covers the administration of an existing FoundationDB cluster. We recommend you read this document before setting up a cluster for performance testing or production use. diff --git a/documentation/sphinx/source/api-c.rst b/documentation/sphinx/source/api-c.rst index 7e14d5c675..3ec3fca9bf 100644 --- a/documentation/sphinx/source/api-c.rst +++ b/documentation/sphinx/source/api-c.rst @@ -222,7 +222,7 @@ The FoundationDB client library performs most tasks on a singleton thread (which Future ====== -Most functions in the FoundationDB API are asynchronous, meaning that they may return to the caller before actually delivering their result. These functions always return :type:`FDBFuture*`. An :type:`FDBFuture` object represents a result value or error to be delivered at some future time. You can wait for a Future to be "ready" -- to have a value or error delivered -- by setting a callback function, or by blocking a thread, or by polling. Once a Future is ready, you can extract either an error code or a value of the appropriate type (the documentation for the original function will tell you which :func:`fdb_future_get_*()` function you should call). +Most functions in the FoundationDB API are asynchronous, meaning that they may return to the caller before actually delivering their result. These functions always return ``FDBFuture*``. An :type:`FDBFuture` object represents a result value or error to be delivered at some future time. You can wait for a Future to be "ready" -- to have a value or error delivered -- by setting a callback function, or by blocking a thread, or by polling. Once a Future is ready, you can extract either an error code or a value of the appropriate type (the documentation for the original function will tell you which ``fdb_future_get_()`` function you should call). To use the API in a synchronous way, you would typically do something like this for each asynchronous call:: @@ -282,7 +282,7 @@ See :ref:`developer-guide-programming-with-futures` for further (language-indepe .. type:: FDBCallback - A pointer to a function which takes :type:`FDBFuture*` and ``void*`` and returns ``void``. + A pointer to a function which takes ``FDBFuture*`` and ``void*`` and returns ``void``. .. function:: void fdb_future_release_memory(FDBFuture* future) @@ -298,13 +298,13 @@ See :ref:`developer-guide-programming-with-futures` for further (language-indepe .. function:: fdb_error_t fdb_future_get_int64(FDBFuture* future, int64_t* out) - Extracts a 64-bit integer from an :type:`FDBFuture*` into a caller-provided variable of type ``int64_t``. |future-warning| + Extracts a 64-bit integer from a pointer to :type:`FDBFuture` into a caller-provided variable of type ``int64_t``. |future-warning| |future-get-return1| |future-get-return2|. .. 
function:: fdb_error_t fdb_future_get_key_array( FDBFuture* f, FDBKey const** out_key_array, int* out_count) - Extracts an array of :type:`FDBKey` from an :type:`FDBFuture*` into a caller-provided variable of type ``FDBKey*``. The size of the array will also be extracted and passed back by a caller-provided variable of type ``int`` |future-warning| + Extracts an array of :type:`FDBKey` from an ``FDBFuture*`` into a caller-provided variable of type ``FDBKey*``. The size of the array will also be extracted and passed back by a caller-provided variable of type ``int`` |future-warning| |future-get-return1| |future-get-return2|. @@ -547,13 +547,13 @@ Applications must provide error handling and an appropriate retry loop around th .. function:: void fdb_transaction_set_read_version(FDBTransaction* transaction, int64_t version) - Sets the snapshot read version used by a transaction. This is not needed in simple cases. If the given version is too old, subsequent reads will fail with error_code_transaction_too_old; if it is too new, subsequent reads may be delayed indefinitely and/or fail with error_code_future_version. If any of :func:`fdb_transaction_get_*()` have been called on this transaction already, the result is undefined. + Sets the snapshot read version used by a transaction. This is not needed in simple cases. If the given version is too old, subsequent reads will fail with error_code_transaction_too_old; if it is too new, subsequent reads may be delayed indefinitely and/or fail with error_code_future_version. If any of ``fdb_transaction_get_*()`` have been called on this transaction already, the result is undefined. .. function:: FDBFuture* fdb_transaction_get_read_version(FDBTransaction* transaction) |future-return0| the transaction snapshot read version. |future-return1| call :func:`fdb_future_get_int64()` to extract the version into an int64_t that you provide, |future-return2| - The transaction obtains a snapshot read version automatically at the time of the first call to :func:`fdb_transaction_get_*()` (including this one) and (unless causal consistency has been deliberately compromised by transaction options) is guaranteed to represent all transactions which were reported committed before that call. + The transaction obtains a snapshot read version automatically at the time of the first call to ``fdb_transaction_get_*()`` (including this one) and (unless causal consistency has been deliberately compromised by transaction options) is guaranteed to represent all transactions which were reported committed before that call. .. function:: FDBFuture* fdb_transaction_get(FDBTransaction* transaction, uint8_t const* key_name, int key_name_length, fdb_bool_t snapshot) @@ -829,7 +829,7 @@ Applications must provide error handling and an appropriate retry loop around th |future-returnvoid| - Callers will usually want to retry a transaction if the commit or a prior :func:`fdb_transaction_get_*()` returns a retryable error (see :func:`fdb_transaction_on_error()`). + Callers will usually want to retry a transaction if the commit or a prior ``fdb_transaction_get_*()`` returns a retryable error (see :func:`fdb_transaction_on_error()`). |commit-unknown-result-blurb| @@ -878,9 +878,9 @@ Applications must provide error handling and an appropriate retry loop around th .. function:: FDBFuture* fdb_transaction_on_error(FDBTransaction* transaction, fdb_error_t error) - Implements the recommended retry and backoff behavior for a transaction. 
This function knows which of the error codes generated by other :func:`fdb_transaction_*()` functions represent temporary error conditions and which represent application errors that should be handled by the application. It also implements an exponential backoff strategy to avoid swamping the database cluster with excessive retries when there is a high level of conflict between transactions. + Implements the recommended retry and backoff behavior for a transaction. This function knows which of the error codes generated by other ``fdb_transaction_*()`` functions represent temporary error conditions and which represent application errors that should be handled by the application. It also implements an exponential backoff strategy to avoid swamping the database cluster with excessive retries when there is a high level of conflict between transactions. - On receiving any type of error from an :func:`fdb_transaction_*()` function, the application should: + On receiving any type of error from an ``fdb_transaction_*()`` function, the application should: 1. Call :func:`fdb_transaction_on_error()` with the returned :type:`fdb_error_t` code. @@ -963,15 +963,15 @@ Key selectors In the FoundationDB C API, key selectors are not represented by a structure of any kind, but are instead expressed as sequential parameters to |get-key-func| and |get-range-func|. For convenience, the most common key selectors are available as C macros that expand to the appropriate parameters. -.. function:: FDB_KEYSEL_LAST_LESS_THAN(key_name, key_name_length) +.. type:: FDB_KEYSEL_LAST_LESS_THAN(key_name, key_name_length) -.. function:: FDB_KEYSEL_LAST_LESS_OR_EQUAL(key_name, key_name_length) +.. type:: FDB_KEYSEL_LAST_LESS_OR_EQUAL(key_name, key_name_length) -.. function:: FDB_KEYSEL_FIRST_GREATER_THAN(key_name, key_name_length) +.. type:: FDB_KEYSEL_FIRST_GREATER_THAN(key_name, key_name_length) -.. function:: FDB_KEYSEL_FIRST_GREATER_OR_EQUAL(key_name, key_name_length) +.. type:: FDB_KEYSEL_FIRST_GREATER_OR_EQUAL(key_name, key_name_length) -To use one of these macros, simply replace the four parameters in the function with one of :func:`FDB_KEYSEL_*`:: +To use one of these macros, simply replace the four parameters in the function with one of ``FDB_KEYSEL_*``:: future = fdb_transaction_get_key(transaction, "key", 3, 0, 2, 0); diff --git a/documentation/sphinx/source/api-python.rst b/documentation/sphinx/source/api-python.rst index c024c7d2d5..2577bdfc5f 100644 --- a/documentation/sphinx/source/api-python.rst +++ b/documentation/sphinx/source/api-python.rst @@ -194,10 +194,6 @@ After importing the ``fdb`` module and selecting an API version, you probably wa |option-tls-key-bytes| - .. method :: fdb.options.set_tls_verify_peers(verification_pattern) - - |option-tls-verify-peers| - .. method :: fdb.options.set_tls_ca_bytes(ca_bundle) |option-tls-ca-bytes| @@ -210,10 +206,6 @@ After importing the ``fdb`` module and selecting an API version, you probably wa |option-tls-password| - .. method :: fdb.options.set_disable_multi_version_client_api() - - |option-disable-multi-version-client-api| - .. method :: fdb.options.set_disable_local_client() |option-set-disable-local-client| @@ -761,10 +753,6 @@ In each of the methods below, ``param`` should be a string appropriately packed Committing ---------- -.. decorator:: transactional() - - The ``transactional`` decorator makes it easy to write transactional functions which accept a :class:`Database`, :class`Tenant`, or :class:`Transaction` as a parameter and automatically commit. 
See :func:`@fdb.transactional ` for explanation and examples. - .. method :: Transaction.commit() Attempt to commit the changes made in the transaction to the database. Returns a :class:`FutureVoid` representing the asynchronous result of the commit. You **must** call the :meth:`Future.wait()` method on the returned :class:`FutureVoid`, which will raise an exception if the commit failed. diff --git a/documentation/sphinx/source/architecture.rst b/documentation/sphinx/source/architecture.rst index 7c28518d74..f693865430 100644 --- a/documentation/sphinx/source/architecture.rst +++ b/documentation/sphinx/source/architecture.rst @@ -14,8 +14,12 @@ Detailed FoundationDB Architecture The FoundationDB architecture chooses a decoupled design, where processes are assigned different heterogeneous roles (e.g., -Coordinators, Storage Servers, Master). Scaling the database is achieved -by horizontally expanding the number of processes for separate roles: +Coordinators, Storage Servers, Master). Cluster attempts to recruit +different roles as separate processes, however, it is possible that +multiple Stateless roles gets colocated (recruited) on a single +process to meet the cluster recruitment goals. Scaling the database +is achieved by horizontally expanding the number of processes for +separate roles: Coordinators ~~~~~~~~~~~~ diff --git a/documentation/sphinx/source/authorization.rst b/documentation/sphinx/source/authorization.rst new file mode 100644 index 0000000000..ce25df1057 --- /dev/null +++ b/documentation/sphinx/source/authorization.rst @@ -0,0 +1,124 @@ +############# +Authorization +############# + +.. warning :: Authorization is currently experimental and is not recommended for use in production. + +Introduction +============ + +:ref:`Multi-tenant ` database implies a couple of new concepts that did not previously exist in FoundationDB. +The first is the concept of privilege levels: we have *data-plane clients* whose typical workload is limited to accessing a tenant keyspace. +On the other hand, we have *control-plane clients* or *administrators* who may read or update cluster-wide configurations through system keyspace. +These operations also include creation and deletion of tenants. +The second is access control: with multiple tenant keyspaces, it comes naturally that we would want to restrict database access of a client to a subset of them. + +Privilege Levels +---------------- + +Authorization feature extends FoundationDB's existing TLS policy to distinguish administrators from data-plane clients, +making TLS configuration a prerequisite for enabling authorization. +There are only two privilege levels: *trusted* versus *untrusted* clients. +Trusted clients are authorized to perform any operation that pre-authorization FoundationDB clients used to perform, including those accessing the system keyspace. +Untrusted clients may only request what is necessary to access tenant keyspaces for which they are authorized. +Untrusted clients are blocked from accessing anything in the system keyspace or issuing management operations that modify the cluster in any way. + +In order to be considered a trusted client, a client needs to be :ref:`configured with a valid chain of X.509 certificates and a private key `, +and its certificate chain must be trusted by the server. In other words, a client must successfully complete a mutual TLS authentication. +Additionally, if the server was configured with trusted IP subnets, i.e. 
run with one or more ``--trusted-subnet-SUBNET_NAME`` followed by a CIDR block describing the subnet, +then the client's IP as seen from the server must belong to at least one of the subnets. + +Choosing to respond with an empty certificate chain during `client authentication `_ marks the client as untrusted. +If the server specifies a list of trusted subnets and the client's server-facing IP is not part of any of the subnets, +then the client is untrusted even if it successfully completes a mutual TLS authentication. + +.. note:: Presenting a bad or untrusted certificate chain causes the server to break the client connection and eventually throttle the client. + It does not let the client connect untrusted. + +Access Control +-------------- + +To restrict untrusted client's database access to a subset of tenant keyspaces, authorization feature allows database administrators +to grant tenant-scoped access in the form of `JSON Web Tokens `_. +Token verification is performed against a set of named public keys written in `JWK Set `_ format. +A token's header part must contain the `key identifier `_ of the public key which shall be used to verify the token itself. +Below is the list of token fields recognized by FoundationDB. +Note that some of the fields are *recognized* by FoundationDB but not *actively used* in enforcing security, pending future implementation. +Those fields are marked as **NOT required**. + + +.. table:: JSON Web Token Fields supported by FoundationDB + :align: left + :widths: auto + + =============== =========== ======== ==================================================== ================================================================================ + Containing Part Field Name Required Purpose Reference + =============== =========== ======== ==================================================== ================================================================================ + Header ``typ`` Yes Type of JSON Web Signature. Must be ``JWT``. `RFC7519 Section 5.1 `_ + Header ``alg`` Yes Algorithm used to generate the signature. Only `RFC7515 Section 4.1.1 `_ + ``ES256`` and ``RS256`` are supported. + Must match the ``alg`` attribute of public key. + Header ``kid`` Yes Name of public key with which to verify the token. `RFC7515 Section 4.1.4 `_ + Must match the ``kid`` attribute of public key. + Claim ``exp`` Yes Timestamp after which token is not accepted. `RFC7519 Section 4.1.4 `_ + Claim ``nbf`` Yes Timestamp before which token is not accepted. `RFC7519 Section 4.1.5 `_ + Claim ``iat`` Yes Timestamp at which token was issued. `RFC7519 Section 4.1.6 `_ + Claim ``tenants`` Yes Tenants names for which token holder is authorized. N/A + Must be an array. + Claim ``iss`` No Issuer of the token. `RFC7519 Section 4.1.1 `_ + Claim ``sub`` No Subject of the token. `RFC7519 Section 4.1.2 `_ + Claim ``aud`` No Intended recipients of the token. Must be an array. `RFC7519 Section 4.1.3 `_ + Claim ``jti`` No String that uniquely identifies a token. `RFC7519 Section 4.1.7 `_ + =============== =========== ======== ==================================================== ================================================================================ + +Public keys with which to verify the token must be serialized in `JWK Set `_ format and stored in a file. +The location of the key set file must be passed as command line argument ``--authorization-public-key-file`` to the ``fdbserver`` executable. 
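For illustration only (none of this tooling ships with FoundationDB), a matching key pair, key set file, and token could be produced with the third-party ``pyjwt`` and ``cryptography`` Python packages; the key name, tenant name, and lifetime below are placeholders, and the exact key format requirements follow below:

.. code-block:: python

    # Illustrative sketch: generate a P-256 key pair, write a JWK Set for the
    # server, and sign a token carrying the claims listed above. "key-1" and
    # "tenant-1" are placeholder names, not FoundationDB defaults.
    import json, time, uuid
    import jwt                                  # third-party: pyjwt
    from jwt.algorithms import ECAlgorithm
    from cryptography.hazmat.primitives.asymmetric import ec

    private_key = ec.generate_private_key(ec.SECP256R1())

    # Public half as a JWK; add the "kid" and "alg" fields that the server
    # matches against the token header.
    jwk = json.loads(ECAlgorithm.to_jwk(private_key.public_key()))
    jwk.update({"kid": "key-1", "alg": "ES256"})
    with open("authz-public-keys.json", "w") as f:
        json.dump({"keys": [jwk]}, f)           # --authorization-public-key-file

    now = int(time.time())
    token = jwt.encode(
        {
            "tenants": ["tenant-1"],            # tenants the holder may access
            "iat": now,
            "nbf": now,
            "exp": now + 3600,
            "jti": str(uuid.uuid4()),
        },
        private_key,
        algorithm="ES256",
        headers={"kid": "key-1"},               # must name a key in the JWK Set
    )
    # A client would pass `token` to the AUTHORIZATION_TOKEN transaction option.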
+Public keys in the set must be either `RSA `_ public keys +containing ``n`` and ``e`` parameters, each containing `Base64urlUInt `_-encoded modulus and exponent, +or `Elliptic Curve `_ public keys on a ``P-256`` curve, +where ``crv`` parameter is set to ``P-256`` and ``x`` and ``y`` parameters contain +`base64url `_-encoded affine coordinates. +In addition, each public key JSON object in set must contain ``kty`` (set to either ``EC`` or ``RSA``) field to indicate public key algorithm, +along with ``kid``, and ``alg`` fields to be compared against their token header counterparts. +Private keys are strongly recommended against being included in the public key set and, if found, are excluded from consideration. + +.. note:: By design, FoundationDB authorization feature does not support revocation of outstanding tokens. + Use extra caution in signing tokens with long token durations. + +Enabling Clients to use Authorization Tokens +============================================ + +In order to use an untrusted client with an authorization token, a client must be configured to trust the server's CA, +but must not be configured to use the client's own certificates and keys. +More concretely, the client's ``TLS_CA_FILE`` must include the server's root CA certificate, +but the client must not be configured with its own ``TLS_CERTIFICATE_FILE`` or ``TLS_KEY_FILE``, neither programmatically nor by environment variable. +Before performing a tenant data read or update, a client must set ``AUTHORIZATION_TOKEN`` transaction option with the token string as argument. +It is the client's responsibility to keep the token up-to-date, by timely assigning a new token to the transaction object. + +.. note:: The TLS authentication mode of an untrusted client is similar to how typical web browsers connect to TLS-enabled web services. + They authenticate the server using their bundle of trusted root CA certificates, + but they do not authenticate themselves to the server. + +Public Key Rotation +=================== + +FoundationDB's internal public key set automatically refreshes itself based on the key set file's latest content every ``PUBLIC_KEY_FILE_REFRESH_INTERVAL_SECONDS`` seconds. +The in-memory set of public keys does not update unless the key file holds a correct `JWK Set`_. + +Token Caching +============= + +In a single-threaded runtime environment such as FoundationDB, it is important not to let the main thread be overloaded with computationally expensive operations, +such as token signature verification. FoundationDB internally caches the tokens that are considered valid at the time of verification in a fixed-size cache, +whose size may be configured using ``TOKEN_CACHE_SIZE`` knob. + +.. note:: Token cache is independent of the active public key set. Once the token reaches the cache, it is valid until its expiration time, + regardless of any key rotation that takes place thereafter. + +Allowing Untrusted Clients to Access Tenant Data Without Tokens +=============================================================== + +Rolling out a public key distribution infrastructure and an authorization-enabled FoundationDB cluster in lockstep might not be feasible with large scale distributed systems. +To support incremental rollout, authorization feature introduces ``ALLOW_TOKENLESS_TENANT_ACCESS`` boolean knob, +which preserves the TLS-based privilege level policy without untrusted clients having to set authorization tokens to their transactions in order to access tenant data. 
+With this knob active, any authorization token assigned to the client transaction is simply ignored. diff --git a/documentation/sphinx/source/client-testing.rst b/documentation/sphinx/source/client-testing.rst index 433a47ce7d..0eb159e8f4 100644 --- a/documentation/sphinx/source/client-testing.rst +++ b/documentation/sphinx/source/client-testing.rst @@ -373,3 +373,302 @@ with the ``multitest`` role: fdbserver -r multitest -f testfile.txt This command will block until all tests are completed. + +########## +API Tester +########## + +Introduction +============ + +API tester is a framework for implementing end-to-end tests of FDB C API, i.e. testing the API on a real +FDB cluster through all layers of the FDB client. Its executable is ``fdb_c_api_tester``, and the source +code is located in ``bindings/c/test/apitester``. The structure of API Tests is similar to that of the +Simulation Tests. The tests are implemented as workloads using FDB API, which are all built into the +``fdb_c_api_tester``. A concrete test configuration is defined as a TOML file, which specifies the +combination of workloads to be executed by the test together with their parameters. The test can be then +executed by passing the TOML file as a parameter to ``fdb_c_api_tester``. + +Since simulation tests rely on the actor model to execute the tests deterministically in single-threaded +mode, they are not suitable for testing various multi-threaded aspects of the FDB client. End-to-end API +tests complement the simulation tests by testing the FDB Client layers above the single-threaded Native +Client. + +- The specific testing goals of the end-to-end tests are: +- Check functional correctness of the Multi-Version Client (MVC) and Thread-Safe Client +- Detecting race conditions. They can be caused by accessing the state of the Native Client from wrong + threads or introducing other shared state without proper synchronization +- Detecting memory management errors. Thread-safe reference counting must be used where necessary. MVC + works with multiple client libraries. Memory allocated by one client library must be also deallocated + by the same library. +- Maintaining interoperability with other client versions. The client functionality is made available + depending on the selected API version. The API changes are correctly adapted. +- Client API behaves correctly in case of cluster upgrades. Database and transaction state is correctly + migrated to the upgraded connections. Pending operations are canceled and successfully retried on the + upgraded connections. + +Implementing a Workload +======================= + +Each workload is declared as a direct or indirect subclass of ``WorkloadBase`` implementing a constructor +with ``WorkloadConfig`` as a parameter and the method ``start()``, which defines the entry point of the +workload. + +``WorkloadBase`` provides a set of methods that serve as building blocks for implementation of a workload: + +.. function:: execTransaction(start, cont, failOnError = true) + + creates and executes an FDB transaction. Here ``start`` is a function that takes a transaction context + as parameter and implements the starting point of the transaction, and ``cont`` is a function implementing + a continuation to be executed after finishing the transaction execution. Transactions are automatically + retried on retryable errors. Transactions are retried by calling the ``start`` function again. 
In case + of a fatal error, the entire workload is considered as failed unless ``failOnError`` is set to ``false``. + +.. function:: schedule(task) + + schedules a task for asynchronous execution. It is usually used in the continuations to schedule + the next step of the workload. + +.. function:: info(msg) + error(msg) + + are used for logging a message with a tag identifying the workload. Issuing an error message marks + the workload as failed. + +The transaction context provides methods for implementation of the transaction logics: + +.. function:: tx() + + the reference to the FDB transaction object + +.. function:: continueAfter(future, cont, retryOnError = true) + + set a continuation to be executed when the future is ready. The ``retryOnError`` flag controls whether + the transaction should be automatically retried in case the future results in a retriable error. + +.. function:: continueAfterAll(futures, cont) + + takes a vector of futures and sets a continuation to be executed when all of the futures get ready. + The transaction is retried if at least one of the futures results in an error. This method is useful + for handling multiple concurrent reads. + +.. function:: commit() + + commit and finish the transaction. If the commit is successful, the execution proceeds to the + continuation of ``execTransaction()``. In case of a retriable error the transaction is + automatically retried. A fatal error results in a failure of the workoad. + + +.. function:: done() + + finish the transaction without committing. This method should be used to finish read transactions. + The transaction gets destroyed and execution proceeds to the continuation of ``execTransaction()``. + Each transaction must be finished either by ``commit()`` or ``done()``, because otherwise + the framework considers that the transaction is still being executed, so it won't destroy it and + won't call the continuation. + +.. function:: onError(err) + + Handle an error: restart the transaction in case of a retriable error, otherwise fail the workload. + This method is typically used in the continuation of ``continueAfter`` called with + ``retryOnError=false`` as a fallback to the default error handling. + +A workload execution ends automatically when it is marked as failed or its last continuation does not +schedule any new task or transaction. + +The workload class should be defined in the namespace FdbApiTester. The file name convention is +``Tester{Name}Workload.cpp`` so that we distinguish them from the source files of simulation workloads. + +Basic Workload Example +====================== + +The code below implements a workload that consists of only two transactions. The first one sets a +randomly generated key to a randomly generated value, and the second one reads the key and checks if +the returned value matches the written one. + +.. literalinclude:: ../../../bindings/c/test/apitester/TesterExampleWorkload.cpp + :language: C++ + :lines: 21- + +The workload is implemented in the method ``setAndGet``. It generates a random key and a random value +and executes a transaction that writes that key-value pair and commits. In the continuation of the +first ``execTransaction`` call, we execute the second transaction that reads the same key. The read +operation returns a future. So we call ``continueAfter`` to set a continuation for that future. In the +continuation we check if the returned value matches the written one and finish the transaction by +calling ``ctx->done()``. 
After completing the second transaction we execute the continuation passed +as parameter to the ``setAndGet`` method by the start method. In this case it is ``NO_OP_TASK``, which +does nothing and so finishes the workload. + +Finally, we declare an instance ``WorkloadFactory`` to register this workload with the name ``SetAndGet``. + +Note that we use ``workloadId`` as a key prefix. This is necessary for isolating the key space of this +workload, because the framework may be instructed to create multiple instances of the ``SetAndGet`` +workload. If we do not isolate the key space, another workload can write a different value for the +same key and so break the assumption of the test. + +The workload is implemented using the internal C++ API, implemented in ``fdb_api.hpp``. It introduces +a set of classes representing the FDB objects (transactions, futures, etc.). These classes provide C++-style +methods wrapping FDB C API calls and automate memory management by means of reference counting. + +Implementing Control Structures +=============================== + +Our basic workload executes just 2 transactions, but in practice we want to have workloads that generate +multiple transactions. The following code demonstrates how we can modify our basic workload to generate +multiple transactions in a loop. + +.. code-block:: C++ + + class SetAndGetWorkload : public WorkloadBase { + public: + ... + int numIterations; + int iterationsLeft; + + SetAndGetWorkload(const WorkloadConfig& config) : WorkloadBase(config) { + keyPrefix = fdb::toBytesRef(fmt::format("{}/", workloadId)); + numIterations = config.getIntOption("numIterations", 1000); + } + + void start() override { + iterationsLeft = numIterations; + setAndGetLoop(); + } + + void setAndGetLoop() { + if (iterationsLeft == 0) { + return; + } + iterationsLeft--; + setAndGet([this]() { setAndGetLoop(); }); + } + ... + } + +We introduce a workload parameter ``numIterations`` to specify the number of iterations. If not specified +in the test configuration it defaults to 1000. + +The method ``setAndGetLoop`` implements the loop that decrements iterationsLeft counter until it reaches 0 +and each iteration calls setAndGet with a continuation that returns the execution to the loop. As you +can see we don't need any change in ``setAndGet``, just call it with another continuation. + +The pattern of passing a continuation as a parameter also can be used to decompose the workload into a +sequence of steps. For example, we can introduce setup and cleanUp steps to our workload and modify the +``setAndGetLoop`` to make it composable with an arbitrary continuation: + +.. code-block:: C++ + + void start() override { + setup([this](){ + iterationsLeft = numIterations; + setAndGetLoop([this](){ + cleanup(NO_OP_TASK); + }); + }); + } + + void setAndGetLoop(TTaskFct cont) { + if (iterationsLeft == 0) { + schedule(cont); + } + iterationsLeft--; + setAndGet([this, cont]() { setAndGetLoop(cont); }); + } + + void setup(TTaskFct cont) { ... } + + void cleanup(TTaskFct cont) { ... } + +Note that we call ``schedule(cont)`` in ``setAndGetLoop`` instead of calling the continuation directly. +In this way we avoid keeping ``setAndGetLoop`` in the call stack, when executing the next step. + +Subclassing ApiWorkload +======================= + +``ApiWorkload`` is an abstract subclass of ``WorkloadBase`` that provides a framework for a typical +implementation of API test workloads. 
It implements a workflow consisting of cleaning up the key space
+of the workload, populating it with newly generated data and then running a loop consisting of random
+database operations. The concrete subclasses of ``ApiWorkload`` are expected to override the method
+``randomOperation`` with an implementation of their specific random operations.
+
+The ``ApiWorkload`` maintains a local key-value store that mirrors the part of the database state
+relevant to the workload. A successful database write operation should be followed by a continuation
+that performs equivalent changes in the local store, and the results of a database read operation should
+be validated against the values from the local store.
+
+Test Configuration
+==================
+
+A concrete test configuration is specified by a TOML file. The file must contain one ``[[test]]`` section
+specifying the general settings for test execution followed by one or more ``[[test.workload]]``
+configuration sections, specifying the workloads to be executed and their parameters. The specified
+workloads are started all at once and executed concurrently.
+
+The ``[[test]]`` section can contain the following options:
+
+- ``title``: descriptive title of the test
+- ``multiThreaded``: enable multi-threading (default: false)
+- ``minFdbThreads`` and ``maxFdbThreads``: the number of FDB (network) threads to be randomly selected
+  from the given range (default: 1-1). Used only if ``multiThreaded=true``. It is also important to use
+  multiple database instances to make use of the multithreading.
+- ``minDatabases`` and ``maxDatabases``: the number of database instances to be randomly selected from
+  the given range (default 1-1). The transactions of all workloads are randomly load-balanced over the
+  pool of database instances.
+- ``minClients`` and ``maxClients``: the number of clients, i.e. instances of each workload, to be
+  randomly selected from the given range (default 1-8).
+- ``minClientThreads`` and ``maxClientThreads``: the number of client threads, i.e. the threads used
+  for execution of the workload, to be randomly selected from the given range (default 1-1).
+- ``blockOnFutures``: use blocking waits on futures instead of scheduling future callbacks asynchronously
+  (default: false)
+- ``buggify``: enable client-side failure injection (default: false)
+- ``databasePerTransaction``: create a separate database instance for each transaction (default: false).
+  It is a special mode useful for testing bugs related to creation and destruction of database instances.
+- ``fdbCallbacksOnExternalThreads``: enables the option ``FDB_NET_OPTION_CALLBACKS_ON_EXTERNAL_THREADS``,
+  causing the callbacks of futures to be executed directly on the threads of the external FDB clients
+  rather than on the thread of the local FDB client.
+
+The workload section ``[[test.workload]]`` must contain a ``name`` attribute matching the registered name
+of the workload to be executed. Other options are workload-specific.
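+
+Putting these pieces together, a complete test specification might look as follows. This is only a sketch:
+the option values are arbitrary examples, and ``SetAndGet`` with its ``numIterations`` option refers to the
+example workload developed above.
+
+.. code-block:: toml
+
+    [[test]]
+    title = 'SetAndGet Correctness'
+    multiThreaded = true
+    minFdbThreads = 2
+    maxFdbThreads = 4
+    minDatabases = 2
+    maxDatabases = 4
+    minClients = 1
+    maxClients = 8
+
+        [[test.workload]]
+        name = 'SetAndGet'
+        numIterations = 1000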
+
+The subclasses of the ``ApiWorkload`` inherit the following configuration options:
+
+- ``minKeyLength`` and ``maxKeyLength``: the size range of randomly generated keys (default: 1-64)
+- ``minValueLength`` and ``maxValueLength``: the size range of randomly generated values
+  (default: 1-1000)
+- ``maxKeysPerTransaction``: the maximum number of keys per transaction (default: 50)
+- ``initialSize``: the number of key-value pairs in the initially populated database (default: 1000)
+- ``readExistingKeysRatio``: the probability of choosing an existing key for read operations
+  (default: 0.9)
+- ``numRandomOperations``: the number of random operations to be executed per workload (default: 1000)
+- ``runUntilStop``: run the workload indefinitely until the stop command is received (default: false).
+  This execution mode is used in upgrade tests and other scripted tests, where the workload needs to
+  keep running continuously until completion of the scripted test.
+- ``numOperationsForProgressCheck``: the number of operations to be performed to confirm a progress
+  check (default: 10). This option is used in combination with ``runUntilStop``. Progress checks are
+  initiated by a test script to check if the client workload is successfully progressing after a
+  cluster change.
+
+Executing the Tests
+===================
+
+The ``fdb_c_api_tester`` executable takes a single TOML file as a parameter and executes the test
+according to its specification. Before running it, we must create an FDB cluster and pass its cluster file
+as a parameter to ``fdb_c_api_tester``. Note that multithreaded tests also need to be provided with an
+external client library.
+
+For example, we can create a temporary cluster and use it for execution of one of the existing API tests:
+
+.. code-block:: bash
+
+    ${srcDir}/tests/TestRunner/tmp_cluster.py --build-dir ${buildDir} -- \
+        ${buildDir}/bin/fdb_c_api_tester \
+        --cluster-file @CLUSTER_FILE@ \
+        --external-client-library=${buildDir}/bindings/c/libfdb_c_external.so \
+        --test-file ${srcDir}/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml
+
+The test specifications added to the ``bindings/c/test/apitester/tests/`` directory are executed as a part
+of the regression test suite. They can be executed using the ``ctest`` target ``fdb_c_api_tests``:
+
+.. code-block:: bash
+
+    ctest -R fdb_c_api_tests -VV
diff --git a/documentation/sphinx/source/command-line-interface.rst b/documentation/sphinx/source/command-line-interface.rst
index 09969a0640..9c51196a35 100644
--- a/documentation/sphinx/source/command-line-interface.rst
+++ b/documentation/sphinx/source/command-line-interface.rst
@@ -64,7 +64,7 @@ The ``commit`` command commits the current transaction. Any sets or clears execu
 configure
 ---------
-The ``configure`` command changes the database configuration.
Its syntax is ``configure [new|tss] [single|double|triple|three_data_hall|three_datacenter] [ssd|memory] [grv_proxies=] [commit_proxies=] [resolvers=] [logs=] [count=] [perpetual_storage_wiggle=] [perpetual_storage_wiggle_locality=<:|0>] [storage_migration_type={disabled|aggressive|gradual}] [tenant_mode={disabled|optional_experimental|required_experimental}] [encryption_at_rest_mode={aes_256_ctr|disabled}]``. The ``new`` option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. When ``new`` is used, both a redundancy mode and a storage engine must be specified. @@ -153,13 +153,6 @@ If ``description=`` is specified, the description field in the cluster fil For more information on setting the cluster description, see :ref:`configuration-setting-cluster-description`. -createtenant ------------- - -The ``createtenant`` command is used to create new tenants in the cluster. Its syntax is ``createtenant ``. - -The tenant name can be any byte string that does not begin with the ``\xff`` byte. If the tenant already exists, ``fdbcli`` will report an error. - defaulttenant ------------- @@ -167,13 +160,6 @@ The ``defaulttenant`` command configures ``fdbcli`` to run its commands without The active tenant cannot be changed while a transaction (using ``begin``) is open. -deletetenant ------------- - -The ``deletetenant`` command is used to delete tenants from the cluster. Its syntax is ``deletetenant ``. - -In order to delete a tenant, it must be empty. To delete a tenant with data, first clear that data using the ``clear`` command. If the tenant does not exist, ``fdbcli`` will report an error. - exclude ------- @@ -231,33 +217,8 @@ The ``getrangekeys`` command fetches keys in a range. Its syntax is ``getrangeke Note that :ref:`characters can be escaped ` when specifying keys (or values) in ``fdbcli``. -gettenant ---------- - -The ``gettenant`` command fetches metadata for a given tenant and displays it. Its syntax is ``gettenant [JSON]``. - -Included in the output of this command are the ``id`` and ``prefix`` assigned to the tenant. If the tenant does not exist, ``fdbcli`` will report an error. If ``JSON`` is specified, then the output will be written as a JSON document:: - - { - "tenant": { - "id": 0, - "prefix": { - "base64": "AAAAAAAAAAU=", - "printable": "\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x05", - } - }, - "type": "success" - } - -In the event of an error, the output will include an error message:: - - { - "error": "...", - "type": "error" - } - - getversion - ---------- +getversion +---------- The ``getversion`` command fetches the current read version of the cluster or currently running transaction. @@ -346,13 +307,6 @@ Attempts to kill all specified processes. Each address should include the IP and Attempts to kill all known processes in the cluster. -listtenants ------------ - -The ``listtenants`` command prints the names of tenants in the cluster. Its syntax is ``listtenants [BEGIN] [END] [LIMIT]``. - -By default, the ``listtenants`` command will print up to 100 entries from the entire range of tenants. A narrower sub-range can be printed using the optional ``[BEGIN]`` and ``[END]`` parameters, and the limit can be changed by specifying an integer ``[LIMIT]`` parameter. - lock ---- @@ -417,13 +371,6 @@ heap Enables heap profiling for the specified process. -renametenant ------------- - -The ``renametenant`` command can rename an existing tenant to a new name. Its syntax is ``renametenant ``. 
- -This command requires that ``OLD_NAME`` is a tenant that already exists on the cluster, and that ``NEW_NAME`` is not already a name of a tenant in the cluster. - reset ----- @@ -484,6 +431,141 @@ status json .. _cli-throttle: +tenant +------ + +The ``tenant`` command is used to view and manage the tenants in a cluster. The ``tenant`` command has the following subcommands: + +create +^^^^^^ + +``tenant create [tenant_group=]`` + +Creates a new tenant in the cluster. + +``NAME`` - The desired name of the tenant. The name can be any byte string that does not begin with the ``\xff`` byte. + +``TENANT_GROUP`` - The tenant group the tenant will be placed in. + +delete +^^^^^^ + +``tenant delete `` + +Deletes a tenant from the cluster. The tenant must be empty. + +``NAME`` - the name of the tenant to delete. + +list +^^^^ + +``tenant list [BEGIN] [END] [LIMIT]`` + +Lists the tenants present in the cluster. + +``BEGIN`` - the first tenant to list. Defaults to the empty tenant name ``""``. + +``END`` - the exclusive end tenant to list. Defaults to ``\xff\xff``. + +``LIMIT`` - the number of tenants to list. Defaults to 100. + +get +^^^ + +``tenant get [JSON]`` + +Prints the metadata for a tenant. + +``NAME`` - the name of the tenant to print. + +``JSON`` - if specified, the output of the command will be printed in the form of a JSON string:: + + { + "tenant": { + "id": 0, + "prefix": { + "base64": "AAAAAAAAAAU=", + "printable": "\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x05", + } + }, + "type": "success" + } + +In the event of an error, the JSON output will include an error message:: + + { + "error": "...", + "type": "error" + } + +configure +^^^^^^^^^ + +``tenant configure <[unset] tenant_group[=GROUP_NAME]>`` + +Changes the configuration of a tenant. + +``TENANT_NAME`` - the name of the tenant to reconfigure. + +The following tenant fields can be configured: + +``tenant_group`` - changes the tenant group a tenant is assigned to. If ``unset`` is specified, the tenant will be configured to not be in a group. Otherwise, ``GROUP_NAME`` must be specified to the new group that the tenant should be made a member of. + +rename +^^^^^^ + +``tenant rename `` + +Changes the name of an existing tenant. + +``OLD_NAME`` - the name of the tenant being renamed. + +``NEW_NAME`` - the desired name of the tenant. This name must not already be in use. + + +tenantgroup +----------- + +The ``tenantgroup`` command is used to view details about the tenant groups in a cluster. The ``tenantgroup`` command has the following subcommands: + +list +^^^^ + +``tenantgroup list [BEGIN] [END] [LIMIT]`` + +Lists the tenant groups present in the cluster. + +``BEGIN`` - the first tenant group to list. Defaults to the empty tenant group name ``""``. + +``END`` - the exclusive end tenant group to list. Defaults to ``\xff\xff``. + +``LIMIT`` - the number of tenant groups to list. Defaults to 100. + +get +^^^ + +``tenantgroup get [JSON]`` + +Prints the metadata for a tenant group. + +``NAME`` - the name of the tenant group to print. 
+ +``JSON`` - if specified, the output of the command will be printed in the form of a JSON string:: + + { + "tenant_group": { + "assigned_cluster": "cluster1", + }, + "type": "success" + } + +In the event of an error, the JSON output will include an error message:: + + { + "error": "...", + "type": "error" + } + throttle -------- diff --git a/documentation/sphinx/source/configuration.rst b/documentation/sphinx/source/configuration.rst index 699c811139..5d52d40910 100644 --- a/documentation/sphinx/source/configuration.rst +++ b/documentation/sphinx/source/configuration.rst @@ -416,6 +416,9 @@ FoundationDB will never use processes on the same machine for the replication of ``three_data_hall`` mode FoundationDB stores data in triplicate, with one copy on a storage server in each of three data halls. The transaction logs are replicated four times, with two data halls containing two replicas apiece. Four available machines (two in each of two data halls) are therefore required to make progress. This configuration enables the cluster to remain available after losing a single data hall and one machine in another data hall. +``three_data_hall_fallback`` mode + FoundationDB stores data in duplicate, with one copy each on a storage server in two of three data halls. The transaction logs are replicated four times, with two data halls containing two replicas apiece. Four available machines (two in each of two data halls) are therefore required to make progress. This configuration is similar to ``three_data_hall``, differing only in that data is stored on two instead of three replicas. This configuration is useful to unblock data distribution when a data hall becomes temporarily unavailable. Because ``three_data_hall_fallback`` reduces the redundancy level to two, it should only be used as a temporary measure to restore cluster health during a datacenter outage. + Datacenter-aware mode --------------------- diff --git a/documentation/sphinx/source/data-modeling.rst b/documentation/sphinx/source/data-modeling.rst index 146b006809..5972c5110a 100644 --- a/documentation/sphinx/source/data-modeling.rst +++ b/documentation/sphinx/source/data-modeling.rst @@ -1,7 +1,6 @@ .. default-domain:: py .. default-domain:: py .. highlight:: python -.. module:: fdb .. Required substitutions for api-common.rst.inc diff --git a/documentation/sphinx/source/developer-guide.rst b/documentation/sphinx/source/developer-guide.rst index 3bf5ec30c0..c889e4909b 100644 --- a/documentation/sphinx/source/developer-guide.rst +++ b/documentation/sphinx/source/developer-guide.rst @@ -1,7 +1,6 @@ .. default-domain:: py .. default-domain:: py .. highlight:: python -.. module:: fdb .. Required substitutions for api-common.rst.inc diff --git a/documentation/sphinx/source/global-configuration.rst b/documentation/sphinx/source/global-configuration.rst index aad4c31d93..663ad26eb4 100644 --- a/documentation/sphinx/source/global-configuration.rst +++ b/documentation/sphinx/source/global-configuration.rst @@ -82,10 +82,10 @@ Values must always be encoded according to the :ref:`api-python-tuple-layer`. // In GlobalConfig.actor.h extern const KeyRef myGlobalConfigKey; // In GlobalConfig.actor.cpp - const KeyRef myGlobalConfigKey = LiteralStringRef("config/key"); + const KeyRef myGlobalConfigKey = "config/key"_sr; // When you want to set the value.. 
- Tuple value = Tuple().appendDouble(1.5); + Tuple value = Tuple::makeTuple((double)1.5); FDBTransaction* tr = ...; tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); diff --git a/documentation/sphinx/source/index.rst b/documentation/sphinx/source/index.rst index 40c7a76279..167bebda43 100644 --- a/documentation/sphinx/source/index.rst +++ b/documentation/sphinx/source/index.rst @@ -50,6 +50,7 @@ The latest changes are detailed in :ref:`release-notes`. The documentation has t :hidden: local-dev + internal-dev-tools why-foundationdb technical-overview client-design diff --git a/documentation/sphinx/source/internal-dev-tools.rst b/documentation/sphinx/source/internal-dev-tools.rst new file mode 100644 index 0000000000..ea80947312 --- /dev/null +++ b/documentation/sphinx/source/internal-dev-tools.rst @@ -0,0 +1,58 @@ +################## +Internal Dev Tools +################## + +Code Probes +=========== + +Code probes are a mechanism in FDB to prove that certain code-paths are being tested under the right conditions. They differ from code coverage in multiple ways (explained below). + +The general format of a code probe is: + +.. code-block:: C++ + + CODE_PROBE(, "Comment", [annotations...]); + +A simple example of a code probe could look as follows: + +.. code-block:: C++ + + CODE_PROBE(self->forceRecovery, "Resolver detects forced recovery", probe::context::sim2); + +On a very high level, the above code will indicate that whenever this line is executed and ``self->forceRecovery`` is ``true``, we ran into some interesting case. In addition this probe is also annotated with ``probe::context::sim2``. This indicates that we expect this code to be eventually hit in simulation. + +By default, FDB simply will write a trace-line when this code is hit and the condition is ``true``. If the code is never hit, the simulator will, at the end of the run, print the code probe but set the ``covered`` field to ``false``. This all happens in the context of a single simulation run (``fdbserver`` doesn't have a concept of ensembles). This information is written into the log file. ``TestHarness`` (see below) will then use this information to write code probe statistics to the ensemble in the Joshua cluster (if the test is run in Joshua). + +We expect that ALL code probes will be hit in a nightly run. In the future we can potentially use this feature for other things (like instructing the simulator to do an extensive search starting when one of these probes is being hit). + +In addition to ``context`` annotations, users can also define and pass assertions. For example: + +.. code-block:: C++ + + CODE_PROBE(condition, "Some comment", assert::simOnly); + +These will add an assertion to the code. In addition to that, the simulator will not print missed code probes that asserted that the probe won't be hit in simulation. + +Test Harness +============ + +TestHarness is our primary testing tool. It has multiple jobs: + +* *Running*: It can run a test in Joshua. +* *Statistics*: It will choose a test to run based on previous runs (within the same ensemble) spent CPU time for each test. It does that by writing statistics about the test at the end of each run. +* *Reporting*: After an ensemble has finished (or while it is running), ``TestHarness`` can be used to generate a report in ``xml`` or ``json``. + +Test Harness can be found in the FDB source repository under ``contrib/TestHarness2``. 
It has a weak dependency to `joshua `_ (if Test Harness can find joshua it will report back about failed tests, otherwise it will just print out general statistics about the ensemble). Joshua will call Test Harness as follows: + +.. code-block:: shell + + python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} + +Here the seed is a random number generated by joshua and ``OLDBINDIR`` is a directory path where the old fdb binaries can be found (this is needed for restart tests). If one wants to retry a test they can pass the previous joshua seed, a directory path that has *exactly* the same content as ``OLDBINARYDIR``, plus the reported statistics to the test harness app. This should then re-run the same code as before. + +In order to figure out what command line arguments ``test_harness.app`` (and ``test_harness.results``) accepts, one can check the contents of ``contrib/TestHarness2/test_harness/config.py``. + +Reporting +--------- + +After a joshua ensemble completed, ``test_harness.results`` can be used in order to get a report on the ensemble. This will include, by default, a list of all failed tests (similar to ``joshua tail --errors``, though in a more human readable file). For completed ensemble it will also print code probes that weren't hit often enough. An ensemble is considered to be successful if no simulation runs completed with an error AND all code probes have been hit sufficiently often. diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 79cad80a8f..bd4f1388d9 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -379,7 +379,9 @@ "log_server_min_free_space", "log_server_min_free_space_ratio", "storage_server_durability_lag", - "storage_server_list_fetch_failed" + "storage_server_list_fetch_failed", + "blob_worker_lag", + "blob_worker_missing" ] }, "description":"The database is not being saturated by the workload." @@ -400,7 +402,9 @@ "log_server_min_free_space", "log_server_min_free_space_ratio", "storage_server_durability_lag", - "storage_server_list_fetch_failed" + "storage_server_list_fetch_failed", + "blob_worker_lag", + "blob_worker_missing" ] }, "description":"The database is not being saturated by the workload." @@ -523,7 +527,8 @@ "duplicate_mutation_streams", "duplicate_mutation_fetch_timeout", "primary_dc_missing", - "fetch_primary_dc_timeout" + "fetch_primary_dc_timeout", + "fetch_storage_wiggler_stats_timeout" ] }, "issues":[ @@ -598,7 +603,7 @@ "counter":0, "roughness":0.0 }, - "memory_errors":{ // measures number of proxy_memory_limit_exceeded errors + "memory_errors":{ // measures number of (commit/grv)_proxy_memory_limit_exceeded errors "hz":0.0, "counter":0, "roughness":0.0 @@ -785,6 +790,11 @@ "disabled", "optional_experimental", "required_experimental" + ]}, + "encryption_at_rest_mode": { + "$enum":[ + "disabled", + "aes_256_ctr" ]} }, "data":{ diff --git a/documentation/sphinx/source/mr-status.rst b/documentation/sphinx/source/mr-status.rst index 5eb404bbd4..ed550cbee7 100644 --- a/documentation/sphinx/source/mr-status.rst +++ b/documentation/sphinx/source/mr-status.rst @@ -131,6 +131,9 @@ min_free_space_ratio Running out of space (approaching 5% limit). log_server_min_free_space Log server running out of space (approaching 100MB limit). log_server_min_free_space_ratio Log server running out of space (approaching 5% limit). 
storage_server_durability_lag Storage server durable version falling behind. +storage_server_list_fetch_failed Unable to fetch storage server list. +blob_worker_lag Blob worker granule version falling behind. +blob_worker_missing No blob workers are reporting metrics. =================================== ==================================================== The JSON path ``cluster.qos.throttled_tags``, when it exists, is an Object containing ``"auto"`` , ``"manual"`` and ``"recommended"``. The possible fields for those object are in the following table: diff --git a/documentation/sphinx/source/release-notes/release-notes-710.rst b/documentation/sphinx/source/release-notes/release-notes-710.rst index d9312f3781..9dad6e05af 100644 --- a/documentation/sphinx/source/release-notes/release-notes-710.rst +++ b/documentation/sphinx/source/release-notes/release-notes-710.rst @@ -2,6 +2,61 @@ Release Notes ############# +7.1.23 +====== +* Same as 7.1.22 release with AVX enabled. + +7.1.22 +====== +* Released with AVX disabled. +* Added new latency samples for GetValue, GetRange, QueueWait, and VersionWait in storage servers. `(PR #8215) `_ +* Fixed a rare partial data write for TLogs. `(PR #8210) `_ +* Added HTTP proxy support for backup agents. `(PR #8193) `_ +* Fixed a memory bug of secondary queries in index prefetch. `(PR #8195) `_, `(PR #8190) `_ +* Introduced STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT knob to recreate SS at io_timeout errors. `(PR #8123) `_ +* Fixed two TLog stopped bugs and a CC leader replacement bug. `(PR #8081) `_ +* Added back RecoveryAvailable trace event for status's seconds_since_last_recovered field. `(PR #8068) `_ + +7.1.21 +====== +* Same as 7.1.20 release with AVX enabled. + +7.1.20 +====== +* Released with AVX disabled. +* Fixed missing localities for fdbserver that can cause cross DC calls among storage servers. `(PR #7995) `_ +* Removed extremely spammy trace event in FetchKeys and fixed transaction_profiling_analyzer.py. `(PR #7934) `_ +* Fixed bugs when GRV proxy returns an error. `(PR #7860) `_ + +7.1.19 +====== +* Same as 7.1.18 release with AVX enabled. + +7.1.18 +====== +* Released with AVX disabled. +* Added knobs for the minimum and the maximum of the Ratekeeper's default priority. `(PR #7820) `_ +* Fixed bugs in ``getRange`` of the special key space. `(PR #7778) `_, `(PR #7720) `_ +* Added debug ID for secondary queries in index prefetching. `(PR #7755) `_ +* Changed hostname resolving to prefer IPv6 addresses. `(PR #7750) `_ +* Added more transaction debug events for prefetch queries. `(PR #7732) `_ + +7.1.17 +====== +* Same as 7.1.16 release with AVX enabled. + +7.1.16 +====== +* Released with AVX disabled. +* Fixed a crash bug when cluster controller shuts down. `(PR #7706) `_ +* Fixed a storage server failure when getReadVersion returns an error. `(PR #7688) `_ +* Fixed unbounded status json generation. `(PR #7680) `_ +* Fixed ScopeEventFieldTypeMismatch error for TLogMetrics. `(PR #7640) `_ +* Added getMappedRange latency metrics. `(PR #7632) `_ +* Fixed a version vector performance bug due to not updating client side tag cache. `(PR #7616) `_ +* Fixed DiskReadSeconds and DiskWriteSeconds calculation in ProcessMetrics. `(PR #7609) `_ +* Added Rocksdb compression and data size stats. `(PR #7596) `_ + 7.1.15 ====== * Same as 7.1.14 release with AVX enabled. @@ -58,7 +113,7 @@ Release Notes * Added support of the reboot command in go bindings. `(PR #7270) `_ * Fixed several issues in profiling special keys using GlobalConfig. 
`(PR #7120) `_ * Fixed a stuck transaction system bug due to inconsistent recovery transaction version. `(PR #7261) `_ -* Fixed a unknown_error crash due to not resolving hostnames. `(PR #7254) `_ +* Fixed an unknown_error crash due to not resolving hostnames. `(PR #7254) `_ * Fixed a heap-use-after-free bug. `(PR #7250) `_ * Fixed a performance issue that remote TLogs are sending too many pops to log routers. `(PR #7235) `_ * Fixed an issue that SharedTLogs are not displaced and leaking disk space. `(PR #7246) `_ diff --git a/documentation/sphinx/source/release-notes/release-notes-720.rst b/documentation/sphinx/source/release-notes/release-notes-720.rst index 1496b27c4f..c0093232b4 100644 --- a/documentation/sphinx/source/release-notes/release-notes-720.rst +++ b/documentation/sphinx/source/release-notes/release-notes-720.rst @@ -20,6 +20,8 @@ Fixes ----- * In ``fdbcli``, integer options are now expressed as integers rather than byte strings (e.g. ``option on TIMEOUT 1000``). `(PR #7571) `_ +* Fixed the bug in ``ConflictingKeysImpl::getRange`` which happens when an underlying conflicting range contains the read range. Added additional test coverage for validating random ``getRange`` results from special keys. `(PR #7597) `_ +* Fixed the bug in ``SpecialKeyRangeAsyncImpl::getRange`` that the local cache is updated incorrectly after a cross-module read range if it touched more than one ``SpecialKeyRangeAsyncImpl`` in resolving key selectors. Extended the ``SpecialKeySpaceCorrectness`` workload to catch the bug. `(PR #7671) `_ Status ------ diff --git a/documentation/sphinx/source/special-keys.rst b/documentation/sphinx/source/special-keys.rst index 1a278d19b4..75877d922b 100644 --- a/documentation/sphinx/source/special-keys.rst +++ b/documentation/sphinx/source/special-keys.rst @@ -22,16 +22,15 @@ Each special key that existed before api version 630 is its own module. These ar #. ``\xff\xff/cluster_file_path`` - See :ref:`cluster file client access ` #. ``\xff\xff/status/json`` - See :doc:`Machine-readable status ` +#. ``\xff\xff/worker_interfaces`` - key as the worker's network address and value as the serialized ClientWorkerInterface, not transactional + Prior to api version 630, it was also possible to read a range starting at ``\xff\xff/worker_interfaces``. This is mostly an implementation detail of fdbcli, but it's available in api version 630 as a module with prefix ``\xff\xff/worker_interfaces/``. -Api version 630 includes two new modules: +Api version 630 includes three new modules: #. ``\xff\xff/transaction/`` - information about the current transaction #. ``\xff\xff/metrics/`` - various metrics, not transactional - -Api version 720 includes one new module: - #. ``\xff\xff/clusterId`` - returns an immutable unique ID for a cluster Transaction module @@ -209,6 +208,8 @@ that process, and wait for necessary data to be moved away. #. ``\xff\xff/management/options/excluded_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/excluded_locality/``. Setting this key only has an effect in the current transaction and is not persisted on commit. #. ``\xff\xff/management/options/failed_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/failed_locality/``. Setting this key only has an effect in the current transaction and is not persisted on commit. #. ``\xff\xff/management/tenant/map/`` Read/write. 
Setting a key in this range to any value will result in a tenant being created with name ````. Clearing a key in this range will delete the tenant with name ````. Reading all or a portion of this range will return the list of tenants currently present in the cluster, excluding any changes in this transaction. Values read in this range will be JSON objects containing the metadata for the associated tenants. +#. ``\xff\xff/management/tenant/rename/`` Read/write. Setting a key in this range to an unused tenant name will result in the tenant with the name ```` to be renamed to the value provided. If the rename operation is a transaction retried in a loop, it is possible for the rename to be applied twice, in which case ``tenant_not_found`` or ``tenant_already_exists`` errors may be returned. This can be avoided by checking for the tenant's existence first. +#. ``\xff\xff/management/options/worker_interfaces/verify`` Read/write. Setting this key will add a verification phase in reading ``\xff\xff/worker_interfaces``. Setting this key only has an effect in the current transaction and is not persisted on commit. Try to establish connections with every worker from the list returned by Cluster Controller and only return those workers that the client can connect to. This option is now only used in fdbcli commands ``kill``, ``suspend`` and ``expensive_data_check`` to populate the worker list. An exclusion is syntactically either an ip address (e.g. ``127.0.0.1``), or an ip address and port (e.g. ``127.0.0.1:4500``) or any locality (e.g ``locality_dcid:primary-satellite`` or @@ -275,7 +276,6 @@ Deprecated Keys Listed below are the special keys that have been deprecated. Special key(s) will no longer be accessible when the client specifies an API version equal to or larger than the version where they were deprecated. Clients specifying older API versions will be able to continue using the deprecated key(s). #. ``\xff\xff/management/profiling/`` Deprecated as of API version 720. The corresponding functionalities are now covered by the global configuration module. For details, see :doc:`global-configuration`. Read/write. Changing these two keys will change the corresponding system keys ``\xff\x02/fdbClientInfo/``, respectively. The value of ``\xff\xff/management/client_txn_sample_rate`` is a literal text of ``double``, and the value of ``\xff\xff/management/client_txn_size_limit`` is a literal text of ``int64_t``. A special value ``default`` can be set to or read from these two keys, representing the client profiling is disabled. In addition, ``clear`` in this range is not allowed. For more details, see help text of ``fdbcli`` command ``profile client``. -#. ``\xff\xff/management/tenant_map/`` Removed as of API version 720 and renamed to ``\xff\xff/management/tenant/map/``. Versioning ========== diff --git a/documentation/sphinx/source/tenants.rst b/documentation/sphinx/source/tenants.rst index d22603b20e..07bd7b2a42 100644 --- a/documentation/sphinx/source/tenants.rst +++ b/documentation/sphinx/source/tenants.rst @@ -2,6 +2,8 @@ Tenants ####### +.. _multi-tenancy: + .. warning :: Tenants are currently experimental and are not recommended for use in production. FoundationDB provides a feature called tenants that allow you to configure one or more named transaction domains in your cluster. A transaction domain is a key-space in which a transaction is allowed to operate, and no tenant operations are allowed to use keys outside the tenant key-space. 
Tenants can be useful for managing separate, unrelated use-cases and preventing them from interfering with each other. They can also be helpful for defining safe boundaries when moving a subset of data between clusters. @@ -49,7 +51,7 @@ All operations performed within a tenant transaction will occur within the tenan Raw access ---------- -When operating in the tenant mode ``required_experimental``, transactions are not ordinarily permitted to run without using a tenant. In order to access the system keys or perform maintenance operations that span multiple tenants, it is required to use the ``RAW_ACCESS`` transaction option to access the global key-space. It is an error to specify ``RAW_ACCESS`` on a transaction that is configured to use a tenant. +When operating in the tenant mode ``required_experimental`` or using a metacluster, transactions are not ordinarily permitted to run without using a tenant. In order to access the system keys or perform maintenance operations that span multiple tenants, it is required to use the ``RAW_ACCESS`` transaction option to access the global key-space. It is an error to specify ``RAW_ACCESS`` on a transaction that is configured to use a tenant. .. note :: Setting the ``READ_SYSTEM_KEYS`` or ``ACCESS_SYSTEM_KEYS`` options implies ``RAW_ACCESS`` for your transaction. diff --git a/documentation/sphinx/source/tls.rst b/documentation/sphinx/source/tls.rst index 3fb4a08d0c..dcb0c2c930 100644 --- a/documentation/sphinx/source/tls.rst +++ b/documentation/sphinx/source/tls.rst @@ -126,11 +126,11 @@ Default Values Certificate file default location ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The default behavior when the certificate or key file is not specified is to look for a file named ``fdb.pem`` in the current working directory. If this file is not present, an attempt is made to load a file from a system-dependent location as follows: +The default behavior when the certificate or key file is not specified is to look for files named ``cert.pem`` or ``key.pem`` respectively, in system-dependent locations as follows: -* Linux: ``/etc/foundationdb/fdb.pem`` -* macOS: ``/usr/local/etc/foundationdb/fdb.pem`` -* Windows: ``C:\ProgramData\foundationdb\fdb.pem`` +* Linux: ``/etc/foundationdb/cert.pem`` and ``/etc/foundationdb/key.pem`` +* macOS: ``/usr/local/etc/foundationdb/cert.pem`` and ``/usr/local/etc/foundationdb/key.pem`` +* Windows: ``C:\ProgramData\foundationdb\cert.pem`` and ``C:\ProgramData\foundationdb\key.pem`` Default Peer Verification ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -214,9 +214,12 @@ Certificate creation If your organization already makes use of certificates for access control and securing communications, you should ask your security expert for organizational procedure for obtaining and verifying certificates. If the goal of enabling TLS is to make sure that only known machines can join or access the FoundationDB cluster and for securing communications, then creating your own certificates can serve these purposes. -The following set of commands uses the OpenSSL command-line tools to create a self-signed certificate and private key. 
The certificate is then joined with the private key in the output ``fdb.pem`` file:: +The following set of commands uses the OpenSSL command-line tools to create a self-signed certificate and private key:: + + user@host:> openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout key.pem -out cert.pem + +Optionally, the certificate can be joined with the private key as supplied as both certificate and key files:: - user@host:> openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout private.key -out cert.crt user@host:> cat cert.crt private.key > fdb.pem Peer verification diff --git a/documentation/tutorial/tutorial.actor.cpp b/documentation/tutorial/tutorial.actor.cpp index 9d980ff3d6..245e6d09e3 100644 --- a/documentation/tutorial/tutorial.actor.cpp +++ b/documentation/tutorial/tutorial.actor.cpp @@ -478,7 +478,7 @@ ACTOR Future fdbClient() { state Transaction tx(db); state std::string keyPrefix = "/tut/"; state Key startKey; - state KeyRef endKey = LiteralStringRef("/tut0"); + state KeyRef endKey = "/tut0"_sr; state int beginIdx = 0; loop { try { @@ -494,7 +494,7 @@ ACTOR Future fdbClient() { RangeResult range = wait(tx.getRange(KeyRangeRef(startKey, endKey), 100)); for (int i = 0; i < 10; ++i) { Key k = Key(keyPrefix + std::to_string(beginIdx + deterministicRandom()->randomInt(0, 100))); - tx.set(k, LiteralStringRef("foo")); + tx.set(k, "foo"_sr); } wait(tx.commit()); std::cout << "Committed\n"; diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index ab4bf0fd3d..24601308e1 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -18,6 +18,7 @@ * limitations under the License. */ +#include "flow/ApiVersion.h" #include "fmt/format.h" #include "fdbbackup/BackupTLSConfig.h" #include "fdbclient/JsonBuilder.h" @@ -36,6 +37,7 @@ #include "flow/genericactors.actor.h" #include "flow/TLSConfig.actor.h" +#include "fdbclient/DatabaseContext.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/BackupAgent.actor.h" #include "fdbclient/Status.h" @@ -903,12 +905,12 @@ CSimpleOpt::SOption g_rgDBPauseOptions[] = { SO_END_OF_OPTIONS }; -const KeyRef exeAgent = LiteralStringRef("backup_agent"); -const KeyRef exeBackup = LiteralStringRef("fdbbackup"); -const KeyRef exeRestore = LiteralStringRef("fdbrestore"); -const KeyRef exeFastRestoreTool = LiteralStringRef("fastrestore_tool"); // must be lower case -const KeyRef exeDatabaseAgent = LiteralStringRef("dr_agent"); -const KeyRef exeDatabaseBackup = LiteralStringRef("fdbdr"); +const KeyRef exeAgent = "backup_agent"_sr; +const KeyRef exeBackup = "fdbbackup"_sr; +const KeyRef exeRestore = "fdbrestore"_sr; +const KeyRef exeFastRestoreTool = "fastrestore_tool"_sr; // must be lower case +const KeyRef exeDatabaseAgent = "dr_agent"_sr; +const KeyRef exeDatabaseBackup = "fdbdr"_sr; extern const char* getSourceVersion(); @@ -927,7 +929,7 @@ void parentWatcher(void* parentHandle) { static void printVersion() { printf("FoundationDB " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n"); printf("source version %s\n", getSourceVersion()); - printf("protocol %llx\n", (long long)currentProtocolVersion.version()); + printf("protocol %llx\n", (long long)currentProtocolVersion().version()); } static void printBuildInformation() { @@ -1349,7 +1351,7 @@ ProgramExe getProgramType(std::string programExe) { } #endif // For debugging convenience, remove .debug suffix if present. 
- if (StringRef(programExe).endsWith(LiteralStringRef(".debug"))) + if (StringRef(programExe).endsWith(".debug"_sr)) programExe = programExe.substr(0, programExe.size() - 6); // Check if backup agent @@ -1854,11 +1856,7 @@ ACTOR Future submitDBBackup(Database src, std::string tagName) { try { state DatabaseBackupAgent backupAgent(src); - - // Backup everything, if no ranges were specified - if (backupRanges.size() == 0) { - backupRanges.push_back_deep(backupRanges.arena(), normalKeys); - } + ASSERT(!backupRanges.empty()); wait(backupAgent.submitBackup( dest, KeyRef(tagName), backupRanges, StopWhenDone::False, StringRef(), StringRef(), LockDB::True)); @@ -1904,6 +1902,7 @@ ACTOR Future submitBackup(Database db, int initialSnapshotIntervalSeconds, int snapshotIntervalSeconds, Standalone> backupRanges, + bool encryptionEnabled, std::string tagName, bool dryRun, WaitForComplete waitForCompletion, @@ -1912,11 +1911,7 @@ ACTOR Future submitBackup(Database db, IncrementalBackupOnly incrementalBackupOnly) { try { state FileBackupAgent backupAgent; - - // Backup everything, if no ranges were specified - if (backupRanges.size() == 0) { - backupRanges.push_back_deep(backupRanges.arena(), normalKeys); - } + ASSERT(!backupRanges.empty()); if (dryRun) { state KeyBackedTag tag = makeBackupTag(tagName); @@ -1963,6 +1958,7 @@ ACTOR Future submitBackup(Database db, snapshotIntervalSeconds, tagName, backupRanges, + encryptionEnabled, stopWhenDone, usePartitionedLog, incrementalBackupOnly)); @@ -2016,11 +2012,7 @@ ACTOR Future switchDBBackup(Database src, ForceAction forceAction) { try { state DatabaseBackupAgent backupAgent(src); - - // Backup everything, if no ranges were specified - if (backupRanges.size() == 0) { - backupRanges.push_back_deep(backupRanges.arena(), normalKeys); - } + ASSERT(!backupRanges.empty()); wait(backupAgent.atomicSwitchover(dest, KeyRef(tagName), backupRanges, StringRef(), StringRef(), forceAction)); printf("The DR on tag `%s' was successfully switched.\n", printable(StringRef(tagName)).c_str()); @@ -2287,9 +2279,7 @@ ACTOR Future runRestore(Database db, OnlyApplyMutationLogs onlyApplyMutationLogs, InconsistentSnapshotOnly inconsistentSnapshotOnly, Optional encryptionKeyFile) { - if (ranges.empty()) { - ranges.push_back_deep(ranges.arena(), normalKeys); - } + ASSERT(!ranges.empty()); if (targetVersion != invalidVersion && !targetTimestamp.empty()) { fprintf(stderr, "Restore target version and target timestamp cannot both be specified\n"); @@ -2313,7 +2303,7 @@ ACTOR Future runRestore(Database db, throw restore_error(); } - origDb = Database::createDatabase(originalClusterFile, Database::API_VERSION_LATEST); + origDb = Database::createDatabase(originalClusterFile, ApiVersion::LATEST_VERSION); Version v = wait(timeKeeperVersionFromDatetime(targetTimestamp, origDb.get())); fmt::print("Timestamp '{0}' resolves to version {1}\n", targetTimestamp, v); targetVersion = v; @@ -2370,7 +2360,7 @@ ACTOR Future runRestore(Database db, fmt::print("Restored to version {}\n", restoredVersion); } } else { - state Optional rset = wait(bc->getRestoreSet(targetVersion, ranges)); + state Optional rset = wait(bc->getRestoreSet(targetVersion, db, ranges)); if (!rset.present()) { fmt::print(stderr, @@ -2447,8 +2437,8 @@ ACTOR Future runFastRestoreTool(Database db, dbVersion, LockDB::True, randomUID, - LiteralStringRef(""), - LiteralStringRef(""))); + ""_sr, + ""_sr)); // TODO: Support addPrefix and removePrefix if (waitForDone) { // Wait for parallel restore to finish and unlock DB after that @@ -2480,7 
+2470,7 @@ ACTOR Future runFastRestoreTool(Database db, restoreVersion = dbVersion; } - state Optional rset = wait(bc->getRestoreSet(restoreVersion)); + state Optional rset = wait(bc->getRestoreSet(restoreVersion, db)); if (!rset.present()) { fmt::print(stderr, "Insufficient data to restore to version {}\n", restoreVersion); throw restore_invalid_version(); @@ -2685,7 +2675,8 @@ ACTOR Future queryBackup(const char* name, Version restoreVersion, std::string originalClusterFile, std::string restoreTimestamp, - Verbose verbose) { + Verbose verbose, + Optional cx) { state UID operationId = deterministicRandom()->randomUniqueID(); state JsonBuilderObject result; state std::string errorMessage; @@ -2719,7 +2710,7 @@ ACTOR Future queryBackup(const char* name, return Void(); } - Database origDb = Database::createDatabase(originalClusterFile, Database::API_VERSION_LATEST); + Database origDb = Database::createDatabase(originalClusterFile, ApiVersion::LATEST_VERSION); Version v = wait(timeKeeperVersionFromDatetime(restoreTimestamp, origDb)); result["restore_timestamp"] = restoreTimestamp; result["restore_timestamp_resolved_version"] = v; @@ -2750,7 +2741,7 @@ ACTOR Future queryBackup(const char* name, format("the specified restorable version %lld is not valid", restoreVersion)); return Void(); } - Optional fileSet = wait(bc->getRestoreSet(restoreVersion, keyRangesFilter)); + Optional fileSet = wait(bc->getRestoreSet(restoreVersion, cx, keyRangesFilter)); if (fileSet.present()) { int64_t totalRangeFilesSize = 0, totalLogFilesSize = 0; result["restore_version"] = fileSet.get().targetVersion; @@ -3087,7 +3078,7 @@ static void addKeyRange(std::string optionValue, Standalone connectToCluster(std::string const& clusterFile, } try { - db = Database::createDatabase(ccf, -1, IsInternal::True, localities); + db = Database::createDatabase(ccf, ApiVersion::LATEST_VERSION, IsInternal::True, localities); } catch (Error& e) { if (!quiet) { fprintf(stderr, "ERROR: %s\n", e.what()); @@ -3376,6 +3367,8 @@ int main(int argc, char* argv[]) { bool trace = false; bool quietDisplay = false; bool dryRun = false; + // TODO (Nim): Set this value when we add optional encrypt_files CLI argument to backup agent start + bool encryptionEnabled = true; std::string traceDir = ""; std::string traceFormat = ""; std::string traceLogGroup; @@ -3606,7 +3599,7 @@ int main(int argc, char* argv[]) { case OPT_DESTCONTAINER: destinationContainer = args->OptionArg(); // If the url starts with '/' then prepend "file://" for backwards compatibility - if (StringRef(destinationContainer).startsWith(LiteralStringRef("/"))) + if (StringRef(destinationContainer).startsWith("/"_sr)) destinationContainer = std::string("file://") + destinationContainer; modifyOptions.destURL = destinationContainer; break; @@ -3652,7 +3645,7 @@ int main(int argc, char* argv[]) { case OPT_RESTORECONTAINER: restoreContainer = args->OptionArg(); // If the url starts with '/' then prepend "file://" for backwards compatibility - if (StringRef(restoreContainer).startsWith(LiteralStringRef("/"))) + if (StringRef(restoreContainer).startsWith("/"_sr)) restoreContainer = std::string("file://") + restoreContainer; break; case OPT_DESCRIBE_DEEP: @@ -3943,6 +3936,12 @@ int main(int argc, char* argv[]) { return result.present(); }; + // The fastrestore tool does not yet support multiple ranges and is incompatible with tenants + // or other features that back up data in the system keys + if (backupKeys.empty() && programExe != ProgramExe::FASTRESTORE_TOOL) { + 
addDefaultBackupRanges(backupKeys); + } + switch (programExe) { case ProgramExe::AGENT: if (!initCluster()) @@ -3962,6 +3961,7 @@ int main(int argc, char* argv[]) { initialSnapshotIntervalSeconds, snapshotIntervalSeconds, backupKeys, + encryptionEnabled, tagName, dryRun, waitForDone, @@ -4082,7 +4082,8 @@ int main(int argc, char* argv[]) { restoreVersion, restoreClusterFileOrig, restoreTimestamp, - Verbose{ !quietDisplay })); + Verbose{ !quietDisplay }, + db)); break; case BackupType::DUMP: @@ -4122,7 +4123,7 @@ int main(int argc, char* argv[]) { } try { - db = Database::createDatabase(restoreClusterFileDest, Database::API_VERSION_LATEST); + db = Database::createDatabase(restoreClusterFileDest, ApiVersion::LATEST_VERSION); } catch (Error& e) { fprintf(stderr, "Restore destination cluster file '%s' invalid: %s\n", @@ -4201,7 +4202,7 @@ int main(int argc, char* argv[]) { } try { - db = Database::createDatabase(restoreClusterFileDest, Database::API_VERSION_LATEST); + db = Database::createDatabase(restoreClusterFileDest, ApiVersion::LATEST_VERSION); } catch (Error& e) { fprintf(stderr, "Restore destination cluster file '%s' invalid: %s\n", @@ -4321,19 +4322,19 @@ int main(int argc, char* argv[]) { char* demangled = abi::__cxa_demangle(i->first, NULL, NULL, NULL); if (demangled) { s = demangled; - if (StringRef(s).startsWith(LiteralStringRef("(anonymous namespace)::"))) - s = s.substr(LiteralStringRef("(anonymous namespace)::").size()); + if (StringRef(s).startsWith("(anonymous namespace)::"_sr)) + s = s.substr("(anonymous namespace)::"_sr.size()); free(demangled); } else s = i->first; #else s = i->first; - if (StringRef(s).startsWith(LiteralStringRef("class `anonymous namespace'::"))) - s = s.substr(LiteralStringRef("class `anonymous namespace'::").size()); - else if (StringRef(s).startsWith(LiteralStringRef("class "))) - s = s.substr(LiteralStringRef("class ").size()); - else if (StringRef(s).startsWith(LiteralStringRef("struct "))) - s = s.substr(LiteralStringRef("struct ").size()); + if (StringRef(s).startsWith("class `anonymous namespace'::"_sr)) + s = s.substr("class `anonymous namespace'::"_sr.size()); + else if (StringRef(s).startsWith("class "_sr)) + s = s.substr("class "_sr.size()); + else if (StringRef(s).startsWith("struct "_sr)) + s = s.substr("struct "_sr.size()); #endif typeNames.emplace_back(s, i->first); diff --git a/fdbcli/AdvanceVersionCommand.actor.cpp b/fdbcli/AdvanceVersionCommand.actor.cpp index 223af2d8e5..d3ba08d675 100644 --- a/fdbcli/AdvanceVersionCommand.actor.cpp +++ b/fdbcli/AdvanceVersionCommand.actor.cpp @@ -31,7 +31,7 @@ namespace fdb_cli { -const KeyRef advanceVersionSpecialKey = LiteralStringRef("\xff\xff/management/min_required_commit_version"); +const KeyRef advanceVersionSpecialKey = "\xff\xff/management/min_required_commit_version"_sr; ACTOR Future advanceVersionCommandActor(Reference db, std::vector tokens) { if (tokens.size() != 2) { diff --git a/fdbcli/BlobKeyCommand.actor.cpp b/fdbcli/BlobKeyCommand.actor.cpp new file mode 100644 index 0000000000..34d5b98720 --- /dev/null +++ b/fdbcli/BlobKeyCommand.actor.cpp @@ -0,0 +1,188 @@ +/* + * BlobKeyCommand.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbcli/fdbcli.actor.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/IClientApi.h" +#include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/NativeAPI.actor.h" + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/ThreadHelper.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. + +namespace { + +ACTOR Future printBlobHistory(Database db, Key key, Optional version) { + fmt::print("Printing blob history for {0}", key.printable()); + if (version.present()) { + fmt::print(" @ {0}", version.get()); + } + fmt::print("\n"); + + state Transaction tr(db); + state KeyRange activeGranule; + state KeyRange queryRange(KeyRangeRef(key, keyAfter(key))); + loop { + try { + Standalone> granules = wait(tr.getBlobGranuleRanges(queryRange, 2)); + if (granules.empty()) { + fmt::print("No active granule for {0}\n", key.printable()); + return false; + } + ASSERT(granules.size() == 1); + activeGranule = granules[0]; + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + fmt::print("Active granule: [{0} - {1})\n", activeGranule.begin.printable(), activeGranule.end.printable()); + + // get latest history entry for range + state GranuleHistory history; + loop { + try { + RangeResult result = + wait(tr.getRange(blobGranuleHistoryKeyRangeFor(activeGranule), 1, Snapshot::False, Reverse::True)); + ASSERT(result.size() <= 1); + + if (result.empty()) { + fmt::print("No history entry found\n"); + return true; + } + + std::pair decodedKey = decodeBlobGranuleHistoryKey(result[0].key); + ASSERT(activeGranule == decodedKey.first); + history = GranuleHistory(activeGranule, decodedKey.second, decodeBlobGranuleHistoryValue(result[0].value)); + + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + fmt::print("History:\n\n"); + loop { + // print history + std::string boundaryChangeAction; + if (history.value.parentVersions.empty()) { + boundaryChangeAction = "root"; + } else if (history.value.parentVersions.size() == 1) { + boundaryChangeAction = "split"; + } else { + boundaryChangeAction = "merge"; + } + fmt::print("{0}) {1}\n\t{2}\n\t{3}\n({4})\n\n", + history.version, + history.value.granuleID.toString(), + history.range.begin.printable(), + history.range.end.printable(), + boundaryChangeAction); + // traverse back + + if (history.value.parentVersions.empty() || (version.present() && history.version <= version.get())) { + break; + } + + int i; + for (i = 0; i < history.value.parentBoundaries.size(); i++) { + if (history.value.parentBoundaries[i] <= key) { + break; + } + } + // key should fall between boundaries + ASSERT(i < history.value.parentBoundaries.size()); + KeyRangeRef parentRange(history.value.parentBoundaries[i], history.value.parentBoundaries[i + 1]); + Version parentVersion = history.value.parentVersions[i]; + state Key parentHistoryKey = blobGranuleHistoryKeyFor(parentRange, parentVersion); + state bool foundParent; + + loop { + try { + Optional parentHistoryValue = wait(tr.get(parentHistoryKey)); + foundParent = parentHistoryValue.present(); + if (foundParent) { + std::pair decodedKey = 
decodeBlobGranuleHistoryKey(parentHistoryKey); + history = GranuleHistory( + decodedKey.first, decodedKey.second, decodeBlobGranuleHistoryValue(parentHistoryValue.get())); + } + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + if (!foundParent) { + break; + } + } + + fmt::print("Done\n"); + return true; +} + +} // namespace + +namespace fdb_cli { + +ACTOR Future blobKeyCommandActor(Database localDb, + Optional tenantEntry, + std::vector tokens) { + // enables blob writing for the given range + if (tokens.size() != 3 && tokens.size() != 4) { + printUsage(tokens[0]); + return false; + } + + ASSERT(tokens[1] == "history"_sr); + + Key key; + Optional version; + + if (tenantEntry.present()) { + key = tokens[2].withPrefix(tenantEntry.get().prefix); + } else { + key = tokens[2]; + } + + if (tokens.size() > 3) { + Version v; + int n = 0; + if (sscanf(tokens[3].toString().c_str(), "%" PRId64 "%n", &v, &n) != 1 || n != tokens[3].size()) { + printUsage(tokens[0]); + return false; + } + version = v; + } + + if (key >= "\xff"_sr) { + fmt::print("No blob history for system keyspace\n", key.printable()); + return false; + } else { + bool result = wait(printBlobHistory(localDb, key, version)); + return result; + } +} + +// can extend to other blobkey commands later +CommandFactory blobKeyFactory("blobkey", CommandHelp("blobkey history [version]", "", "")); +} // namespace fdb_cli diff --git a/fdbcli/BlobRangeCommand.actor.cpp b/fdbcli/BlobRangeCommand.actor.cpp index 02b922c4a8..38edaa8568 100644 --- a/fdbcli/BlobRangeCommand.actor.cpp +++ b/fdbcli/BlobRangeCommand.actor.cpp @@ -23,6 +23,7 @@ #include "fdbclient/FDBOptions.g.h" #include "fdbclient/IClientApi.h" #include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/NativeAPI.actor.h" #include "flow/Arena.h" #include "flow/FastRef.h" @@ -31,33 +32,62 @@ namespace { -// copy to standalones for krm -ACTOR Future setBlobRange(Database db, Key startKey, Key endKey, Value value) { - state Reference tr = makeReference(db); - +ACTOR Future getLatestReadVersion(Database db) { + state Transaction tr(db); loop { try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - - // FIXME: check that the set range is currently inactive, and that a revoked range is currently its own - // range in the map and fully set. - - tr->set(blobRangeChangeKey, deterministicRandom()->randomUniqueID().toString()); - // This is not coalescing because we want to keep each range logically separate. 
- wait(krmSetRange(tr, blobRangeKeys.begin, KeyRange(KeyRangeRef(startKey, endKey)), value)); - wait(tr->commit()); - printf("Successfully updated blob range [%s - %s) to %s\n", - startKey.printable().c_str(), - endKey.printable().c_str(), - value.printable().c_str()); - return Void(); + Version rv = wait(tr.getReadVersion()); + fmt::print("Resolved latest read version as {0}\n", rv); + return rv; } catch (Error& e) { - wait(tr->onError(e)); + wait(tr.onError(e)); } } } +// print after delay if not cancelled +ACTOR Future printAfterDelay(double delaySeconds, std::string message) { + wait(delay(delaySeconds)); + fmt::print("{}\n", message); + return Void(); +} + +ACTOR Future doBlobPurge(Database db, Key startKey, Key endKey, Optional version, bool force) { + state Version purgeVersion; + if (version.present()) { + purgeVersion = version.get(); + } else { + wait(store(purgeVersion, getLatestReadVersion(db))); + } + + state Key purgeKey = wait(db->purgeBlobGranules(KeyRange(KeyRangeRef(startKey, endKey)), purgeVersion, {}, force)); + + fmt::print("Blob purge registered for [{0} - {1}) @ {2}\n", startKey.printable(), endKey.printable(), purgeVersion); + + state Future printWarningActor = printAfterDelay( + 5.0, "Waiting for purge to complete. (interrupting this wait with CTRL+C will not cancel the purge)"); + wait(db->waitPurgeGranulesComplete(purgeKey)); + + fmt::print("Blob purge complete for [{0} - {1}) @ {2}\n", startKey.printable(), endKey.printable(), purgeVersion); + + return Void(); +} + +ACTOR Future doBlobCheck(Database db, Key startKey, Key endKey, Optional version) { + state double elapsed = -timer_monotonic(); + + state Version readVersionOut = wait(db->verifyBlobRange(KeyRangeRef(startKey, endKey), version)); + + elapsed += timer_monotonic(); + + fmt::print("Blob check complete for [{0} - {1}) @ {2} in {3:.6f} seconds\n", + startKey.printable(), + endKey.printable(), + readVersionOut, + elapsed); + return Void(); +} + } // namespace namespace fdb_cli { @@ -66,7 +96,7 @@ ACTOR Future blobRangeCommandActor(Database localDb, Optional tenantEntry, std::vector tokens) { // enables blob writing for the given range - if (tokens.size() != 4) { + if (tokens.size() != 4 && tokens.size() != 5) { printUsage(tokens[0]); return false; } @@ -82,31 +112,80 @@ ACTOR Future blobRangeCommandActor(Database localDb, end = tokens[3]; } - if (end > LiteralStringRef("\xff")) { + if (end > "\xff"_sr) { // TODO is this something we want? - printf("Cannot blobbify system keyspace! Problematic End Key: %s\n", tokens[3].printable().c_str()); + fmt::print("Cannot blobbify system keyspace! 
Problematic End Key: {0}\n", tokens[3].printable()); return false; } else if (tokens[2] >= tokens[3]) { - printf("Invalid blob range [%s - %s)\n", tokens[2].printable().c_str(), tokens[3].printable().c_str()); + fmt::print("Invalid blob range [{0} - {1})\n", tokens[2].printable(), tokens[3].printable()); } else { - if (tokencmp(tokens[1], "start")) { - printf("Starting blobbify range for [%s - %s)\n", - tokens[2].printable().c_str(), - tokens[3].printable().c_str()); - wait(setBlobRange(localDb, begin, end, LiteralStringRef("1"))); - } else if (tokencmp(tokens[1], "stop")) { - printf("Stopping blobbify range for [%s - %s)\n", - tokens[2].printable().c_str(), - tokens[3].printable().c_str()); - wait(setBlobRange(localDb, begin, end, StringRef())); + if (tokencmp(tokens[1], "start") || tokencmp(tokens[1], "stop")) { + state bool starting = tokencmp(tokens[1], "start"); + if (tokens.size() > 4) { + printUsage(tokens[0]); + return false; + } + fmt::print("{0} blobbify range for [{1} - {2})\n", + starting ? "Starting" : "Stopping", + tokens[2].printable(), + tokens[3].printable()); + state bool success = false; + if (starting) { + wait(store(success, localDb->blobbifyRange(KeyRangeRef(begin, end)))); + } else { + wait(store(success, localDb->unblobbifyRange(KeyRangeRef(begin, end)))); + } + if (success) { + fmt::print("{0} updated blob range [{1} - {2}) succeeded\n", + starting ? "Starting" : "Stopping", + tokens[2].printable(), + tokens[3].printable()); + } else { + fmt::print("{0} blobbify range for [{1} - {2}) failed\n", + starting ? "Starting" : "Stopping", + tokens[2].printable(), + tokens[3].printable()); + } + return success; + } else if (tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge") || tokencmp(tokens[1], "check")) { + bool purge = tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge"); + bool forcePurge = tokencmp(tokens[1], "forcepurge"); + + Optional version; + if (tokens.size() > 4) { + Version v; + int n = 0; + if (sscanf(tokens[4].toString().c_str(), "%" PRId64 "%n", &v, &n) != 1 || n != tokens[4].size()) { + printUsage(tokens[0]); + return false; + } + version = v; + } + + fmt::print("{0} blob range [{1} - {2}){3}", + purge ? "Purging" : "Checking", + tokens[2].printable(), + tokens[3].printable(), + forcePurge ? " (force)" : ""); + if (version.present()) { + fmt::print(" @ {0}", version.get()); + } + fmt::print("\n"); + + if (purge) { + wait(doBlobPurge(localDb, begin, end, version, forcePurge)); + } else { + wait(doBlobCheck(localDb, begin, end, version)); + } } else { printUsage(tokens[0]); - printf("Usage: blobrange "); return false; } } return true; } -CommandFactory blobRangeFactory("blobrange", CommandHelp("blobrange ", "", "")); +CommandFactory blobRangeFactory( + "blobrange", + CommandHelp("blobrange [version]", "", "")); } // namespace fdb_cli diff --git a/fdbcli/CMakeLists.txt b/fdbcli/CMakeLists.txt index aff53b1f63..c25c335a15 100644 --- a/fdbcli/CMakeLists.txt +++ b/fdbcli/CMakeLists.txt @@ -1,8 +1,16 @@ +include(AddFdbTest) fdb_find_sources(FDBCLI_SRCS) add_flow_target(EXECUTABLE NAME fdbcli SRCS ${FDBCLI_SRCS}) target_include_directories(fdbcli PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include") target_link_libraries(fdbcli PRIVATE fdbclient SimpleOpt) +if (USE_UBSAN) + # The intent is to put typeinfo symbols in the dynamic symbol table so that + # the types in fdbcli and external libfdb_c clients agree for ubsan's vptr + # check. 
This would not be a good idea for the normal build, or if we ever + # start testing old libfdb_c's that are ubsan-instrumented. + target_link_options(fdbcli PRIVATE "-rdynamic") +endif() if(NOT WIN32) target_link_libraries(fdbcli PRIVATE linenoise) @@ -16,3 +24,38 @@ if(NOT OPEN_FOR_IDE) fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbcli DESTINATION bin COMPONENT clients) endif() endif() + +if (NOT WIN32 AND NOT OPEN_FOR_IDE) + add_dependencies(fdbcli external_client) + + add_fdbclient_test( + NAME single_process_fdbcli_tests + COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/fdbcli_tests.py + ${CMAKE_BINARY_DIR} + @CLUSTER_FILE@ + ) + add_fdbclient_test( + NAME multi_process_fdbcli_tests + PROCESS_NUMBER 5 + COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/fdbcli_tests.py + ${CMAKE_BINARY_DIR} + @CLUSTER_FILE@ + 5 + ) + add_fdbclient_test( + NAME single_process_external_client_fdbcli_tests + COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/fdbcli_tests.py + ${CMAKE_BINARY_DIR} + @CLUSTER_FILE@ + --external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so + ) + add_fdbclient_test( + NAME multi_process_external_client_fdbcli_tests + PROCESS_NUMBER 5 + COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/fdbcli_tests.py + ${CMAKE_BINARY_DIR} + @CLUSTER_FILE@ + 5 + --external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so + ) +endif() diff --git a/fdbcli/ConfigureCommand.actor.cpp b/fdbcli/ConfigureCommand.actor.cpp index 37474242e1..26a3da9876 100644 --- a/fdbcli/ConfigureCommand.actor.cpp +++ b/fdbcli/ConfigureCommand.actor.cpp @@ -44,20 +44,20 @@ ACTOR Future configureCommandActor(Reference db, if (tokens.size() < 2) result = ConfigurationResult::NO_OPTIONS_PROVIDED; else { - if (tokens[startToken] == LiteralStringRef("FORCE")) { + if (tokens[startToken] == "FORCE"_sr) { force = true; startToken = 2; } state Optional conf; - if (tokens[startToken] == LiteralStringRef("auto")) { + if (tokens[startToken] == "auto"_sr) { // get cluster status state Reference tr = db->createTransaction(); if (!tr->isValid()) { StatusObject _s = wait(StatusClient::statusFetcher(localDb)); s = _s; } else { - state ThreadFuture> statusValueF = tr->get(LiteralStringRef("\xff\xff/status/json")); + state ThreadFuture> statusValueF = tr->get("\xff\xff/status/json"_sr); Optional statusValue = wait(safeThreadFutureToFuture(statusValueF)); if (!statusValue.present()) { fprintf(stderr, "ERROR: Failed to get status json from the cluster\n"); @@ -166,7 +166,7 @@ ACTOR Future configureCommandActor(Reference db, case ConfigurationResult::CONFLICTING_OPTIONS: case ConfigurationResult::UNKNOWN_OPTION: case ConfigurationResult::INCOMPLETE_CONFIGURATION: - printUsage(LiteralStringRef("configure")); + printUsage("configure"_sr); ret = false; break; case ConfigurationResult::INVALID_CONFIGURATION: @@ -259,7 +259,6 @@ ACTOR Future configureCommandActor(Reference db, fprintf(stderr, "Type `configure perpetual_storage_wiggle=1' to enable the perpetual wiggle, or `configure " "storage_migration_type=gradual' to set the gradual migration type.\n"); - ret = false; break; case ConfigurationResult::SUCCESS_WARN_ROCKSDB_EXPERIMENTAL: printf("Configuration changed\n"); @@ -272,6 +271,14 @@ ACTOR Future configureCommandActor(Reference db, stderr, "WARN: Sharded RocksDB storage engine type is still in experimental stage, not yet production tested.\n"); break; + case ConfigurationResult::DATABASE_IS_REGISTERED: + fprintf(stderr, "ERROR: A cluster cannot change its tenant mode while part of a metacluster.\n"); + ret = false; + 
break; + case ConfigurationResult::ENCRYPTION_AT_REST_MODE_ALREADY_SET: + fprintf(stderr, "ERROR: A cluster cannot change its encryption_at_rest state after database creation.\n"); + ret = false; + break; default: ASSERT(false); ret = false; @@ -305,6 +312,7 @@ void configureGenerator(const char* text, "storage_migration_type=", "tenant_mode=", "blob_granules_enabled=", + "encryption_at_rest_mode=", nullptr }; arrayGenerator(text, line, opts, lc); } @@ -317,7 +325,8 @@ CommandFactory configureFactory( "commit_proxies=|grv_proxies=|logs=|resolvers=>*|" "count=|perpetual_storage_wiggle=|perpetual_storage_wiggle_locality=" "<:|0>|storage_migration_type={disabled|gradual|aggressive}" - "|tenant_mode={disabled|optional_experimental|required_experimental}|blob_granules_enabled={0|1}", + "|tenant_mode={disabled|optional_experimental|required_experimental}|blob_granules_enabled={0|1}" + "|encryption_at_rest_mode={disabled|aes_256_ctr}", "change the database configuration", "The `new' option, if present, initializes a new database with the given configuration rather than changing " "the configuration of an existing one. When used, both a redundancy mode and a storage engine must be " @@ -351,6 +360,9 @@ CommandFactory configureFactory( "tenant_mode=: Sets the tenant mode for the cluster. If " "optional, then transactions can be run with or without specifying tenants. If required, all data must be " "accessed using tenants.\n\n" + "encryption_at_rest_mode=: Sets the cluster encryption data at-rest support for the " + "database. The configuration can be updated ONLY at the time of database creation and once set can't be " + "updated for the lifetime of the database.\n\n" "See the FoundationDB Administration Guide for more information."), &configureGenerator); diff --git a/fdbcli/ConsistencyCheckCommand.actor.cpp b/fdbcli/ConsistencyCheckCommand.actor.cpp index 2e14e71fcc..1f225d1dfe 100644 --- a/fdbcli/ConsistencyCheckCommand.actor.cpp +++ b/fdbcli/ConsistencyCheckCommand.actor.cpp @@ -30,7 +30,7 @@ namespace fdb_cli { -const KeyRef consistencyCheckSpecialKey = LiteralStringRef("\xff\xff/management/consistency_check_suspended"); +const KeyRef consistencyCheckSpecialKey = "\xff\xff/management/consistency_check_suspended"_sr; ACTOR Future consistencyCheckCommandActor(Reference tr, std::vector tokens, diff --git a/fdbcli/ConsistencyScanCommand.actor.cpp b/fdbcli/ConsistencyScanCommand.actor.cpp new file mode 100644 index 0000000000..532e43119a --- /dev/null +++ b/fdbcli/ConsistencyScanCommand.actor.cpp @@ -0,0 +1,122 @@ +/* + * ConsistencyScanCommand.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "fdbcli/fdbcli.actor.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/IClientApi.h" + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/ThreadHelper.actor.h" +#include "fdbclient/ConsistencyScanInterface.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. + +namespace fdb_cli { + +ACTOR Future consistencyScanCommandActor(Database db, std::vector tokens) { + state Reference tr = makeReference(db); + // Here we do not proceed in a try-catch loop since the transaction is always supposed to succeed. + // If not, the outer loop catch block(fdbcli.actor.cpp) will handle the error and print out the error message + state int usageError = 0; + state ConsistencyScanInfo csInfo = ConsistencyScanInfo(); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + + // Get the exisiting consistencyScanInfo object if present + state Optional consistencyScanInfo = wait(ConsistencyScanInfo::getInfo(tr)); + wait(tr->commit()); + if (consistencyScanInfo.present()) + csInfo = ObjectReader::fromStringRef(consistencyScanInfo.get(), IncludeVersion()); + tr->reset(); + + if (tokens.size() == 1) { + printf("Consistency Scan Info: %s\n", csInfo.toString().c_str()); + } else if ((tokens.size() == 2) && tokencmp(tokens[1], "off")) { + csInfo.consistency_scan_enabled = false; + wait(ConsistencyScanInfo::setInfo(tr, csInfo)); + wait(tr->commit()); + } else if ((tokencmp(tokens[1], "on") && tokens.size() > 2)) { + csInfo.consistency_scan_enabled = true; + state std::vector::iterator t; + for (t = tokens.begin() + 2; t != tokens.end(); ++t) { + if (tokencmp(t->toString(), "restart")) { + if (++t != tokens.end()) { + if (tokencmp(t->toString(), "0")) { + csInfo.restart = false; + } else if (tokencmp(t->toString(), "1")) { + csInfo.restart = true; + } else { + usageError = 1; + } + } else { + usageError = 1; + } + } else if (tokencmp(t->toString(), "maxRate")) { + if (++t != tokens.end()) { + char* end; + csInfo.max_rate = std::strtod(t->toString().data(), &end); + if (!std::isspace(*end) && (*end != '\0')) { + fprintf(stderr, "ERROR: %s failed to parse.\n", t->toString().c_str()); + return false; + } + } else { + usageError = 1; + } + } else if (tokencmp(t->toString(), "targetInterval")) { + if (++t != tokens.end()) { + char* end; + csInfo.target_interval = std::strtod(t->toString().data(), &end); + if (!std::isspace(*end) && (*end != '\0')) { + fprintf(stderr, "ERROR: %s failed to parse.\n", t->toString().c_str()); + return false; + } + } else { + usageError = 1; + } + } else { + usageError = 1; + } + } + + if (!usageError) { + wait(ConsistencyScanInfo::setInfo(tr, csInfo)); + wait(tr->commit()); + } + } else { + usageError = 1; + } + + if (usageError) { + printUsage(tokens[0]); + return false; + } + return true; +} + +CommandFactory consistencyScanFactory( + "consistencyscan", + CommandHelp("consistencyscan ", + "enables or disables consistency scan", + "Calling this command with `on' enables the consistency scan process to run the scan with given " + "arguments and `off' will halt the scan. 
" + "Calling this command with no arguments will display if consistency scan is currently enabled.\n")); + +} // namespace fdb_cli \ No newline at end of file diff --git a/fdbcli/CoordinatorsCommand.actor.cpp b/fdbcli/CoordinatorsCommand.actor.cpp index b68d5ab3d3..4680c5393a 100644 --- a/fdbcli/CoordinatorsCommand.actor.cpp +++ b/fdbcli/CoordinatorsCommand.actor.cpp @@ -64,17 +64,26 @@ ACTOR Future changeCoordinators(Reference db, std::vectorstartsWith(nameTokenBegin)) { + if (tok->startsWith(nameTokenBegin) && new_cluster_description.empty()) { new_cluster_description = tok->substr(nameTokenBegin.size()); + auto next = tok - 1; std::copy(tok + 1, tokens.end(), tok); tokens.resize(tokens.size() - 1); - break; + tok = next; + } else if (tok->startsWith(noConfigDB)) { + disableConfigDB = true; + auto next = tok - 1; + std::copy(tok + 1, tokens.end(), tok); + tokens.resize(tokens.size() - 1); + tok = next; } } - state bool automatic = tokens.size() == 2 && tokens[1] == LiteralStringRef("auto"); + state bool automatic = tokens.size() == 2 && tokens[1] == "auto"_sr; state Reference tr = db->createTransaction(); loop { tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); @@ -83,6 +92,10 @@ ACTOR Future changeCoordinators(Reference db, std::vectorset(fdb_cli::clusterDescriptionSpecialKey, new_cluster_description); } + if (disableConfigDB) { + // All that matters is the key is set. + tr->set(fdb_cli::configDBSpecialKey, ""_sr); + } // if auto change, read the special key to retrieve the recommended config if (automatic) { // if previous read failed, retry, otherwise, use the same recommened config @@ -173,9 +186,10 @@ ACTOR Future changeCoordinators(Reference db, std::vector coordinatorsCommandActor(Reference db, std::vector tokens) { if (tokens.size() < 2) { diff --git a/fdbcli/DataDistributionCommand.actor.cpp b/fdbcli/DataDistributionCommand.actor.cpp index 7000bdf5c7..8b7690b009 100644 --- a/fdbcli/DataDistributionCommand.actor.cpp +++ b/fdbcli/DataDistributionCommand.actor.cpp @@ -108,8 +108,8 @@ Future setDDIgnoreRebalanceOff(Reference db, uint8_t DDIgnoreOp namespace fdb_cli { -const KeyRef ddModeSpecialKey = LiteralStringRef("\xff\xff/management/data_distribution/mode"); -const KeyRef ddIgnoreRebalanceSpecialKey = LiteralStringRef("\xff\xff/management/data_distribution/rebalance_ignored"); +const KeyRef ddModeSpecialKey = "\xff\xff/management/data_distribution/mode"_sr; +const KeyRef ddIgnoreRebalanceSpecialKey = "\xff\xff/management/data_distribution/rebalance_ignored"_sr; constexpr auto usage = "Usage: datadistribution |enable " ">\n"; @@ -127,7 +127,7 @@ ACTOR Future dataDistributionCommandActor(Reference db, std::ve printf("Data distribution is turned off.\n"); } else if (tokencmp(tokens[1], "disable")) { if (tokencmp(tokens[2], "ssfailure")) { - wait(success((setHealthyZone(db, LiteralStringRef("IgnoreSSFailures"), 0)))); + wait(success((setHealthyZone(db, "IgnoreSSFailures"_sr, 0)))); printf("Data distribution is disabled for storage server failures.\n"); } else if (tokencmp(tokens[2], "rebalance")) { wait(setDDIgnoreRebalanceOn(db, DDIgnore::REBALANCE_DISK | DDIgnore::REBALANCE_READ)); diff --git a/fdbcli/ExcludeCommand.actor.cpp b/fdbcli/ExcludeCommand.actor.cpp index db67bd8a6e..7c8b7217e0 100644 --- a/fdbcli/ExcludeCommand.actor.cpp +++ b/fdbcli/ExcludeCommand.actor.cpp @@ -227,22 +227,19 @@ ACTOR Future checkForCoordinators(Reference db, std::vector excludeCommandActor(Reference db, std::vector tokens, Future warn) { if (tokens.size() <= 1) { @@ -281,11 
+278,11 @@ ACTOR Future excludeCommandActor(Reference db, std::vectorstartsWith(LocalityData::ExcludeLocalityPrefix) && t->toString().find(':') != std::string::npos) { diff --git a/fdbcli/ExpensiveDataCheckCommand.actor.cpp b/fdbcli/ExpensiveDataCheckCommand.actor.cpp index e9d5c5b989..3be572d3d1 100644 --- a/fdbcli/ExpensiveDataCheckCommand.actor.cpp +++ b/fdbcli/ExpensiveDataCheckCommand.actor.cpp @@ -46,7 +46,7 @@ ACTOR Future expensiveDataCheckCommandActor( if (tokens.size() == 1) { // initialize worker interfaces address_interface->clear(); - wait(getWorkerInterfaces(tr, address_interface)); + wait(getWorkerInterfaces(tr, address_interface, true)); } if (tokens.size() == 1 || tokencmp(tokens[1], "list")) { if (address_interface->size() == 0) { diff --git a/fdbcli/FileConfigureCommand.actor.cpp b/fdbcli/FileConfigureCommand.actor.cpp index e35114c429..8cce2ec543 100644 --- a/fdbcli/FileConfigureCommand.actor.cpp +++ b/fdbcli/FileConfigureCommand.actor.cpp @@ -78,7 +78,7 @@ ACTOR Future fileConfigureCommandActor(Reference db, name + "=" + json_spirit::write_string(json_spirit::mValue(value.get_array()), json_spirit::Output_options::none); } else { - printUsage(LiteralStringRef("fileconfigure")); + printUsage("fileconfigure"_sr); return false; } } diff --git a/fdbcli/IncludeCommand.actor.cpp b/fdbcli/IncludeCommand.actor.cpp index a463772960..be55ac8476 100644 --- a/fdbcli/IncludeCommand.actor.cpp +++ b/fdbcli/IncludeCommand.actor.cpp @@ -92,8 +92,7 @@ ACTOR Future includeServers(Reference db, std::vectorclear(KeyRangeRef(addr.withSuffix(LiteralStringRef(":")), - addr.withSuffix(LiteralStringRef(";")))); + tr->clear(KeyRangeRef(addr.withSuffix(":"_sr), addr.withSuffix(";"_sr))); } } wait(safeThreadFutureToFuture(tr->commit())); @@ -112,9 +111,9 @@ ACTOR Future include(Reference db, std::vector token state bool failed = false; state bool all = false; for (auto t = tokens.begin() + 1; t != tokens.end(); ++t) { - if (*t == LiteralStringRef("all")) { + if (*t == "all"_sr) { all = true; - } else if (*t == LiteralStringRef("failed")) { + } else if (*t == "failed"_sr) { failed = true; } else if (t->startsWith(LocalityData::ExcludeLocalityPrefix) && t->toString().find(':') != std::string::npos) { // if the token starts with 'locality_' prefix. diff --git a/fdbcli/KillCommand.actor.cpp b/fdbcli/KillCommand.actor.cpp index 391bdb3064..c8fa75bb1c 100644 --- a/fdbcli/KillCommand.actor.cpp +++ b/fdbcli/KillCommand.actor.cpp @@ -29,6 +29,7 @@ #include "flow/Arena.h" #include "flow/FastRef.h" #include "flow/ThreadHelper.actor.h" +#include "flow/CodeProbe.h" #include "flow/actorcompiler.h" // This must be the last #include. 
namespace fdb_cli { @@ -43,7 +44,7 @@ ACTOR Future killCommandActor(Reference db, if (tokens.size() == 1) { // initialize worker interfaces address_interface->clear(); - wait(getWorkerInterfaces(tr, address_interface)); + wait(getWorkerInterfaces(tr, address_interface, true)); } if (tokens.size() == 1 || tokencmp(tokens[1], "list")) { if (address_interface->size() == 0) { diff --git a/fdbcli/LockCommand.actor.cpp b/fdbcli/LockCommand.actor.cpp index 1ed988ee34..a2ac2c05cd 100644 --- a/fdbcli/LockCommand.actor.cpp +++ b/fdbcli/LockCommand.actor.cpp @@ -59,7 +59,7 @@ ACTOR Future lockDatabase(Reference db, UID id) { namespace fdb_cli { -const KeyRef lockSpecialKey = LiteralStringRef("\xff\xff/management/db_locked"); +const KeyRef lockSpecialKey = "\xff\xff/management/db_locked"_sr; ACTOR Future lockCommandActor(Reference db, std::vector tokens) { if (tokens.size() != 1) { diff --git a/fdbcli/MaintenanceCommand.actor.cpp b/fdbcli/MaintenanceCommand.actor.cpp index 487490e09f..b6dd8cc139 100644 --- a/fdbcli/MaintenanceCommand.actor.cpp +++ b/fdbcli/MaintenanceCommand.actor.cpp @@ -69,10 +69,10 @@ ACTOR Future printHealthyZone(Reference db) { namespace fdb_cli { -const KeyRangeRef maintenanceSpecialKeyRange = KeyRangeRef(LiteralStringRef("\xff\xff/management/maintenance/"), - LiteralStringRef("\xff\xff/management/maintenance0")); +const KeyRangeRef maintenanceSpecialKeyRange = + KeyRangeRef("\xff\xff/management/maintenance/"_sr, "\xff\xff/management/maintenance0"_sr); // The special key, if present, means data distribution is disabled for storage failures; -const KeyRef ignoreSSFailureSpecialKey = LiteralStringRef("\xff\xff/management/maintenance/IgnoreSSFailures"); +const KeyRef ignoreSSFailureSpecialKey = "\xff\xff/management/maintenance/IgnoreSSFailures"_sr; // add a zone to maintenance and specify the maintenance duration ACTOR Future setHealthyZone(Reference db, StringRef zoneId, double seconds, bool printWarning) { diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp new file mode 100644 index 0000000000..da7c0f79fd --- /dev/null +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -0,0 +1,432 @@ +/* + * MetaclusterCommands.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbcli/fdbcli.actor.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/IClientApi.h" +#include "fdbclient/Knobs.h" +#include "fdbclient/MetaclusterManagement.actor.h" +#include "fdbclient/Schemas.h" + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/ThreadHelper.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +namespace fdb_cli { + +Optional, Optional>> +parseClusterConfiguration(std::vector const& tokens, DataClusterEntry const& defaults, int startIndex) { + Optional entry; + Optional connectionString; + + std::set usedParams; + for (int tokenNum = startIndex; tokenNum < tokens.size(); ++tokenNum) { + StringRef token = tokens[tokenNum]; + bool foundEquals; + StringRef param = token.eat("=", &foundEquals); + if (!foundEquals) { + fmt::print(stderr, + "ERROR: invalid configuration string `{}'. String must specify a value using `='.\n", + param.toString().c_str()); + return {}; + } + std::string value = token.toString(); + if (!usedParams.insert(value).second) { + fmt::print( + stderr, "ERROR: configuration parameter `{}' specified more than once.\n", param.toString().c_str()); + return {}; + } + if (tokencmp(param, "max_tenant_groups")) { + entry = defaults; + + int n; + if (sscanf(value.c_str(), "%d%n", &entry.get().capacity.numTenantGroups, &n) != 1 || n != value.size() || + entry.get().capacity.numTenantGroups < 0) { + fmt::print(stderr, "ERROR: invalid number of tenant groups `{}'.\n", value.c_str()); + return {}; + } + } else if (tokencmp(param, "connection_string")) { + connectionString = ClusterConnectionString(value); + } else { + fmt::print(stderr, "ERROR: unrecognized configuration parameter `{}'.\n", param.toString().c_str()); + return {}; + } + } + + return std::make_pair(connectionString, entry); +} + +void printMetaclusterConfigureOptionsUsage() { + fmt::print("max_tenant_groups sets the maximum number of tenant groups that can be assigned\n" + "to the named data cluster.\n"); + fmt::print("connection_string sets the connection string for the named data cluster.\n"); +} + +// metacluster create command +ACTOR Future metaclusterCreateCommand(Reference db, std::vector tokens) { + if (tokens.size() != 3) { + fmt::print("Usage: metacluster create_experimental \n\n"); + fmt::print("Configures the cluster to be a management cluster in a metacluster.\n"); + fmt::print("NAME is an identifier used to distinguish this metacluster from other metaclusters.\n"); + return false; + } + + Optional errorStr = wait(MetaclusterAPI::createMetacluster(db, tokens[2])); + if (errorStr.present()) { + fmt::print("ERROR: {}.\n", errorStr.get()); + } else { + fmt::print("The cluster has been configured as a metacluster.\n"); + } + return true; +} + +// metacluster decommission command +ACTOR Future metaclusterDecommissionCommand(Reference db, std::vector tokens) { + if (tokens.size() != 2) { + fmt::print("Usage: metacluster decommission\n\n"); + fmt::print("Converts the current cluster from a metacluster management cluster back into an\n"); + fmt::print("ordinary cluster. 
It must be called on a cluster with no registered data clusters.\n"); + return false; + } + + wait(MetaclusterAPI::decommissionMetacluster(db)); + + fmt::print("The cluster is no longer a metacluster.\n"); + return true; +} + +// metacluster register command +ACTOR Future metaclusterRegisterCommand(Reference db, std::vector tokens) { + if (tokens.size() < 4) { + fmt::print("Usage: metacluster register connection_string=\n" + "[max_tenant_groups=]\n\n"); + fmt::print("Adds a data cluster to a metacluster.\n"); + fmt::print("NAME is used to identify the cluster in future commands.\n"); + printMetaclusterConfigureOptionsUsage(); + return false; + } + + DataClusterEntry defaultEntry; + auto config = parseClusterConfiguration(tokens, defaultEntry, 3); + if (!config.present()) { + return false; + } else if (!config.get().first.present()) { + fmt::print(stderr, "ERROR: connection_string must be configured when registering a cluster.\n"); + return false; + } + + wait(MetaclusterAPI::registerCluster( + db, tokens[2], config.get().first.get(), config.get().second.orDefault(defaultEntry))); + + fmt::print("The cluster `{}' has been added\n", printable(tokens[2]).c_str()); + return true; +} + +// metacluster remove command +ACTOR Future metaclusterRemoveCommand(Reference db, std::vector tokens) { + if (tokens.size() < 3 || tokens.size() > 4 || (tokens.size() == 4 && tokens[2] != "FORCE"_sr)) { + fmt::print("Usage: metacluster remove [FORCE] \n\n"); + fmt::print("Removes the specified data cluster from a metacluster.\n"); + fmt::print("If FORCE is specified, then the cluster will be detached even if it has\n" + "tenants assigned to it.\n"); + return false; + } + + state ClusterNameRef clusterName = tokens[tokens.size() - 1]; + wait(MetaclusterAPI::removeCluster(db, clusterName, tokens.size() == 4)); + + fmt::print("The cluster `{}' has been removed\n", printable(clusterName).c_str()); + return true; +} + +// metacluster configure command +ACTOR Future metaclusterConfigureCommand(Reference db, std::vector tokens) { + if (tokens.size() < 4) { + fmt::print("Usage: metacluster configure |\n" + "connection_string=> ...\n\n"); + fmt::print("Updates the configuration of the metacluster.\n"); + printMetaclusterConfigureOptionsUsage(); + return false; + } + + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + Optional metadata = wait(MetaclusterAPI::tryGetClusterTransaction(tr, tokens[2])); + if (!metadata.present()) { + throw cluster_not_found(); + } + + auto config = parseClusterConfiguration(tokens, metadata.get().entry, 3); + if (!config.present()) { + return false; + } + + MetaclusterAPI::updateClusterMetadata( + tr, tokens[2], metadata.get(), config.get().first, config.get().second); + + wait(safeThreadFutureToFuture(tr->commit())); + break; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + + return true; +} + +// metacluster list command +ACTOR Future metaclusterListCommand(Reference db, std::vector tokens) { + if (tokens.size() > 5) { + fmt::print("Usage: metacluster list [BEGIN] [END] [LIMIT]\n\n"); + fmt::print("Lists the data clusters in a metacluster.\n"); + fmt::print("Only cluster names in the range BEGIN - END will be printed.\n"); + fmt::print("An optional LIMIT can be specified to limit the number of results (default 100).\n"); + return false; + } + + state ClusterNameRef begin = tokens.size() > 2 ? 
tokens[2] : ""_sr; + state ClusterNameRef end = tokens.size() > 3 ? tokens[3] : "\xff"_sr; + int limit = 100; + + if (tokens.size() > 4) { + int n = 0; + if (sscanf(tokens[3].toString().c_str(), "%d%n", &limit, &n) != 1 || n != tokens[3].size() || limit < 0) { + fmt::print(stderr, "ERROR: invalid limit {}\n", tokens[3].toString().c_str()); + return false; + } + } + + std::map clusters = wait(MetaclusterAPI::listClusters(db, begin, end, limit)); + if (clusters.empty()) { + if (tokens.size() == 2) { + fmt::print("The metacluster has no registered data clusters\n"); + } else { + fmt::print("The metacluster has no registered data clusters in the specified range\n"); + } + } + + int index = 0; + for (auto cluster : clusters) { + fmt::print(" {}. {}\n", ++index, printable(cluster.first).c_str()); + } + + return true; +} + +// metacluster get command +ACTOR Future metaclusterGetCommand(Reference db, std::vector tokens) { + if (tokens.size() > 4 || (tokens.size() == 4 && tokens[3] != "JSON"_sr)) { + fmt::print("Usage: metacluster get [JSON]\n\n"); + fmt::print("Prints metadata associated with the given data cluster.\n"); + fmt::print("If JSON is specified, then the output will be in JSON format.\n"); + return false; + } + + state bool useJson = tokens.size() == 4; + + try { + DataClusterMetadata metadata = wait(MetaclusterAPI::getCluster(db, tokens[2])); + + if (useJson) { + json_spirit::mObject obj; + obj["type"] = "success"; + obj["cluster"] = metadata.toJson(); + fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str()); + } else { + fmt::print(" connection string: {}\n", metadata.connectionString.toString().c_str()); + fmt::print(" cluster state: {}\n", DataClusterEntry::clusterStateToString(metadata.entry.clusterState)); + fmt::print(" tenant group capacity: {}\n", metadata.entry.capacity.numTenantGroups); + fmt::print(" allocated tenant groups: {}\n", metadata.entry.allocated.numTenantGroups); + } + } catch (Error& e) { + if (useJson) { + json_spirit::mObject obj; + obj["type"] = "error"; + obj["error"] = e.what(); + fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str()); + return false; + } else { + throw; + } + } + + return true; +} + +// metacluster status command +ACTOR Future metaclusterStatusCommand(Reference db, std::vector tokens) { + if (tokens.size() < 2 || tokens.size() > 3) { + fmt::print("Usage: metacluster status [JSON]\n\n"); + fmt::print("Prints metacluster metadata.\n"); + fmt::print("If JSON is specified, then the output will be in JSON format.\n"); + return false; + } + + state bool useJson = tokens.size() == 3; + + try { + std::map clusters = + wait(MetaclusterAPI::listClusters(db, ""_sr, "\xff"_sr, CLIENT_KNOBS->MAX_DATA_CLUSTERS)); + + ClusterUsage totalCapacity; + ClusterUsage totalAllocated; + for (auto cluster : clusters) { + totalCapacity.numTenantGroups += + std::max(cluster.second.entry.capacity.numTenantGroups, cluster.second.entry.allocated.numTenantGroups); + totalAllocated.numTenantGroups += cluster.second.entry.allocated.numTenantGroups; + } + + if (useJson) { + json_spirit::mObject obj; + obj["type"] = "success"; + + json_spirit::mObject metaclusterObj; + metaclusterObj["data_clusters"] = (int)clusters.size(); + metaclusterObj["capacity"] = totalCapacity.toJson(); + metaclusterObj["allocated"] = totalAllocated.toJson(); + + obj["metacluster"] = metaclusterObj; + fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), 
json_spirit::pretty_print).c_str()); + } else { + fmt::print(" number of data clusters: {}\n", clusters.size()); + fmt::print(" tenant group capacity: {}\n", totalCapacity.numTenantGroups); + fmt::print(" allocated tenant groups: {}\n", totalAllocated.numTenantGroups); + } + + return true; + } catch (Error& e) { + if (useJson) { + json_spirit::mObject obj; + obj["type"] = "error"; + obj["error"] = e.what(); + fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str()); + return false; + } else { + throw; + } + } +} + +// metacluster command +Future metaclusterCommand(Reference db, std::vector tokens) { + if (tokens.size() == 1) { + printUsage(tokens[0]); + return true; + } else if (tokencmp(tokens[1], "create_experimental")) { + return metaclusterCreateCommand(db, tokens); + } else if (tokencmp(tokens[1], "decommission")) { + return metaclusterDecommissionCommand(db, tokens); + } else if (tokencmp(tokens[1], "register")) { + return metaclusterRegisterCommand(db, tokens); + } else if (tokencmp(tokens[1], "remove")) { + return metaclusterRemoveCommand(db, tokens); + } else if (tokencmp(tokens[1], "configure")) { + return metaclusterConfigureCommand(db, tokens); + } else if (tokencmp(tokens[1], "list")) { + return metaclusterListCommand(db, tokens); + } else if (tokencmp(tokens[1], "get")) { + return metaclusterGetCommand(db, tokens); + } else if (tokencmp(tokens[1], "status")) { + return metaclusterStatusCommand(db, tokens); + } else { + printUsage(tokens[0]); + return true; + } +} + +void metaclusterGenerator(const char* text, + const char* line, + std::vector& lc, + std::vector const& tokens) { + if (tokens.size() == 1) { + const char* opts[] = { + "create_experimental", "decommission", "register", "remove", "configure", "list", "get", "status", nullptr + }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() > 1 && (tokencmp(tokens[1], "register") || tokencmp(tokens[1], "configure"))) { + const char* opts[] = { "max_tenant_groups=", "connection_string=", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if ((tokens.size() == 2 && tokencmp(tokens[1], "status")) || + (tokens.size() == 3 && tokencmp(tokens[1], "get"))) { + const char* opts[] = { "JSON", nullptr }; + arrayGenerator(text, line, opts, lc); + } +} + +std::vector metaclusterHintGenerator(std::vector const& tokens, bool inArgument) { + if (tokens.size() == 1) { + return { "", "[ARGS]" }; + } else if (tokencmp(tokens[1], "create_experimental")) { + return { "" }; + } else if (tokencmp(tokens[1], "decommission")) { + return {}; + } else if (tokencmp(tokens[1], "register") && tokens.size() < 5) { + static std::vector opts = { "", + "connection_string=", + "[max_tenant_groups=]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "remove") && tokens.size() < 4) { + static std::vector opts = { "[FORCE]", "" }; + if (tokens.size() == 2) { + return opts; + } else if (tokens.size() == 3 && (inArgument || tokens[2].size() == "FORCE"_sr.size()) && + "FORCE"_sr.startsWith(tokens[2])) { + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else { + return {}; + } + } else if (tokencmp(tokens[1], "configure")) { + static std::vector opts = { + "", "|connection_string=>" + }; + return std::vector(opts.begin() + std::min(1, tokens.size() - 2), opts.end()); + } else if (tokencmp(tokens[1], "list") && tokens.size() < 5) { + static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]" }; + return 
std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "get") && tokens.size() < 4) { + static std::vector opts = { "", "[JSON]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "status") && tokens.size() == 2) { + return { "[JSON]" }; + } else { + return {}; + } +} + +CommandFactory metaclusterRegisterFactory( + "metacluster", + CommandHelp("metacluster [ARGS]", + "view and manage a metacluster", + "`create_experimental' and `decommission' set up or deconfigure a metacluster.\n" + "`register' and `remove' add and remove data clusters from the metacluster.\n" + "`configure' updates the configuration of a data cluster.\n" + "`list' prints a list of data clusters in the metacluster.\n" + "`get' prints the metadata for a particular data cluster.\n" + "`status' prints metacluster metadata.\n"), + &metaclusterGenerator, + &metaclusterHintGenerator); + +} // namespace fdb_cli diff --git a/fdbcli/ProfileCommand.actor.cpp b/fdbcli/ProfileCommand.actor.cpp index d7f66ce080..52325d3de8 100644 --- a/fdbcli/ProfileCommand.actor.cpp +++ b/fdbcli/ProfileCommand.actor.cpp @@ -97,8 +97,8 @@ ACTOR Future profileCommandActor(Database db, } } - Tuple rate = Tuple().appendDouble(sampleRate); - Tuple size = Tuple().append(sizeLimit); + Tuple rate = Tuple::makeTuple(sampleRate); + Tuple size = Tuple::makeTuple(sizeLimit); tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate), rate.pack()); tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSizeLimit), size.pack()); @@ -115,17 +115,13 @@ ACTOR Future profileCommandActor(Database db, return false; } // Hold the reference to the standalone's memory - state ThreadFuture kvsFuture = - tr->getRange(KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), - LiteralStringRef("\xff\xff/worker_interfaces0")), - CLIENT_KNOBS->TOO_MANY); + state ThreadFuture kvsFuture = tr->getRange( + KeyRangeRef("\xff\xff/worker_interfaces/"_sr, "\xff\xff/worker_interfaces0"_sr), CLIENT_KNOBS->TOO_MANY); RangeResult kvs = wait(safeThreadFutureToFuture(kvsFuture)); ASSERT(!kvs.more); for (const auto& pair : kvs) { - auto ip_port = - (pair.key.endsWith(LiteralStringRef(":tls")) ? pair.key.removeSuffix(LiteralStringRef(":tls")) - : pair.key) - .removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/")); + auto ip_port = (pair.key.endsWith(":tls"_sr) ? 
pair.key.removeSuffix(":tls"_sr) : pair.key) + .removePrefix("\xff\xff/worker_interfaces/"_sr); printf("%s\n", printable(ip_port).c_str()); } } else { diff --git a/fdbcli/QuotaCommand.actor.cpp b/fdbcli/QuotaCommand.actor.cpp index ba8546fa15..e6a86e9b51 100644 --- a/fdbcli/QuotaCommand.actor.cpp +++ b/fdbcli/QuotaCommand.actor.cpp @@ -25,8 +25,6 @@ namespace { enum class LimitType { RESERVED, TOTAL }; -enum class OpType { READ, WRITE }; - Optional parseTag(StringRef token) { if (token.size() > CLIENT_KNOBS->MAX_TRANSACTION_TAG_LENGTH) { return {}; @@ -36,25 +34,15 @@ Optional parseTag(StringRef token) { } Optional parseLimitType(StringRef token) { - if (token == "reserved"_sr) { + if (token == "reserved_throughput"_sr) { return LimitType::RESERVED; - } else if (token == "total"_sr) { + } else if (token == "total_throughput"_sr) { return LimitType::TOTAL; } else { return {}; } } -Optional parseOpType(StringRef token) { - if (token == "read"_sr) { - return OpType::READ; - } else if (token == "write"_sr) { - return OpType::WRITE; - } else { - return {}; - } -} - Optional parseLimitValue(StringRef token) { try { return std::stod(token.toString()); @@ -63,7 +51,7 @@ Optional parseLimitValue(StringRef token) { } } -ACTOR Future getQuota(Reference db, TransactionTag tag, LimitType limitType, OpType opType) { +ACTOR Future getQuota(Reference db, TransactionTag tag, LimitType limitType) { state Reference tr = db->createTransaction(); loop { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); @@ -74,14 +62,10 @@ ACTOR Future getQuota(Reference db, TransactionTag tag, LimitTy fmt::print("\n"); } else { auto const quota = ThrottleApi::TagQuotaValue::fromValue(v.get()); - if (limitType == LimitType::TOTAL && opType == OpType::READ) { - fmt::print("{}\n", quota.totalReadQuota); - } else if (limitType == LimitType::TOTAL && opType == OpType::WRITE) { - fmt::print("{}\n", quota.totalWriteQuota); - } else if (limitType == LimitType::RESERVED && opType == OpType::READ) { - fmt::print("{}\n", quota.reservedReadQuota); - } else if (limitType == LimitType::RESERVED && opType == OpType::WRITE) { - fmt::print("{}\n", quota.reservedWriteQuota); + if (limitType == LimitType::TOTAL) { + fmt::print("{}\n", quota.totalQuota * CLIENT_KNOBS->READ_COST_BYTE_FACTOR); + } else if (limitType == LimitType::RESERVED) { + fmt::print("{}\n", quota.reservedQuota * CLIENT_KNOBS->READ_COST_BYTE_FACTOR); } } return Void(); @@ -91,11 +75,7 @@ ACTOR Future getQuota(Reference db, TransactionTag tag, LimitTy } } -ACTOR Future setQuota(Reference db, - TransactionTag tag, - LimitType limitType, - OpType opType, - double value) { +ACTOR Future setQuota(Reference db, TransactionTag tag, LimitType limitType, double value) { state Reference tr = db->createTransaction(); state Key key = tag.withPrefix(tagQuotaPrefix); loop { @@ -107,21 +87,14 @@ ACTOR Future setQuota(Reference db, if (v.present()) { quota = ThrottleApi::TagQuotaValue::fromValue(v.get()); } - if (limitType == LimitType::TOTAL && opType == OpType::READ) { - quota.totalReadQuota = value; - } else if (limitType == LimitType::TOTAL && opType == OpType::WRITE) { - quota.totalWriteQuota = value; - } else if (limitType == LimitType::RESERVED && opType == OpType::READ) { - quota.reservedReadQuota = value; - } else if (limitType == LimitType::RESERVED && opType == OpType::WRITE) { - quota.reservedWriteQuota = value; + // Internally, costs are stored in terms of pages, but in the API, + // costs are specified in terms of bytes + if (limitType == LimitType::TOTAL) { + 
quota.totalQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1; + } else if (limitType == LimitType::RESERVED) { + quota.reservedQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1; } - ThrottleApi::setTagQuota(tr, - tag, - quota.reservedReadQuota, - quota.totalReadQuota, - quota.reservedWriteQuota, - quota.totalWriteQuota); + ThrottleApi::setTagQuota(tr, tag, quota.reservedQuota, quota.totalQuota); wait(safeThreadFutureToFuture(tr->commit())); return Void(); } catch (Error& e) { @@ -130,8 +103,8 @@ ACTOR Future setQuota(Reference db, } } -constexpr auto usage = - "quota [get [reserved|total] [read|write]|set [reserved|total] [read|write] ]"; +constexpr auto usage = "quota [get [reserved_throughput|total_throughput] | set " + "[reserved_throughput|total_throughput] ]"; bool exitFailure() { fmt::print(usage); @@ -149,25 +122,24 @@ ACTOR Future quotaCommandActor(Reference db, std::vector setProcessClass(Reference db, KeyRef network_addre namespace fdb_cli { const KeyRangeRef processClassSourceSpecialKeyRange = - KeyRangeRef(LiteralStringRef("\xff\xff/configuration/process/class_source/"), - LiteralStringRef("\xff\xff/configuration/process/class_source0")); + KeyRangeRef("\xff\xff/configuration/process/class_source/"_sr, "\xff\xff/configuration/process/class_source0"_sr); const KeyRangeRef processClassTypeSpecialKeyRange = - KeyRangeRef(LiteralStringRef("\xff\xff/configuration/process/class_type/"), - LiteralStringRef("\xff\xff/configuration/process/class_type0")); + KeyRangeRef("\xff\xff/configuration/process/class_type/"_sr, "\xff\xff/configuration/process/class_type0"_sr); ACTOR Future setClassCommandActor(Reference db, std::vector tokens) { if (tokens.size() != 3 && tokens.size() != 1) { diff --git a/fdbcli/SnapshotCommand.actor.cpp b/fdbcli/SnapshotCommand.actor.cpp index 5bc7302f0c..7606101bba 100644 --- a/fdbcli/SnapshotCommand.actor.cpp +++ b/fdbcli/SnapshotCommand.actor.cpp @@ -40,7 +40,7 @@ ACTOR Future snapshotCommandActor(Reference db, std::vector statusCommandActor(Reference db, StatusObject _s = wait(StatusClient::statusFetcher(localDb)); s = _s; } else { - state ThreadFuture> statusValueF = tr->get(LiteralStringRef("\xff\xff/status/json")); + state ThreadFuture> statusValueF = tr->get("\xff\xff/status/json"_sr); Optional statusValue = wait(safeThreadFutureToFuture(statusValueF)); if (!statusValue.present()) { fprintf(stderr, "ERROR: Failed to get status json from the cluster\n"); diff --git a/fdbcli/SuspendCommand.actor.cpp b/fdbcli/SuspendCommand.actor.cpp index 78a7fa1ed9..483ad4e445 100644 --- a/fdbcli/SuspendCommand.actor.cpp +++ b/fdbcli/SuspendCommand.actor.cpp @@ -43,7 +43,7 @@ ACTOR Future suspendCommandActor(Reference db, if (tokens.size() == 1) { // initialize worker interfaces address_interface->clear(); - wait(getWorkerInterfaces(tr, address_interface)); + wait(getWorkerInterfaces(tr, address_interface, true)); if (address_interface->size() == 0) { printf("\nNo addresses can be suspended.\n"); } else if (address_interface->size() == 1) { diff --git a/fdbcli/TenantCommands.actor.cpp b/fdbcli/TenantCommands.actor.cpp index 49ea0c3443..8b1ce93e37 100644 --- a/fdbcli/TenantCommands.actor.cpp +++ b/fdbcli/TenantCommands.actor.cpp @@ -25,6 +25,7 @@ #include "fdbclient/IClientApi.h" #include "fdbclient/Knobs.h" #include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/MetaclusterManagement.actor.h" #include "fdbclient/TenantManagement.actor.h" #include "fdbclient/Schemas.h" @@ -35,166 +36,269 @@ namespace fdb_cli { -const KeyRangeRef 
tenantSpecialKeyRange(LiteralStringRef("\xff\xff/management/tenant/map/"), - LiteralStringRef("\xff\xff/management/tenant/map0")); +const KeyRangeRef tenantMapSpecialKeyRange("\xff\xff/management/tenant/map/"_sr, "\xff\xff/management/tenant/map0"_sr); +const KeyRangeRef tenantConfigSpecialKeyRange("\xff\xff/management/tenant/configure/"_sr, + "\xff\xff/management/tenant/configure0"_sr); +const KeyRangeRef tenantRenameSpecialKeyRange("\xff\xff/management/tenant/rename/"_sr, + "\xff\xff/management/tenant/rename0"_sr); -// createtenant command -ACTOR Future createTenantCommandActor(Reference db, std::vector tokens) { - if (tokens.size() != 2) { - printUsage(tokens[0]); +Optional, Optional>> +parseTenantConfiguration(std::vector const& tokens, int startIndex, bool allowUnset) { + std::map, Optional> configParams; + for (int tokenNum = startIndex; tokenNum < tokens.size(); ++tokenNum) { + Optional value; + + StringRef token = tokens[tokenNum]; + StringRef param; + if (allowUnset && token == "unset"_sr) { + if (++tokenNum == tokens.size()) { + fmt::print(stderr, "ERROR: `unset' specified without a configuration parameter.\n"); + return {}; + } + param = tokens[tokenNum]; + } else { + bool foundEquals; + param = token.eat("=", &foundEquals); + if (!foundEquals) { + fmt::print(stderr, + "ERROR: invalid configuration string `{}'. String must specify a value using `='.\n", + param.toString().c_str()); + return {}; + } + value = token; + } + + if (configParams.count(param)) { + fmt::print( + stderr, "ERROR: configuration parameter `{}' specified more than once.\n", param.toString().c_str()); + return {}; + } + + if (tokencmp(param, "tenant_group")) { + configParams[param] = value; + } else if (tokencmp(param, "assigned_cluster")) { + configParams[param] = value; + } else { + fmt::print(stderr, "ERROR: unrecognized configuration parameter `{}'.\n", param.toString().c_str()); + return {}; + } + } + + return configParams; +} + +Key makeConfigKey(TenantNameRef tenantName, StringRef configName) { + return tenantConfigSpecialKeyRange.begin.withSuffix(Tuple().append(tenantName).append(configName).pack()); +} + +void applyConfigurationToSpecialKeys(Reference tr, + TenantNameRef tenantName, + std::map, Optional> configuration) { + for (auto [configName, value] : configuration) { + if (configName == "assigned_cluster"_sr) { + fmt::print(stderr, "ERROR: assigned_cluster is only valid in metacluster configuration.\n"); + throw invalid_tenant_configuration(); + } + if (value.present()) { + tr->set(makeConfigKey(tenantName, configName), value.get()); + } else { + tr->clear(makeConfigKey(tenantName, configName)); + } + } +} + +// tenant create command +ACTOR Future tenantCreateCommand(Reference db, std::vector tokens) { + if (tokens.size() < 3 || tokens.size() > 5) { + fmt::print("Usage: tenant create [tenant_group=] [assigned_cluster=]\n\n"); + fmt::print("Creates a new tenant in the cluster with the specified name.\n"); + fmt::print("An optional group can be specified that will require this tenant\n"); + fmt::print("to be placed on the same cluster as other tenants in the same group.\n"); + fmt::print("An optional cluster name can be specified that this tenant will be placed in.\n"); return false; } - state Key tenantNameKey = fdb_cli::tenantSpecialKeyRange.begin.withSuffix(tokens[1]); + state Key tenantNameKey = tenantMapSpecialKeyRange.begin.withSuffix(tokens[2]); state Reference tr = db->createTransaction(); state bool doneExistenceCheck = false; + state Optional, Optional>> configuration = + 
parseTenantConfiguration(tokens, 3, false); + + if (!configuration.present()) { + return false; + } + loop { - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { - if (!doneExistenceCheck) { - // Hold the reference to the standalone's memory - state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); - Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); - if (existingTenant.present()) { - throw tenant_already_exists(); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + TenantMapEntry tenantEntry; + for (auto const& [name, value] : configuration.get()) { + tenantEntry.configure(name, value); } - doneExistenceCheck = true; + wait(MetaclusterAPI::createTenant(db, tokens[2], tenantEntry)); + } else { + if (!doneExistenceCheck) { + // Hold the reference to the standalone's memory + state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); + Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); + if (existingTenant.present()) { + throw tenant_already_exists(); + } + doneExistenceCheck = true; + } + + tr->set(tenantNameKey, ValueRef()); + applyConfigurationToSpecialKeys(tr, tokens[2], configuration.get()); + wait(safeThreadFutureToFuture(tr->commit())); } - tr->set(tenantNameKey, ValueRef()); - wait(safeThreadFutureToFuture(tr->commit())); break; } catch (Error& e) { state Error err(e); if (e.code() == error_code_special_keys_api_failure) { - std::string errorMsgStr = wait(fdb_cli::getSpecialKeysFailureErrorMessage(tr)); - fprintf(stderr, "ERROR: %s\n", errorMsgStr.c_str()); + std::string errorMsgStr = wait(getSpecialKeysFailureErrorMessage(tr)); + fmt::print(stderr, "ERROR: {}\n", errorMsgStr.c_str()); return false; } wait(safeThreadFutureToFuture(tr->onError(err))); } } - printf("The tenant `%s' has been created\n", printable(tokens[1]).c_str()); + fmt::print("The tenant `{}' has been created\n", printable(tokens[2]).c_str()); return true; } -CommandFactory createTenantFactory("createtenant", - CommandHelp("createtenant ", - "creates a new tenant in the cluster", - "Creates a new tenant in the cluster with the specified name.")); - -// deletetenant command -ACTOR Future deleteTenantCommandActor(Reference db, std::vector tokens) { - if (tokens.size() != 2) { - printUsage(tokens[0]); +// tenant delete command +ACTOR Future tenantDeleteCommand(Reference db, std::vector tokens) { + if (tokens.size() != 3) { + fmt::print("Usage: tenant delete \n\n"); + fmt::print("Deletes a tenant from the cluster.\n"); + fmt::print("Deletion will be allowed only if the specified tenant contains no data.\n"); return false; } - state Key tenantNameKey = fdb_cli::tenantSpecialKeyRange.begin.withSuffix(tokens[1]); + state Key tenantNameKey = tenantMapSpecialKeyRange.begin.withSuffix(tokens[2]); state Reference tr = db->createTransaction(); state bool doneExistenceCheck = false; loop { - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { - if (!doneExistenceCheck) { - // Hold the reference to the standalone's memory - state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); - Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); - if (!existingTenant.present()) { - throw tenant_not_found(); + 
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + wait(MetaclusterAPI::deleteTenant(db, tokens[2])); + } else { + if (!doneExistenceCheck) { + // Hold the reference to the standalone's memory + state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); + Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); + if (!existingTenant.present()) { + throw tenant_not_found(); + } + doneExistenceCheck = true; } - doneExistenceCheck = true; + + tr->clear(tenantNameKey); + wait(safeThreadFutureToFuture(tr->commit())); } - tr->clear(tenantNameKey); - wait(safeThreadFutureToFuture(tr->commit())); break; } catch (Error& e) { state Error err(e); if (e.code() == error_code_special_keys_api_failure) { - std::string errorMsgStr = wait(fdb_cli::getSpecialKeysFailureErrorMessage(tr)); - fprintf(stderr, "ERROR: %s\n", errorMsgStr.c_str()); + std::string errorMsgStr = wait(getSpecialKeysFailureErrorMessage(tr)); + fmt::print(stderr, "ERROR: {}\n", errorMsgStr.c_str()); return false; } wait(safeThreadFutureToFuture(tr->onError(err))); } } - printf("The tenant `%s' has been deleted\n", printable(tokens[1]).c_str()); + fmt::print("The tenant `{}' has been deleted\n", printable(tokens[2]).c_str()); return true; } -CommandFactory deleteTenantFactory( - "deletetenant", - CommandHelp( - "deletetenant ", - "deletes a tenant from the cluster", - "Deletes a tenant from the cluster. Deletion will be allowed only if the specified tenant contains no data.")); - -// listtenants command -ACTOR Future listTenantsCommandActor(Reference db, std::vector tokens) { - if (tokens.size() > 4) { - printUsage(tokens[0]); +// tenant list command +ACTOR Future tenantListCommand(Reference db, std::vector tokens) { + if (tokens.size() > 5) { + fmt::print("Usage: tenant list [BEGIN] [END] [LIMIT]\n\n"); + fmt::print("Lists the tenants in a cluster.\n"); + fmt::print("Only tenants in the range BEGIN - END will be printed.\n"); + fmt::print("An optional LIMIT can be specified to limit the number of results (default 100).\n"); return false; } - StringRef beginTenant = ""_sr; - StringRef endTenant = "\xff\xff"_sr; + state StringRef beginTenant = ""_sr; + state StringRef endTenant = "\xff\xff"_sr; state int limit = 100; - if (tokens.size() >= 2) { - beginTenant = tokens[1]; - } if (tokens.size() >= 3) { - endTenant = tokens[2]; + beginTenant = tokens[2]; + } + if (tokens.size() >= 4) { + endTenant = tokens[3]; if (endTenant <= beginTenant) { - fprintf(stderr, "ERROR: end must be larger than begin"); + fmt::print(stderr, "ERROR: end must be larger than begin"); return false; } } - if (tokens.size() == 4) { + if (tokens.size() == 5) { int n = 0; - if (sscanf(tokens[3].toString().c_str(), "%d%n", &limit, &n) != 1 || n != tokens[3].size()) { - fprintf(stderr, "ERROR: invalid limit %s\n", tokens[3].toString().c_str()); + if (sscanf(tokens[4].toString().c_str(), "%d%n", &limit, &n) != 1 || n != tokens[4].size() || limit <= 0) { + fmt::print(stderr, "ERROR: invalid limit `{}'\n", tokens[4].toString().c_str()); return false; } } - state Key beginTenantKey = fdb_cli::tenantSpecialKeyRange.begin.withSuffix(beginTenant); - state Key endTenantKey = fdb_cli::tenantSpecialKeyRange.begin.withSuffix(endTenant); + state Key beginTenantKey = tenantMapSpecialKeyRange.begin.withSuffix(beginTenant); + state Key endTenantKey 
= tenantMapSpecialKeyRange.begin.withSuffix(endTenant); state Reference tr = db->createTransaction(); loop { try { - // Hold the reference to the standalone's memory - state ThreadFuture kvsFuture = - tr->getRange(firstGreaterOrEqual(beginTenantKey), firstGreaterOrEqual(endTenantKey), limit); - RangeResult tenants = wait(safeThreadFutureToFuture(kvsFuture)); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + state std::vector tenantNames; + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + std::vector> tenants = + wait(MetaclusterAPI::listTenantsTransaction(tr, beginTenant, endTenant, limit)); + for (auto tenant : tenants) { + tenantNames.push_back(tenant.first); + } + } else { + // Hold the reference to the standalone's memory + state ThreadFuture kvsFuture = + tr->getRange(firstGreaterOrEqual(beginTenantKey), firstGreaterOrEqual(endTenantKey), limit); + RangeResult tenants = wait(safeThreadFutureToFuture(kvsFuture)); + for (auto tenant : tenants) { + tenantNames.push_back(tenant.key.removePrefix(tenantMapSpecialKeyRange.begin)); + } + } - if (tenants.empty()) { - if (tokens.size() == 1) { - printf("The cluster has no tenants\n"); + if (tenantNames.empty()) { + if (tokens.size() == 2) { + fmt::print("The cluster has no tenants\n"); } else { - printf("The cluster has no tenants in the specified range\n"); + fmt::print("The cluster has no tenants in the specified range\n"); } } int index = 0; - for (auto tenant : tenants) { - printf(" %d. %s\n", - ++index, - printable(tenant.key.removePrefix(fdb_cli::tenantSpecialKeyRange.begin)).c_str()); + for (auto tenantName : tenantNames) { + fmt::print(" {}. {}\n", ++index, printable(tenantName).c_str()); } return true; } catch (Error& e) { state Error err(e); if (e.code() == error_code_special_keys_api_failure) { - std::string errorMsgStr = wait(fdb_cli::getSpecialKeysFailureErrorMessage(tr)); - fprintf(stderr, "ERROR: %s\n", errorMsgStr.c_str()); + std::string errorMsgStr = wait(getSpecialKeysFailureErrorMessage(tr)); + fmt::print(stderr, "ERROR: {}\n", errorMsgStr.c_str()); return false; } wait(safeThreadFutureToFuture(tr->onError(err))); @@ -202,59 +306,73 @@ ACTOR Future listTenantsCommandActor(Reference db, std::vector< } } -CommandFactory listTenantsFactory( - "listtenants", - CommandHelp("listtenants [BEGIN] [END] [LIMIT]", - "print a list of tenants in the cluster", - "Print a list of tenants in the cluster. Only tenants in the range [BEGIN] - [END] will be printed. 
" - "The number of tenants to print can be specified using the [LIMIT] parameter, which defaults to 100.")); - -// gettenant command -ACTOR Future getTenantCommandActor(Reference db, std::vector tokens, int apiVersion) { - if (tokens.size() < 2 || tokens.size() > 3 || (tokens.size() == 3 && tokens[2] != "JSON"_sr)) { - printUsage(tokens[0]); +// tenant get command +ACTOR Future tenantGetCommand(Reference db, std::vector tokens) { + if (tokens.size() < 3 || tokens.size() > 4 || (tokens.size() == 4 && tokens[3] != "JSON"_sr)) { + fmt::print("Usage: tenant get [JSON]\n\n"); + fmt::print("Prints metadata associated with the given tenant.\n"); + fmt::print("If JSON is specified, then the output will be in JSON format.\n"); return false; } - state bool useJson = tokens.size() == 3; - state Key tenantNameKey = fdb_cli::tenantSpecialKeyRange.begin.withSuffix(tokens[1]); + state bool useJson = tokens.size() == 4; + state Key tenantNameKey = tenantMapSpecialKeyRange.begin.withSuffix(tokens[2]); state Reference tr = db->createTransaction(); loop { try { - // Hold the reference to the standalone's memory - state ThreadFuture> tenantFuture = tr->get(tenantNameKey); - Optional tenant = wait(safeThreadFutureToFuture(tenantFuture)); - if (!tenant.present()) { - throw tenant_not_found(); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + state std::string tenantJson; + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + TenantMapEntry entry = wait(MetaclusterAPI::getTenantTransaction(tr, tokens[2])); + tenantJson = entry.toJson(); + } else { + // Hold the reference to the standalone's memory + state ThreadFuture> tenantFuture = tr->get(tenantNameKey); + Optional tenant = wait(safeThreadFutureToFuture(tenantFuture)); + if (!tenant.present()) { + throw tenant_not_found(); + } + tenantJson = tenant.get().toString(); } json_spirit::mValue jsonObject; - json_spirit::read_string(tenant.get().toString(), jsonObject); + json_spirit::read_string(tenantJson, jsonObject); if (useJson) { json_spirit::mObject resultObj; resultObj["tenant"] = jsonObject; resultObj["type"] = "success"; - printf("%s\n", - json_spirit::write_string(json_spirit::mValue(resultObj), json_spirit::pretty_print).c_str()); + fmt::print( + "{}\n", + json_spirit::write_string(json_spirit::mValue(resultObj), json_spirit::pretty_print).c_str()); } else { JSONDoc doc(jsonObject); int64_t id; std::string prefix; + std::string tenantState; + std::string tenantGroup; + std::string assignedCluster; doc.get("id", id); - if (apiVersion >= 720) { - doc.get("prefix.printable", prefix); - } else { - doc.get("prefix", prefix); + doc.get("prefix.printable", prefix); + + doc.get("tenant_state", tenantState); + bool hasTenantGroup = doc.tryGet("tenant_group.printable", tenantGroup); + bool hasAssignedCluster = doc.tryGet("assigned_cluster", assignedCluster); + + fmt::print(" id: {}\n", id); + fmt::print(" prefix: {}\n", printable(prefix).c_str()); + fmt::print(" tenant state: {}\n", printable(tenantState).c_str()); + if (hasTenantGroup) { + fmt::print(" tenant group: {}\n", tenantGroup.c_str()); + } + if (hasAssignedCluster) { + fmt::print(" assigned cluster: {}\n", printable(assignedCluster).c_str()); } - - printf(" id: %" PRId64 "\n", id); - printf(" prefix: %s\n", prefix.c_str()); } - return true; } catch (Error& e) { try { @@ -274,11 +392,11 @@ ACTOR Future getTenantCommandActor(Reference db, std::vector getTenantCommandActor(Reference db, std::vector [JSON]", - "prints the 
metadata for a tenant", - "Prints the metadata for a tenant. If JSON is specified, then the output will be in JSON format.")); - -// renametenant command -ACTOR Future renameTenantCommandActor(Reference db, std::vector tokens) { - if (tokens.size() != 3) { - printUsage(tokens[0]); +// tenant configure command +ACTOR Future tenantConfigureCommand(Reference db, std::vector tokens) { + if (tokens.size() < 4) { + fmt::print("Usage: tenant configure <[unset] tenant_group[=]> ...\n\n"); + fmt::print("Updates the configuration for a tenant.\n"); + fmt::print("Use `tenant_group=' to change the tenant group that a\n"); + fmt::print("tenant is assigned to or `unset tenant_group' to remove a tenant from\n"); + fmt::print("its tenant group."); return false; } - wait(safeThreadFutureToFuture(TenantAPI::renameTenant(db, tokens[1], tokens[2]))); - printf("The tenant `%s' has been renamed to `%s'\n", printable(tokens[1]).c_str(), printable(tokens[2]).c_str()); + state Optional, Optional>> configuration = + parseTenantConfiguration(tokens, 3, true); + + if (!configuration.present()) { + return false; + } + + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + TenantMapEntry tenantEntry; + wait(MetaclusterAPI::configureTenant(db, tokens[2], configuration.get())); + } else { + applyConfigurationToSpecialKeys(tr, tokens[2], configuration.get()); + wait(safeThreadFutureToFuture(tr->commit())); + } + break; + } catch (Error& e) { + state Error err(e); + if (e.code() == error_code_special_keys_api_failure) { + std::string errorMsgStr = wait(getSpecialKeysFailureErrorMessage(tr)); + fmt::print(stderr, "ERROR: {}\n", errorMsgStr.c_str()); + return false; + } + wait(safeThreadFutureToFuture(tr->onError(err))); + } + } + + fmt::print("The configuration for tenant `{}' has been updated\n", printable(tokens[2]).c_str()); return true; } -CommandFactory renameTenantFactory( - "renametenant", - CommandHelp( - "renametenant ", - "renames a tenant in the cluster.", - "Renames a tenant in the cluster. The old name must exist and the new name must not exist in the cluster.")); +// Helper function to extract tenant ID from json metadata string +int64_t getTenantId(Value metadata) { + json_spirit::mValue jsonObject; + json_spirit::read_string(metadata.toString(), jsonObject); + JSONDoc doc(jsonObject); + int64_t id; + doc.get("id", id); + return id; +} + +// tenant rename command +ACTOR Future tenantRenameCommand(Reference db, std::vector tokens) { + if (tokens.size() != 4) { + fmt::print("Usage: tenant rename \n\n"); + fmt::print("Renames a tenant in the cluster. 
The old name must exist and the new\n"); + fmt::print("name must not exist in the cluster.\n"); + return false; + } + state Reference tr = db->createTransaction(); + state Key tenantRenameKey = tenantRenameSpecialKeyRange.begin.withSuffix(tokens[2]); + state Key tenantOldNameKey = tenantMapSpecialKeyRange.begin.withSuffix(tokens[2]); + state Key tenantNewNameKey = tenantMapSpecialKeyRange.begin.withSuffix(tokens[3]); + state bool firstTry = true; + state int64_t id = -1; + loop { + try { + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + wait(MetaclusterAPI::renameTenant(db, tokens[2], tokens[3])); + } else { + // Hold the reference to the standalone's memory + state ThreadFuture> oldEntryFuture = tr->get(tenantOldNameKey); + state ThreadFuture> newEntryFuture = tr->get(tenantNewNameKey); + state Optional oldEntry = wait(safeThreadFutureToFuture(oldEntryFuture)); + state Optional newEntry = wait(safeThreadFutureToFuture(newEntryFuture)); + if (firstTry) { + if (!oldEntry.present()) { + throw tenant_not_found(); + } + if (newEntry.present()) { + throw tenant_already_exists(); + } + // Store the id we see when first reading this key + id = getTenantId(oldEntry.get()); + + firstTry = false; + } else { + // If we got commit_unknown_result, the rename may have already occurred. + if (newEntry.present()) { + int64_t checkId = getTenantId(newEntry.get()); + if (id == checkId) { + ASSERT(!oldEntry.present() || getTenantId(oldEntry.get()) != id); + return true; + } + // If the new entry is present but does not match, then + // the rename should fail, so we throw an error. + throw tenant_already_exists(); + } + if (!oldEntry.present()) { + throw tenant_not_found(); + } + int64_t checkId = getTenantId(oldEntry.get()); + // If the id has changed since we made our first attempt, + // then it's possible we've already moved the tenant. Don't move it again. 
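+ // At this point the tenant stored under the old name is not the one we originally
+ // read, so we give up with tenant_not_found rather than renaming an unrelated tenant.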
+ if (id != checkId) { + throw tenant_not_found(); + } + } + tr->set(tenantRenameKey, tokens[3]); + wait(safeThreadFutureToFuture(tr->commit())); + } + break; + } catch (Error& e) { + state Error err(e); + if (e.code() == error_code_special_keys_api_failure) { + std::string errorMsgStr = wait(getSpecialKeysFailureErrorMessage(tr)); + fmt::print(stderr, "ERROR: {}\n", errorMsgStr.c_str()); + return false; + } + wait(safeThreadFutureToFuture(tr->onError(err))); + } + } + + fmt::print( + "The tenant `{}' has been renamed to `{}'\n", printable(tokens[2]).c_str(), printable(tokens[3]).c_str()); + return true; +} + +// tenant command +Future tenantCommand(Reference db, std::vector tokens) { + if (tokens.size() == 1) { + printUsage(tokens[0]); + return true; + } else if (tokencmp(tokens[1], "create")) { + return tenantCreateCommand(db, tokens); + } else if (tokencmp(tokens[1], "delete")) { + return tenantDeleteCommand(db, tokens); + } else if (tokencmp(tokens[1], "list")) { + return tenantListCommand(db, tokens); + } else if (tokencmp(tokens[1], "get")) { + return tenantGetCommand(db, tokens); + } else if (tokencmp(tokens[1], "configure")) { + return tenantConfigureCommand(db, tokens); + } else if (tokencmp(tokens[1], "rename")) { + return tenantRenameCommand(db, tokens); + } else { + printUsage(tokens[0]); + return true; + } +} + +Future tenantCommandForwarder(Reference db, std::vector tokens) { + ASSERT(!tokens.empty() && (tokens[0].endsWith("tenant"_sr) || tokens[0].endsWith("tenants"_sr))); + std::vector forwardedTokens = { "tenant"_sr, + tokens[0].endsWith("tenant"_sr) ? tokens[0].removeSuffix("tenant"_sr) + : tokens[0].removeSuffix("tenants"_sr) }; + for (int i = 1; i < tokens.size(); ++i) { + forwardedTokens.push_back(tokens[i]); + } + + return tenantCommand(db, forwardedTokens); +} // namespace fdb_cli + +void tenantGenerator(const char* text, + const char* line, + std::vector& lc, + std::vector const& tokens) { + if (tokens.size() == 1) { + const char* opts[] = { "create", "delete", "list", "get", "configure", "rename", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() == 3 && tokencmp(tokens[1], "create")) { + const char* opts[] = { "tenant_group=", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() == 3 && tokencmp(tokens[1], "get")) { + const char* opts[] = { "JSON", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokencmp(tokens[1], "configure")) { + if (tokens.size() == 3) { + const char* opts[] = { "tenant_group=", "unset", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() == 4 && tokencmp(tokens[3], "unset")) { + const char* opts[] = { "tenant_group", nullptr }; + arrayGenerator(text, line, opts, lc); + } + } +} + +std::vector tenantHintGenerator(std::vector const& tokens, bool inArgument) { + if (tokens.size() == 1) { + return { "", "[ARGS]" }; + } else if (tokencmp(tokens[1], "create") && tokens.size() < 4) { + static std::vector opts = { " [tenant_group=]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "delete") && tokens.size() < 3) { + static std::vector opts = { "" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "list") && tokens.size() < 5) { + static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "get") && tokens.size() < 4) { + static std::vector opts = { 
"", "[JSON]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "configure")) { + if (tokens.size() < 4) { + static std::vector opts = { "", "<[unset] tenant_group[=]>" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokens.size() == 4 && tokencmp(tokens[3], "unset")) { + static std::vector opts = { "]>" }; + return std::vector(opts.begin() + tokens.size() - 4, opts.end()); + } + return {}; + } else if (tokencmp(tokens[1], "rename") && tokens.size() < 4) { + static std::vector opts = { "", "" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else { + return {}; + } +} + +CommandFactory tenantRegisterFactory("tenant", + CommandHelp("tenant [ARGS]", + "view and manage tenants in a cluster or metacluster", + "`create' and `delete' add and remove tenants from the cluster.\n" + "`list' prints a list of tenants in the cluster.\n" + "`get' prints the metadata for a particular tenant.\n" + "`configure' modifies the configuration for a tenant.\n" + "`rename' changes the name of a tenant.\n"), + &tenantGenerator, + &tenantHintGenerator); + +// Generate hidden commands for the old versions of the tenant commands +CommandFactory createTenantFactory("createtenant"); +CommandFactory deleteTenantFactory("deletetenant"); +CommandFactory listTenantsFactory("listtenants"); +CommandFactory getTenantFactory("gettenant"); +CommandFactory configureTenantFactory("configuretenant"); +CommandFactory renameTenantFactory("renametenant"); + } // namespace fdb_cli diff --git a/fdbcli/TenantGroupCommands.actor.cpp b/fdbcli/TenantGroupCommands.actor.cpp new file mode 100644 index 0000000000..6a89360aeb --- /dev/null +++ b/fdbcli/TenantGroupCommands.actor.cpp @@ -0,0 +1,240 @@ +/* + * TenantGroupCommands.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbcli/fdbcli.actor.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/GenericManagementAPI.actor.h" +#include "fdbclient/IClientApi.h" +#include "fdbclient/Knobs.h" +#include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/MetaclusterManagement.actor.h" +#include "fdbclient/TenantManagement.actor.h" +#include "fdbclient/Schemas.h" + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/ThreadHelper.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +namespace fdb_cli { + +// tenantgroup list command +ACTOR Future tenantGroupListCommand(Reference db, std::vector tokens) { + if (tokens.size() > 5) { + fmt::print("Usage: tenantgroup list [BEGIN] [END] [LIMIT]\n\n"); + fmt::print("Lists the tenant groups in a cluster.\n"); + fmt::print("Only tenant groups in the range BEGIN - END will be printed.\n"); + fmt::print("An optional LIMIT can be specified to limit the number of results (default 100).\n"); + return false; + } + + state StringRef beginTenantGroup = ""_sr; + state StringRef endTenantGroup = "\xff\xff"_sr; + state int limit = 100; + + if (tokens.size() >= 3) { + beginTenantGroup = tokens[2]; + } + if (tokens.size() >= 4) { + endTenantGroup = tokens[3]; + if (endTenantGroup <= beginTenantGroup) { + fmt::print(stderr, "ERROR: end must be larger than begin"); + return false; + } + } + if (tokens.size() == 5) { + int n = 0; + if (sscanf(tokens[4].toString().c_str(), "%d%n", &limit, &n) != 1 || n != tokens[4].size() || limit <= 0) { + fmt::print(stderr, "ERROR: invalid limit `{}'\n", tokens[4].toString()); + return false; + } + } + + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + state std::vector tenantGroupNames; + state std::vector> tenantGroups; + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + wait(store(tenantGroups, + MetaclusterAPI::listTenantGroupsTransaction(tr, beginTenantGroup, endTenantGroup, limit))); + } else { + wait(store(tenantGroups, + TenantAPI::listTenantGroupsTransaction(tr, beginTenantGroup, endTenantGroup, limit))); + } + + if (tenantGroups.empty()) { + if (tokens.size() == 2) { + fmt::print("The cluster has no tenant groups\n"); + } else { + fmt::print("The cluster has no tenant groups in the specified range\n"); + } + } + + int index = 0; + for (auto tenantGroup : tenantGroups) { + fmt::print(" {}. 
{}\n", ++index, printable(tenantGroup.first)); + } + + return true; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +// tenantgroup get command +ACTOR Future tenantGroupGetCommand(Reference db, std::vector tokens) { + if (tokens.size() > 4 || (tokens.size() == 4 && tokens[3] != "JSON"_sr)) { + fmt::print("Usage: tenantgroup get [JSON]\n\n"); + fmt::print("Prints metadata associated with the given tenant group.\n"); + fmt::print("If JSON is specified, then the output will be in JSON format.\n"); + return false; + } + + state bool useJson = tokens.size() == 4; + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + state std::string tenantJson; + state Optional entry; + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + wait(store(entry, MetaclusterAPI::tryGetTenantGroupTransaction(tr, tokens[2]))); + } else { + wait(store(entry, TenantAPI::tryGetTenantGroupTransaction(tr, tokens[2]))); + Optional metaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + + // We don't store assigned clusters in the tenant group entry on data clusters, so we can instead + // populate it from the metacluster registration + if (entry.present() && metaclusterRegistration.present() && + metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA && + !entry.get().assignedCluster.present()) { + entry.get().assignedCluster = metaclusterRegistration.get().name; + } + } + + if (!entry.present()) { + throw tenant_not_found(); + } + + if (useJson) { + json_spirit::mObject resultObj; + resultObj["tenant_group"] = entry.get().toJson(); + resultObj["type"] = "success"; + fmt::print("{}\n", + json_spirit::write_string(json_spirit::mValue(resultObj), json_spirit::pretty_print)); + } else { + if (entry.get().assignedCluster.present()) { + fmt::print(" assigned cluster: {}\n", printable(entry.get().assignedCluster)); + } else { + // This is a placeholder output for when a tenant group is read in a non-metacluster, where + // it currently has no metadata. When metadata is eventually added, we can print that instead. 
+ fmt::print("The tenant group is present in the cluster\n"); + } + } + return true; + } catch (Error& e) { + try { + wait(safeThreadFutureToFuture(tr->onError(e))); + } catch (Error& finalErr) { + state std::string errorStr; + if (finalErr.code() == error_code_tenant_not_found) { + errorStr = "tenant group not found"; + } else if (useJson) { + errorStr = finalErr.what(); + } else { + throw finalErr; + } + + if (useJson) { + json_spirit::mObject resultObj; + resultObj["type"] = "error"; + resultObj["error"] = errorStr; + fmt::print("{}\n", + json_spirit::write_string(json_spirit::mValue(resultObj), json_spirit::pretty_print)); + } else { + fmt::print(stderr, "ERROR: {}\n", errorStr); + } + + return false; + } + } + } +} + +// tenantgroup command +Future tenantGroupCommand(Reference db, std::vector tokens) { + if (tokens.size() == 1) { + printUsage(tokens[0]); + return true; + } else if (tokencmp(tokens[1], "list")) { + return tenantGroupListCommand(db, tokens); + } else if (tokencmp(tokens[1], "get")) { + return tenantGroupGetCommand(db, tokens); + } else { + printUsage(tokens[0]); + return true; + } +} + +void tenantGroupGenerator(const char* text, + const char* line, + std::vector& lc, + std::vector const& tokens) { + if (tokens.size() == 1) { + const char* opts[] = { "list", "get", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() == 3 && tokencmp(tokens[1], "get")) { + const char* opts[] = { "JSON", nullptr }; + arrayGenerator(text, line, opts, lc); + } +} + +std::vector tenantGroupHintGenerator(std::vector const& tokens, bool inArgument) { + if (tokens.size() == 1) { + return { "", "[ARGS]" }; + } else if (tokencmp(tokens[1], "list") && tokens.size() < 5) { + static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "get") && tokens.size() < 4) { + static std::vector opts = { "", "[JSON]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else { + return {}; + } +} + +CommandFactory tenantGroupRegisterFactory("tenantgroup", + CommandHelp("tenantgroup [ARGS]", + "view tenant group information", + "`list' prints a list of tenant groups in the cluster.\n" + "`get' prints the metadata for a particular tenant group.\n"), + &tenantGroupGenerator, + &tenantGroupHintGenerator); + +} // namespace fdb_cli diff --git a/fdbcli/ThrottleCommand.actor.cpp b/fdbcli/ThrottleCommand.actor.cpp index abff0e0475..057a83c78f 100644 --- a/fdbcli/ThrottleCommand.actor.cpp +++ b/fdbcli/ThrottleCommand.actor.cpp @@ -163,11 +163,11 @@ ACTOR Future throttleCommandActor(Reference db, std::vector tssQuarantine(Reference db, bool enable, UID tssId } if (enable) { - tr->set(tssQuarantineKeyFor(tssId), LiteralStringRef("")); + tr->set(tssQuarantineKeyFor(tssId), ""_sr); // remove server from TSS mapping when quarantine is enabled tssMapDB.erase(tr, ssi.tssPairID.get()); } else { @@ -112,19 +112,19 @@ namespace fdb_cli { ACTOR Future tssqCommandActor(Reference db, std::vector tokens) { if (tokens.size() == 2) { - if (tokens[1] != LiteralStringRef("list")) { + if (tokens[1] != "list"_sr) { printUsage(tokens[0]); return false; } else { wait(tssQuarantineList(db)); } } else if (tokens.size() == 3) { - if ((tokens[1] != LiteralStringRef("start") && tokens[1] != LiteralStringRef("stop")) || - (tokens[2].size() != 32) || !std::all_of(tokens[2].begin(), tokens[2].end(), &isxdigit)) { + if ((tokens[1] != "start"_sr && tokens[1] != "stop"_sr) || (tokens[2].size() != 32) || 
+ !std::all_of(tokens[2].begin(), tokens[2].end(), &isxdigit)) { printUsage(tokens[0]); return false; } else { - bool enable = tokens[1] == LiteralStringRef("start"); + bool enable = tokens[1] == "start"_sr; UID tssId = UID::fromString(tokens[2].toString()); bool success = wait(tssQuarantine(db, enable, tssId)); return success; diff --git a/fdbcli/Util.actor.cpp b/fdbcli/Util.actor.cpp index d40a5dcaeb..aed1133047 100644 --- a/fdbcli/Util.actor.cpp +++ b/fdbcli/Util.actor.cpp @@ -62,56 +62,49 @@ ACTOR Future getSpecialKeysFailureErrorMessage(Reference verifyAndAddInterface(std::map>* address_interface, - Reference connectLock, - KeyValue kv) { - wait(connectLock->take()); - state FlowLock::Releaser releaser(*connectLock); - state ClientWorkerInterface workerInterf; - try { - // the interface is back-ward compatible, thus if parsing failed, it needs to upgrade cli version - workerInterf = BinaryReader::fromStringRef(kv.value, IncludeVersion()); - } catch (Error& e) { - fprintf(stderr, "Error: %s; CLI version is too old, please update to use a newer version\n", e.what()); - return Void(); - } - state ClientLeaderRegInterface leaderInterf(workerInterf.address()); - choose { - when(Optional rep = - wait(brokenPromiseToNever(leaderInterf.getLeader.getReply(GetLeaderRequest())))) { - StringRef ip_port = - (kv.key.endsWith(LiteralStringRef(":tls")) ? kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key) - .removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/")); - (*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf); - - if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { - Key full_ip_port2 = - StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); - StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls")) - ? full_ip_port2.removeSuffix(LiteralStringRef(":tls")) - : full_ip_port2; - (*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf); - } +void addInterfacesFromKVs(RangeResult& kvs, + std::map>* address_interface) { + for (const auto& kv : kvs) { + ClientWorkerInterface workerInterf; + try { + // the interface is back-ward compatible, thus if parsing failed, it needs to upgrade cli version + workerInterf = BinaryReader::fromStringRef(kv.value, IncludeVersion()); + } catch (Error& e) { + fprintf(stderr, "Error: %s; CLI version is too old, please update to use a newer version\n", e.what()); + return; + } + ClientLeaderRegInterface leaderInterf(workerInterf.address()); + StringRef ip_port = (kv.key.endsWith(":tls"_sr) ? kv.key.removeSuffix(":tls"_sr) : kv.key) + .removePrefix("\xff\xff/worker_interfaces/"_sr); + (*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf); + + if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { + Key full_ip_port2 = + StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); + StringRef ip_port2 = + full_ip_port2.endsWith(":tls"_sr) ? 
full_ip_port2.removeSuffix(":tls"_sr) : full_ip_port2; + (*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf); } - when(wait(delay(CLIENT_KNOBS->CLI_CONNECT_TIMEOUT))) {} } - return Void(); } ACTOR Future getWorkerInterfaces(Reference tr, - std::map>* address_interface) { + std::map>* address_interface, + bool verify) { + if (verify) { + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->set(workerInterfacesVerifyOptionSpecialKey, ValueRef()); + } // Hold the reference to the standalone's memory state ThreadFuture kvsFuture = tr->getRange( - KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0")), - CLIENT_KNOBS->TOO_MANY); - RangeResult kvs = wait(safeThreadFutureToFuture(kvsFuture)); + KeyRangeRef("\xff\xff/worker_interfaces/"_sr, "\xff\xff/worker_interfaces0"_sr), CLIENT_KNOBS->TOO_MANY); + state RangeResult kvs = wait(safeThreadFutureToFuture(kvsFuture)); ASSERT(!kvs.more); - auto connectLock = makeReference(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM); - std::vector> addInterfs; - for (auto it : kvs) { - addInterfs.push_back(verifyAndAddInterface(address_interface, connectLock, it)); + if (verify) { + // remove the option if set + tr->clear(workerInterfacesVerifyOptionSpecialKey); } - wait(waitForAll(addInterfs)); + addInterfacesFromKVs(kvs, address_interface); return Void(); } diff --git a/fdbcli/VersionEpochCommand.actor.cpp b/fdbcli/VersionEpochCommand.actor.cpp index 7d073e590d..a9dcd7e198 100644 --- a/fdbcli/VersionEpochCommand.actor.cpp +++ b/fdbcli/VersionEpochCommand.actor.cpp @@ -32,7 +32,7 @@ namespace fdb_cli { -const KeyRef versionEpochSpecialKey = LiteralStringRef("\xff\xff/management/version_epoch"); +const KeyRef versionEpochSpecialKey = "\xff\xff/management/version_epoch"_sr; struct VersionInfo { int64_t version; diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index ba01fa3c22..a5c2e2e75a 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -44,11 +44,13 @@ #include "fdbclient/ThreadSafeTransaction.h" #include "flow/flow.h" +#include "flow/ApiVersion.h" #include "flow/ArgParseUtil.h" #include "flow/DeterministicRandom.h" #include "flow/FastRef.h" #include "flow/Platform.h" #include "flow/SystemMonitor.h" +#include "flow/CodeProbe.h" #include "flow/TLSConfig.actor.h" #include "flow/ThreadHelper.actor.h" @@ -72,7 +74,6 @@ #include "flow/actorcompiler.h" // This must be the last #include. -#define FDB_API_VERSION 720 /* * While we could just use the MultiVersionApi instance directly, this #define allows us to swap in any other IClientApi * instance (e.g. 
from ThreadSafeApi) @@ -102,6 +103,7 @@ enum { OPT_DEBUG_TLS, OPT_API_VERSION, OPT_MEMORY, + OPT_USE_FUTURE_PROTOCOL_VERSION }; CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP }, @@ -126,6 +128,7 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP }, { OPT_DEBUG_TLS, "--debug-tls", SO_NONE }, { OPT_API_VERSION, "--api-version", SO_REQ_SEP }, { OPT_MEMORY, "--memory", SO_REQ_SEP }, + { OPT_USE_FUTURE_PROTOCOL_VERSION, "--use-future-protocol-version", SO_NONE }, TLS_OPTION_FLAGS, SO_END_OF_OPTIONS }; @@ -474,6 +477,9 @@ static void printProgramUsage(const char* name) { " Useful in reporting and diagnosing TLS issues.\n" " --build-flags Print build information and exit.\n" " --memory Resident memory limit of the CLI (defaults to 8GiB).\n" + " --use-future-protocol-version\n" + " Use the simulated future protocol version to connect to the cluster.\n" + " This option can be used testing purposes only!\n" " -v, --version Print FoundationDB CLI version information and exit.\n" " -h, --help Display this help and exit.\n"); } @@ -531,10 +537,10 @@ void initHelp() { CommandHelp("getversion", "Fetch the current read version", "Displays the current read version of the database or currently running transaction."); - helpMap["quota"] = - CommandHelp("quota", - "quota [get [reserved|total] [read|write]|set [reserved|total] [read|write] ]", - "Get or modify the throughput quota for the specified tag."); + helpMap["quota"] = CommandHelp("quota", + "quota [get [reserved_throughput|total_throughput] | set " + "[reserved_throughput|total_throughput] ]", + "Get or modify the throughput quota for the specified tag."); helpMap["reset"] = CommandHelp("reset", "reset the current transaction", @@ -577,7 +583,7 @@ void initHelp() { void printVersion() { printf("FoundationDB CLI " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n"); printf("source version %s\n", getSourceVersion()); - printf("protocol %" PRIx64 "\n", currentProtocolVersion.version()); + printf("protocol %" PRIx64 "\n", currentProtocolVersion().version()); } void printBuildInformation() { @@ -648,7 +654,7 @@ ACTOR Future checkStatus(Future f, StatusObject _s = wait(StatusClient::statusFetcher(localDb)); s = _s; } else { - state ThreadFuture> statusValueF = tr->get(LiteralStringRef("\xff\xff/status/json")); + state ThreadFuture> statusValueF = tr->get("\xff\xff/status/json"_sr); Optional statusValue = wait(safeThreadFutureToFuture(statusValueF)); if (!statusValue.present()) { fprintf(stderr, "ERROR: Failed to get status json from the cluster\n"); @@ -692,7 +698,7 @@ ACTOR Future createSnapshot(Database db, std::vector tokens) { for (int i = 1; i < tokens.size(); i++) { snapCmd = snapCmd.withSuffix(tokens[i]); if (i != tokens.size() - 1) { - snapCmd = snapCmd.withSuffix(LiteralStringRef(" ")); + snapCmd = snapCmd.withSuffix(" "_sr); } } try { @@ -871,6 +877,7 @@ struct CLIOptions { Optional exec; bool initialStatusCheck = true; bool cliHints = true; + bool useFutureProtocolVersion = false; bool debugTLS = false; std::string tlsCertPath; std::string tlsKeyPath; @@ -882,7 +889,7 @@ struct CLIOptions { std::vector> knobs; // api version, using the latest version by default - int apiVersion = FDB_API_VERSION; + int apiVersion = ApiVersion::LATEST_VERSION; CLIOptions(int argc, char* argv[]) { program_name = argv[0]; @@ -931,12 +938,12 @@ struct CLIOptions { if (*endptr != '\0') { fprintf(stderr, "ERROR: invalid client version %s\n", args.OptionArg()); return 1; - } else if (apiVersion < 700 || apiVersion > 
FDB_API_VERSION) { + } else if (apiVersion < 700 || apiVersion > ApiVersion::LATEST_VERSION) { // multi-version fdbcli only available after 7.0 fprintf(stderr, "ERROR: api version %s is not supported. (Min: 700, Max: %d)\n", args.OptionArg(), - FDB_API_VERSION); + ApiVersion::LATEST_VERSION); return 1; } break; @@ -972,6 +979,10 @@ struct CLIOptions { break; case OPT_NO_HINTS: cliHints = false; + break; + case OPT_USE_FUTURE_PROTOCOL_VERSION: + useFutureProtocolVersion = true; + break; // TLS Options case TLSConfig::OPT_TLS_PLUGIN: @@ -1039,37 +1050,7 @@ Future stopNetworkAfter(Future what) { } } -ACTOR Future addInterface(std::map>* address_interface, - Reference connectLock, - KeyValue kv) { - wait(connectLock->take()); - state FlowLock::Releaser releaser(*connectLock); - state ClientWorkerInterface workerInterf = - BinaryReader::fromStringRef(kv.value, IncludeVersion()); - state ClientLeaderRegInterface leaderInterf(workerInterf.address()); - choose { - when(Optional rep = - wait(brokenPromiseToNever(leaderInterf.getLeader.getReply(GetLeaderRequest())))) { - StringRef ip_port = - (kv.key.endsWith(LiteralStringRef(":tls")) ? kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key) - .removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/")); - (*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf); - - if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { - Key full_ip_port2 = - StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); - StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls")) - ? full_ip_port2.removeSuffix(LiteralStringRef(":tls")) - : full_ip_port2; - (*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf); - } - } - when(wait(delay(CLIENT_KNOBS->CLI_CONNECT_TIMEOUT))) {} - } - return Void(); -} - -ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { +ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise, Reference ccf) { state LineNoise& linenoise = *plinenoise; state bool intrans = false; @@ -1094,20 +1075,6 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { state FdbOptions* options = &globalOptions; - state Reference ccf; - - state std::pair resolvedClusterFile = - ClusterConnectionFile::lookupClusterFileName(opt.clusterFile); - try { - ccf = makeReference(resolvedClusterFile.first); - } catch (Error& e) { - if (e.code() == error_code_operation_cancelled) { - throw; - } - fprintf(stderr, "%s\n", ClusterConnectionFile::getErrorString(resolvedClusterFile, e).c_str()); - return 1; - } - // Ordinarily, this is done when the network is run. However, network thread should be set before TraceEvents are // logged. This thread will eventually run the network, so call it now. 
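+ // (The cluster connection file is now resolved in main() and passed in as `ccf',
+ // so TLS settings have already been checked against the coordinator addresses
+ // before we get here.)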
TraceEvent::setNetworkThread(); @@ -1361,13 +1328,10 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } if (tokencmp(tokens[0], "fileconfigure")) { - if (tokens.size() == 2 || (tokens.size() == 3 && (tokens[1] == LiteralStringRef("new") || - tokens[1] == LiteralStringRef("FORCE")))) { - bool _result = - wait(makeInterruptable(fileConfigureCommandActor(db, - tokens.back().toString(), - tokens[1] == LiteralStringRef("new"), - tokens[1] == LiteralStringRef("FORCE")))); + if (tokens.size() == 2 || + (tokens.size() == 3 && (tokens[1] == "new"_sr || tokens[1] == "FORCE"_sr))) { + bool _result = wait(makeInterruptable(fileConfigureCommandActor( + db, tokens.back().toString(), tokens[1] == "new"_sr, tokens[1] == "FORCE"_sr))); if (!_result) is_error = true; } else { @@ -1426,6 +1390,13 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { continue; } + if (tokencmp(tokens[0], "blobkey")) { + bool _result = wait(makeInterruptable(blobKeyCommandActor(localDb, tenantEntry, tokens))); + if (!_result) + is_error = true; + continue; + } + if (tokencmp(tokens[0], "unlock")) { if ((tokens.size() != 2) || (tokens[1].size() != 32) || !std::all_of(tokens[1].begin(), tokens[1].end(), &isxdigit)) { @@ -1615,6 +1586,13 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { continue; } + if (tokencmp(tokens[0], "consistencyscan")) { + bool _result = wait(makeInterruptable(consistencyScanCommandActor(localDb, tokens))); + if (!_result) + is_error = true; + continue; + } + if (tokencmp(tokens[0], "profile")) { getTransaction(db, managementTenant, tr, options, intrans); bool _result = wait(makeInterruptable(profileCommandActor(localDb, tr, tokens, intrans))); @@ -1907,41 +1885,39 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { continue; } - if (tokencmp(tokens[0], "createtenant")) { - bool _result = wait(makeInterruptable(createTenantCommandActor(db, tokens))); - if (!_result) + if (tokencmp(tokens[0], "tenant")) { + bool _result = wait(makeInterruptable(tenantCommand(db, tokens))); + if (!_result) { is_error = true; - continue; - } - - if (tokencmp(tokens[0], "deletetenant")) { - bool _result = wait(makeInterruptable(deleteTenantCommandActor(db, tokens))); - if (!_result) - is_error = true; - else if (tenantName.present() && tokens[1] == tenantName.get()) { + } else if (tokens.size() >= 3 && tenantName.present() && tokencmp(tokens[1], "delete") && + tokens[2] == tenantName.get()) { printAtCol("WARNING: the active tenant was deleted. 
Use the `usetenant' or `defaulttenant' " "command to choose a new tenant.\n", 80); } + continue; } - if (tokencmp(tokens[0], "listtenants")) { - bool _result = wait(makeInterruptable(listTenantsCommandActor(db, tokens))); + if (tokencmp(tokens[0], "createtenant") || tokencmp(tokens[0], "deletetenant") || + tokencmp(tokens[0], "listtenants") || tokencmp(tokens[0], "gettenant") || + tokencmp(tokens[0], "configuretenant") || tokencmp(tokens[0], "renametenant")) { + bool _result = wait(makeInterruptable(tenantCommandForwarder(db, tokens))); + if (!_result) { + is_error = true; + } + continue; + } + + if (tokencmp(tokens[0], "tenantgroup")) { + bool _result = wait(makeInterruptable(tenantGroupCommand(db, tokens))); if (!_result) is_error = true; continue; } - if (tokencmp(tokens[0], "gettenant")) { - bool _result = wait(makeInterruptable(getTenantCommandActor(db, tokens, opt.apiVersion))); - if (!_result) - is_error = true; - continue; - } - - if (tokencmp(tokens[0], "renametenant")) { - bool _result = wait(makeInterruptable(renameTenantCommandActor(db, tokens))); + if (tokencmp(tokens[0], "metacluster")) { + bool _result = wait(makeInterruptable(metaclusterCommand(db, tokens))); if (!_result) is_error = true; continue; @@ -1980,7 +1956,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } } -ACTOR Future runCli(CLIOptions opt) { +ACTOR Future runCli(CLIOptions opt, Reference ccf) { state LineNoise linenoise( [](std::string const& line, std::vector& completions) { fdbcliCompCmd(line, completions); }, [enabled = opt.cliHints](std::string const& line) -> LineNoise::Hint { @@ -2044,7 +2020,7 @@ ACTOR Future runCli(CLIOptions opt) { .GetLastError(); } - state int result = wait(cli(opt, &linenoise)); + state int result = wait(cli(opt, &linenoise, ccf)); if (!historyFilename.empty()) { try { @@ -2066,6 +2042,31 @@ ACTOR Future timeExit(double duration) { return Void(); } +const char* checkTlsConfigAgainstCoordAddrs(const ClusterConnectionString& ccs) { + // Resolve TLS config and inspect whether any of the certificate, key, ca bytes has been set + extern TLSConfig tlsConfig; + auto const loaded = tlsConfig.loadSync(); + const bool tlsConfigured = + !loaded.getCertificateBytes().empty() || !loaded.getKeyBytes().empty() || !loaded.getCABytes().empty(); + int tlsAddrs = 0; + int totalAddrs = 0; + for (const auto& addr : ccs.coords) { + if (addr.isTLS()) + tlsAddrs++; + totalAddrs++; + } + for (const auto& host : ccs.hostnames) { + if (host.isTLS) + tlsAddrs++; + totalAddrs++; + } + if (!tlsConfigured && tlsAddrs == totalAddrs) { + return "fdbcli is not configured with TLS, but all of the coordinators have TLS addresses."; + } else { + return nullptr; + } +} + int main(int argc, char** argv) { platformInit(); Error::init(); @@ -2170,15 +2171,37 @@ int main(int argc, char** argv) { return 0; } + Reference ccf; + std::pair resolvedClusterFile = ClusterConnectionFile::lookupClusterFileName(opt.clusterFile); + + try { + ccf = makeReference(resolvedClusterFile.first); + } catch (Error& e) { + if (e.code() == error_code_operation_cancelled) { + throw; + } + fprintf(stderr, "%s\n", ClusterConnectionFile::getErrorString(resolvedClusterFile, e).c_str()); + return 1; + } + + // Make sure that TLS configuration lines up with ":tls" prefix on coordinator addresses + if (auto errorMsg = checkTlsConfigAgainstCoordAddrs(ccf->getConnectionString())) { + fprintf(stderr, "ERROR: %s\n", errorMsg); + return 1; + } + try { API->selectApiVersion(opt.apiVersion); + if (opt.useFutureProtocolVersion) { + 
API->useFutureProtocolVersion(); + } API->setupNetwork(); opt.setupKnobs(); if (opt.exit_code != -1) { return opt.exit_code; } Future memoryUsageMonitor = startMemoryUsageMonitor(opt.memLimit); - Future cliFuture = runCli(opt); + Future cliFuture = runCli(opt, ccf); Future timeoutFuture = opt.exit_timeout ? timeExit(opt.exit_timeout) : Never(); auto f = stopNetworkAfter(success(cliFuture) || timeoutFuture); API->runNetwork(); diff --git a/fdbcli/include/fdbcli/fdbcli.actor.h b/fdbcli/include/fdbcli/fdbcli.actor.h index ba754279e8..dce68eb10b 100644 --- a/fdbcli/include/fdbcli/fdbcli.actor.h +++ b/fdbcli/include/fdbcli/fdbcli.actor.h @@ -95,6 +95,7 @@ extern const KeyRef advanceVersionSpecialKey; extern const KeyRef consistencyCheckSpecialKey; // coordinators extern const KeyRef clusterDescriptionSpecialKey; +extern const KeyRef configDBSpecialKey; extern const KeyRef coordinatorsAutoSpecialKey; extern const KeyRef coordinatorsProcessSpecialKey; // datadistribution @@ -119,7 +120,8 @@ extern const KeyRef ignoreSSFailureSpecialKey; extern const KeyRangeRef processClassSourceSpecialKeyRange; extern const KeyRangeRef processClassTypeSpecialKeyRange; // Other special keys -inline const KeyRef errorMsgSpecialKey = LiteralStringRef("\xff\xff/error_message"); +inline const KeyRef errorMsgSpecialKey = "\xff\xff/error_message"_sr; +inline const KeyRef workerInterfacesVerifyOptionSpecialKey = "\xff\xff/management/options/worker_interfaces/verify"_sr; // help functions (Copied from fdbcli.actor.cpp) // get all workers' info @@ -132,13 +134,14 @@ void printUsage(StringRef command); // Pre: tr failed with special_keys_api_failure error // Read the error message special key and return the message ACTOR Future getSpecialKeysFailureErrorMessage(Reference tr); -// Using \xff\xff/worker_interfaces/ special key, get all worker interfaces +// Using \xff\xff/worker_interfaces/ special key, get all worker interfaces. +// A worker list will be returned from CC. +// If verify, we will try to establish connections to all workers returned. +// In particular, it will deserialize \xff\xff/worker_interfaces/
:= kv pairs and issue RPC +// calls, then only return interfaces(kv pairs) the client can talk to ACTOR Future getWorkerInterfaces(Reference tr, - std::map>* address_interface); -// Deserialize \xff\xff/worker_interfaces/
:= k-v pair and verify by a RPC call -ACTOR Future verifyAndAddInterface(std::map>* address_interface, - Reference connectLock, - KeyValue kv); + std::map>* address_interface, + bool verify = false); // print cluster status info void printStatus(StatusObjectReader statusObj, StatusClient::StatusLevel level, @@ -161,14 +164,12 @@ ACTOR Future configureCommandActor(Reference db, ACTOR Future consistencyCheckCommandActor(Reference tr, std::vector tokens, bool intrans); +// consistency scan command +ACTOR Future consistencyScanCommandActor(Database localDb, std::vector tokens); // coordinators command ACTOR Future coordinatorsCommandActor(Reference db, std::vector tokens); -// createtenant command -ACTOR Future createTenantCommandActor(Reference db, std::vector tokens); // datadistribution command ACTOR Future dataDistributionCommandActor(Reference db, std::vector tokens); -// deletetenant command -ACTOR Future deleteTenantCommandActor(Reference db, std::vector tokens); // exclude command ACTOR Future excludeCommandActor(Reference db, std::vector tokens, Future warn); // expensive_data_check command @@ -184,8 +185,6 @@ ACTOR Future fileConfigureCommandActor(Reference db, bool force); // force_recovery_with_data_loss command ACTOR Future forceRecoveryWithDataLossCommandActor(Reference db, std::vector tokens); -// gettenant command -ACTOR Future getTenantCommandActor(Reference db, std::vector tokens, int apiVersion); // include command ACTOR Future includeCommandActor(Reference db, std::vector tokens); // kill command @@ -193,11 +192,13 @@ ACTOR Future killCommandActor(Reference db, Reference tr, std::vector tokens, std::map>* address_interface); -// listtenants command -ACTOR Future listTenantsCommandActor(Reference db, std::vector tokens); // lock/unlock command ACTOR Future lockCommandActor(Reference db, std::vector tokens); ACTOR Future unlockDatabaseActor(Reference db, UID uid); + +// metacluster command +Future metaclusterCommand(Reference db, std::vector tokens); + // changefeed command ACTOR Future changeFeedCommandActor(Database localDb, Optional tenantEntry, @@ -207,6 +208,11 @@ ACTOR Future changeFeedCommandActor(Database localDb, ACTOR Future blobRangeCommandActor(Database localDb, Optional tenantEntry, std::vector tokens); + +// blobkey command +ACTOR Future blobKeyCommandActor(Database localDb, + Optional tenantEntry, + std::vector tokens); // maintenance command ACTOR Future setHealthyZone(Reference db, StringRef zoneId, double seconds, bool printWarning = false); ACTOR Future clearHealthyZone(Reference db, @@ -218,8 +224,6 @@ ACTOR Future profileCommandActor(Database db, Reference tr, std::vector tokens, bool intrans); -// renametenant command -ACTOR Future renameTenantCommandActor(Reference db, std::vector tokens); // quota command ACTOR Future quotaCommandActor(Reference db, std::vector tokens); // setclass command @@ -236,6 +240,12 @@ ACTOR Future suspendCommandActor(Reference db, Reference tr, std::vector tokens, std::map>* address_interface); +// tenant command +Future tenantCommand(Reference db, std::vector tokens); +// tenant command compatibility layer +Future tenantCommandForwarder(Reference db, std::vector tokens); +// tenantgroup command +Future tenantGroupCommand(Reference db, std::vector tokens); // throttle command ACTOR Future throttleCommandActor(Reference db, std::vector tokens); // triggerteaminfolog command diff --git a/bindings/python/tests/fdbcli_tests.py b/fdbcli/tests/fdbcli_tests.py similarity index 74% rename from bindings/python/tests/fdbcli_tests.py 
rename to fdbcli/tests/fdbcli_tests.py index 386916f5c6..530c80f865 100755 --- a/bindings/python/tests/fdbcli_tests.py +++ b/fdbcli/tests/fdbcli_tests.py @@ -7,6 +7,7 @@ import subprocess import logging import functools import json +import tempfile import time import random from argparse import ArgumentParser, RawDescriptionHelpFormatter @@ -561,6 +562,7 @@ def profile(logger): assert output2 == default_profile_client_get_output # set rate and size limit run_fdbcli_command('profile', 'client', 'set', '0.5', '1GB') + time.sleep(1) # global config can take some time to sync output3 = run_fdbcli_command('profile', 'client', 'get') logger.debug(output3) output3_list = output3.split(' ') @@ -569,6 +571,7 @@ def profile(logger): assert output3_list[-1] == '1000000000.' # change back to default value and check run_fdbcli_command('profile', 'client', 'set', 'default', 'default') + time.sleep(1) # global config can take some time to sync assert run_fdbcli_command('profile', 'client', 'get') == default_profile_client_get_output @@ -590,47 +593,191 @@ def triggerddteaminfolog(logger): output = run_fdbcli_command('triggerddteaminfolog') assert output == 'Triggered team info logging in data distribution.' +def setup_tenants(tenants): + command = '; '.join(['tenant create %s' % t for t in tenants]) + run_fdbcli_command(command) + +def clear_database_and_tenants(): + run_fdbcli_command('writemode on; option on SPECIAL_KEY_SPACE_ENABLE_WRITES; clearrange "" \\xff; clearrange \\xff\\xff/management/tenant/map/ \\xff\\xff/management/tenant/map0') + +def run_tenant_test(test_func): + test_func() + clear_database_and_tenants() @enable_logging() -def tenants(logger): - output = run_fdbcli_command('listtenants') - assert output == 'The cluster has no tenants' +def tenant_create(logger): + output1 = run_fdbcli_command('tenant create tenant') + assert output1 == 'The tenant `tenant\' has been created' - output = run_fdbcli_command('createtenant tenant') - assert output == 'The tenant `tenant\' has been created' - - output = run_fdbcli_command('createtenant tenant2') + output = run_fdbcli_command('tenant create tenant2 tenant_group=tenant_group2') assert output == 'The tenant `tenant2\' has been created' - output = run_fdbcli_command('listtenants') + output = run_fdbcli_command_and_get_error('tenant create tenant') + assert output == 'ERROR: A tenant with the given name already exists (2132)' + +@enable_logging() +def tenant_delete(logger): + setup_tenants(['tenant', 'tenant2']) + run_fdbcli_command('writemode on; usetenant tenant2; set tenant_test value') + + # delete a tenant while the fdbcli is using that tenant + process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=fdbcli_env) + cmd_sequence = ['writemode on', 'usetenant tenant', 'tenant delete tenant', 'get tenant_test', 'defaulttenant', 'usetenant tenant'] + output, error_output = process.communicate(input='\n'.join(cmd_sequence).encode()) + + lines = output.decode().strip().split('\n')[-6:] + error_lines = error_output.decode().strip().split('\n')[-2:] + assert lines[0] == 'Using tenant `tenant\'' + assert lines[1] == 'The tenant `tenant\' has been deleted' + assert lines[2] == 'WARNING: the active tenant was deleted. Use the `usetenant\' or `defaulttenant\'' + assert lines[3] == 'command to choose a new tenant.' 
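+ # stderr is captured separately from stdout, so the two failures land in error_lines:
+ # the `get' issued inside the deleted tenant and the final `usetenant tenant'.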
+ assert error_lines[0] == 'ERROR: Tenant does not exist (2131)' + assert lines[5] == 'Using the default tenant' + assert error_lines[1] == 'ERROR: Tenant `tenant\' does not exist' + + # delete a non-empty tenant + process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=fdbcli_env) + cmd_sequence = ['writemode on', 'tenant delete tenant2', 'usetenant tenant2', 'clear tenant_test', 'defaulttenant', 'tenant delete tenant2'] + output, error_output = process.communicate(input='\n'.join(cmd_sequence).encode()) + + lines = output.decode().strip().split('\n')[-4:] + error_lines = error_output.decode().strip().split('\n')[-1:] + assert error_lines[0] == 'ERROR: Cannot delete a non-empty tenant (2133)' + assert lines[0] == 'Using tenant `tenant2\'' + assert lines[1].startswith('Committed') + assert lines[2] == 'Using the default tenant' + assert lines[3] == 'The tenant `tenant2\' has been deleted' + + # delete a non-existing tenant + output = run_fdbcli_command_and_get_error('tenant delete tenant') + assert output == 'ERROR: Tenant does not exist (2131)' + +@enable_logging() +def tenant_list(logger): + output = run_fdbcli_command('tenant list') + assert output == 'The cluster has no tenants' + + setup_tenants(['tenant', 'tenant2']) + + output = run_fdbcli_command('tenant list') assert output == '1. tenant\n 2. tenant2' - output = run_fdbcli_command('listtenants a z 1') + output = run_fdbcli_command('tenant list a z 1') assert output == '1. tenant' - output = run_fdbcli_command('listtenants a tenant2') + output = run_fdbcli_command('tenant list a tenant2') assert output == '1. tenant' - output = run_fdbcli_command('listtenants tenant2 z') + output = run_fdbcli_command('tenant list tenant2 z') assert output == '1. 
tenant2' - output = run_fdbcli_command('gettenant tenant') + output = run_fdbcli_command('tenant list a b') + assert output == 'The cluster has no tenants in the specified range' + + output = run_fdbcli_command_and_get_error('tenant list b a') + assert output == 'ERROR: end must be larger than begin' + + output = run_fdbcli_command_and_get_error('tenant list a b 12x') + assert output == 'ERROR: invalid limit `12x\'' + +@enable_logging() +def tenant_get(logger): + setup_tenants(['tenant', 'tenant2 tenant_group=tenant_group2']) + + output = run_fdbcli_command('tenant get tenant') lines = output.split('\n') - assert len(lines) == 2 + assert len(lines) == 3 assert lines[0].strip().startswith('id: ') assert lines[1].strip().startswith('prefix: ') + assert lines[2].strip() == 'tenant state: ready' - output = run_fdbcli_command('gettenant tenant JSON') + output = run_fdbcli_command('tenant get tenant JSON') json_output = json.loads(output, strict=False) assert(len(json_output) == 2) assert('tenant' in json_output) assert(json_output['type'] == 'success') - assert(len(json_output['tenant']) == 2) + assert(len(json_output['tenant']) == 4) assert('id' in json_output['tenant']) + assert('encrypted' in json_output['tenant']) assert('prefix' in json_output['tenant']) assert(len(json_output['tenant']['prefix']) == 2) assert('base64' in json_output['tenant']['prefix']) assert('printable' in json_output['tenant']['prefix']) + assert(json_output['tenant']['tenant_state'] == 'ready') + + output = run_fdbcli_command('tenant get tenant2') + lines = output.split('\n') + assert len(lines) == 4 + assert lines[0].strip().startswith('id: ') + assert lines[1].strip().startswith('prefix: ') + assert lines[2].strip() == 'tenant state: ready' + assert lines[3].strip() == 'tenant group: tenant_group2' + + output = run_fdbcli_command('tenant get tenant2 JSON') + json_output = json.loads(output, strict=False) + assert(len(json_output) == 2) + assert('tenant' in json_output) + assert(json_output['type'] == 'success') + assert(len(json_output['tenant']) == 5) + assert('id' in json_output['tenant']) + assert('encrypted' in json_output['tenant']) + assert('prefix' in json_output['tenant']) + assert(json_output['tenant']['tenant_state'] == 'ready') + assert('tenant_group' in json_output['tenant']) + assert(len(json_output['tenant']['tenant_group']) == 2) + assert('base64' in json_output['tenant']['tenant_group']) + assert(json_output['tenant']['tenant_group']['printable'] == 'tenant_group2') + +@enable_logging() +def tenant_configure(logger): + setup_tenants(['tenant']) + + output = run_fdbcli_command('tenant configure tenant tenant_group=tenant_group1') + assert output == 'The configuration for tenant `tenant\' has been updated' + + output = run_fdbcli_command('tenant get tenant') + lines = output.split('\n') + assert len(lines) == 4 + assert lines[3].strip() == 'tenant group: tenant_group1' + + output = run_fdbcli_command('tenant configure tenant unset tenant_group') + assert output == 'The configuration for tenant `tenant\' has been updated' + + output = run_fdbcli_command('tenant get tenant') + lines = output.split('\n') + assert len(lines) == 3 + + output = run_fdbcli_command_and_get_error('tenant configure tenant tenant_group=tenant_group1 tenant_group=tenant_group2') + assert output == 'ERROR: configuration parameter `tenant_group\' specified more than once.' + + output = run_fdbcli_command_and_get_error('tenant configure tenant unset') + assert output == 'ERROR: `unset\' specified without a configuration parameter.' 
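+ # Remaining invalid cases: a value supplied to `unset', a parameter with no `=value',
+ # and configuring a tenant that does not exist.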
+ + output = run_fdbcli_command_and_get_error('tenant configure tenant unset tenant_group=tenant_group1') + assert output == 'ERROR: unrecognized configuration parameter `tenant_group=tenant_group1\'.' + + output = run_fdbcli_command_and_get_error('tenant configure tenant tenant_group') + assert output == 'ERROR: invalid configuration string `tenant_group\'. String must specify a value using `=\'.' + + output = run_fdbcli_command_and_get_error('tenant configure tenant3 tenant_group=tenant_group1') + assert output == 'ERROR: Tenant does not exist (2131)' + +@enable_logging() +def tenant_rename(logger): + setup_tenants(['tenant', 'tenant2']) + + output = run_fdbcli_command('tenant rename tenant tenant3') + assert output == 'The tenant `tenant\' has been renamed to `tenant3\'' + + output = run_fdbcli_command_and_get_error('tenant rename tenant tenant4') + assert output == 'ERROR: Tenant does not exist (2131)' + + output = run_fdbcli_command_and_get_error('tenant rename tenant2 tenant3') + assert output == 'ERROR: A tenant with the given name already exists (2132)' + +@enable_logging() +def tenant_usetenant(logger): + setup_tenants(['tenant', 'tenant2']) output = run_fdbcli_command('usetenant') assert output == 'Using the default tenant' @@ -663,44 +810,103 @@ def tenants(logger): assert lines[3] == '`tenant_test\' is `tenant2\'' process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, env=fdbcli_env) - cmd_sequence = ['usetenant tenant', 'get tenant_test', 'defaulttenant', 'get tenant_test'] + cmd_sequence = ['usetenant tenant', 'get tenant_test', 'usetenant tenant2', 'get tenant_test', 'defaulttenant', 'get tenant_test'] output, _ = process.communicate(input='\n'.join(cmd_sequence).encode()) - lines = output.decode().strip().split('\n')[-4:] + lines = output.decode().strip().split('\n')[-6:] assert lines[0] == 'Using tenant `tenant\'' assert lines[1] == '`tenant_test\' is `tenant\'' - assert lines[2] == 'Using the default tenant' - assert lines[3] == '`tenant_test\' is `default_tenant\'' + assert lines[2] == 'Using tenant `tenant2\'' + assert lines[3] == '`tenant_test\' is `tenant2\'' + assert lines[4] == 'Using the default tenant' + assert lines[5] == '`tenant_test\' is `default_tenant\'' - process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=fdbcli_env) - cmd_sequence = ['writemode on', 'usetenant tenant', 'clear tenant_test', - 'deletetenant tenant', 'get tenant_test', 'defaulttenant', 'usetenant tenant'] - output, error_output = process.communicate(input='\n'.join(cmd_sequence).encode()) +@enable_logging() +def tenant_old_commands(logger): + create_output = run_fdbcli_command('tenant create tenant') + list_output = run_fdbcli_command('tenant list') + get_output = run_fdbcli_command('tenant get tenant') + # Run the gettenant command here because the ID will be different in the second block + get_output_old = run_fdbcli_command('gettenant tenant') + configure_output = run_fdbcli_command('tenant configure tenant tenant_group=tenant_group1') + rename_output = run_fdbcli_command('tenant rename tenant tenant2') + delete_output = run_fdbcli_command('tenant delete tenant2') - lines = output.decode().strip().split('\n')[-7:] - error_lines = error_output.decode().strip().split('\n')[-2:] - assert lines[0] == 'Using tenant `tenant\'' - assert lines[1].startswith('Committed') - assert lines[2] == 'The tenant `tenant\' has been deleted' - assert lines[3] == 'WARNING: the active tenant was 
deleted. Use the `usetenant\' or `defaulttenant\'' - assert lines[4] == 'command to choose a new tenant.' - assert error_lines[0] == 'ERROR: Tenant does not exist (2131)' - assert lines[6] == 'Using the default tenant' - assert error_lines[1] == 'ERROR: Tenant `tenant\' does not exist' + create_output_old = run_fdbcli_command('createtenant tenant') + list_output_old = run_fdbcli_command('listtenants') + configure_output_old = run_fdbcli_command('configuretenant tenant tenant_group=tenant_group1') + rename_output_old = run_fdbcli_command('renametenant tenant tenant2') + delete_output_old = run_fdbcli_command('deletetenant tenant2') - process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=fdbcli_env) - cmd_sequence = ['writemode on', 'deletetenant tenant2', 'usetenant tenant2', 'clear tenant_test', 'defaulttenant', 'deletetenant tenant2'] - output, error_output = process.communicate(input='\n'.join(cmd_sequence).encode()) + assert create_output == create_output_old + assert list_output == list_output_old + assert get_output == get_output_old + assert configure_output == configure_output_old + assert rename_output == rename_output_old + assert delete_output == delete_output_old - lines = output.decode().strip().split('\n')[-4:] - error_lines = error_output.decode().strip().split('\n')[-1:] - assert error_lines[0] == 'ERROR: Cannot delete a non-empty tenant (2133)' - assert lines[0] == 'Using tenant `tenant2\'' - assert lines[1].startswith('Committed') - assert lines[2] == 'Using the default tenant' - assert lines[3] == 'The tenant `tenant2\' has been deleted' +@enable_logging() +def tenant_group_list(logger): + output = run_fdbcli_command('tenantgroup list') + assert output == 'The cluster has no tenant groups' - run_fdbcli_command('writemode on; clear tenant_test') + setup_tenants(['tenant', 'tenant2 tenant_group=tenant_group2', 'tenant3 tenant_group=tenant_group3']) + + output = run_fdbcli_command('tenantgroup list') + assert output == '1. tenant_group2\n 2. tenant_group3' + + output = run_fdbcli_command('tenantgroup list a z 1') + assert output == '1. tenant_group2' + + output = run_fdbcli_command('tenantgroup list a tenant_group3') + assert output == '1. tenant_group2' + + output = run_fdbcli_command('tenantgroup list tenant_group3 z') + assert output == '1. 
tenant_group3' + + output = run_fdbcli_command('tenantgroup list a b') + assert output == 'The cluster has no tenant groups in the specified range' + + output = run_fdbcli_command_and_get_error('tenantgroup list b a') + assert output == 'ERROR: end must be larger than begin' + + output = run_fdbcli_command_and_get_error('tenantgroup list a b 12x') + assert output == 'ERROR: invalid limit `12x\'' + +@enable_logging() +def tenant_group_get(logger): + setup_tenants(['tenant tenant_group=tenant_group']) + + output = run_fdbcli_command('tenantgroup get tenant_group') + assert output == 'The tenant group is present in the cluster' + + output = run_fdbcli_command('tenantgroup get tenant_group JSON') + json_output = json.loads(output, strict=False) + assert(len(json_output) == 2) + assert('tenant_group' in json_output) + assert(json_output['type'] == 'success') + assert(len(json_output['tenant_group']) == 0) + + output = run_fdbcli_command_and_get_error('tenantgroup get tenant_group2') + assert output == 'ERROR: tenant group not found' + + output = run_fdbcli_command('tenantgroup get tenant_group2 JSON') + json_output = json.loads(output, strict=False) + assert(len(json_output) == 2) + assert(json_output['type'] == 'error') + assert(json_output['error'] == 'tenant group not found') + +def tenants(): + run_tenant_test(tenant_create) + run_tenant_test(tenant_delete) + run_tenant_test(tenant_list) + run_tenant_test(tenant_get) + run_tenant_test(tenant_configure) + run_tenant_test(tenant_rename) + run_tenant_test(tenant_usetenant) + run_tenant_test(tenant_old_commands) + run_tenant_test(tenant_group_list) + run_tenant_test(tenant_group_get) def integer_options(): process = subprocess.Popen(command_template[:-1], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=fdbcli_env) @@ -712,6 +918,24 @@ def integer_options(): assert lines[1].startswith('Committed') assert error_output == b'' +def tls_address_suffix(): + # fdbcli shall prevent a non-TLS fdbcli run from connecting to an all-TLS cluster + preamble = 'eNW1yf1M:eNW1yf1M@' + num_server_addrs = [1, 2, 5] + err_output_server_tls = "ERROR: fdbcli is not configured with TLS, but all of the coordinators have TLS addresses." + + with tempfile.TemporaryDirectory() as tmpdir: + cluster_fn = tmpdir + "/fdb.cluster" + for num_server_addr in num_server_addrs: + with open(cluster_fn, "w") as fp: + fp.write(preamble + ",".join( + ["127.0.0.1:{}:tls".format(4000 + addr_idx) for addr_idx in range(num_server_addr)])) + fp.close() + fdbcli_process = subprocess.run(command_template[:2] + [cluster_fn], capture_output=True) + assert fdbcli_process.returncode != 0 + err_out = fdbcli_process.stderr.decode("utf8").strip() + assert err_out == err_output_server_tls, f"unexpected output: {err_out}" + if __name__ == '__main__': parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter, description=""" @@ -758,6 +982,7 @@ if __name__ == '__main__': tenants() versionepoch() integer_options() + tls_address_suffix() else: assert args.process_number > 1, "Process number should be positive" coordinators() diff --git a/fdbclient/Atomic.cpp b/fdbclient/Atomic.cpp new file mode 100644 index 0000000000..f2614e3881 --- /dev/null +++ b/fdbclient/Atomic.cpp @@ -0,0 +1,47 @@ +/* + * Atomic.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/Atomic.h" +#include "flow/Arena.h" +#include "flow/UnitTest.h" + +void forceLinkAtomicTests() {} + +TEST_CASE("/Atomic/DoAppendIfFits") { + Arena arena; + { + Value existingValue = ValueRef(arena, "existing"_sr); + Value otherOperand = ValueRef(arena, "other"_sr); + auto result = doAppendIfFits(existingValue, otherOperand, arena); + ASSERT(compare("existingother"_sr, result) == 0); + } + { + Value existingValue = makeString(CLIENT_KNOBS->VALUE_SIZE_LIMIT - 1, arena); + Value otherOperand = makeString(2, arena); + deterministicRandom()->randomBytes(mutateString(existingValue), existingValue.size()); + deterministicRandom()->randomBytes(mutateString(otherOperand), otherOperand.size()); + // Appended values cannot fit in result, should return existingValue + auto result = doAppendIfFits(existingValue, otherOperand, arena); + ASSERT(compare(existingValue, result) == 0); + } + return Void(); +} + +// TODO: Add more unit tests for atomic operations defined in Atomic.h diff --git a/fdbclient/BackupAgentBase.actor.cpp b/fdbclient/BackupAgentBase.actor.cpp index cb999a6e12..ab136878c4 100644 --- a/fdbclient/BackupAgentBase.actor.cpp +++ b/fdbclient/BackupAgentBase.actor.cpp @@ -22,6 +22,10 @@ #include #include "fdbclient/BackupAgent.actor.h" +#include "fdbclient/BlobCipher.h" +#include "fdbclient/GetEncryptCipherKeys.actor.h" +#include "fdbclient/DatabaseContext.h" +#include "fdbclient/Metacluster.h" #include "fdbrpc/simulator.h" #include "flow/ActorCollection.h" #include "flow/actorcompiler.h" // has to be last include @@ -253,16 +257,18 @@ std::pair decodeBKMutationLogKey(Key key) { bigEndian32(*(int32_t*)(key.begin() + backupLogPrefixBytes + sizeof(UID) + sizeof(uint8_t) + sizeof(int64_t)))); } -void decodeBackupLogValue(Arena& arena, - VectorRef& result, - int& mutationSize, - StringRef value, - StringRef addPrefix, - StringRef removePrefix, - Version version, - Reference> key_version) { +ACTOR static Future decodeBackupLogValue(Arena* arena, + VectorRef* result, + VectorRef>* encryptedResult, + int* mutationSize, + Standalone value, + Key addPrefix, + Key removePrefix, + Version version, + Reference> key_version, + Database cx) { try { - uint64_t offset(0); + state uint64_t offset(0); uint64_t protocolVersion = 0; memcpy(&protocolVersion, value.begin(), sizeof(uint64_t)); offset += sizeof(uint64_t); @@ -274,36 +280,48 @@ void decodeBackupLogValue(Arena& arena, throw incompatible_protocol_version(); } - uint32_t totalBytes = 0; + state uint32_t totalBytes = 0; memcpy(&totalBytes, value.begin() + offset, sizeof(uint32_t)); offset += sizeof(uint32_t); - uint32_t consumed = 0; + state uint32_t consumed = 0; if (totalBytes + offset > value.size()) throw restore_missing_data(); - int originalOffset = offset; + state int originalOffset = offset; while (consumed < totalBytes) { uint32_t type = 0; memcpy(&type, value.begin() + offset, sizeof(uint32_t)); offset += sizeof(uint32_t); - 
uint32_t len1 = 0; + state uint32_t len1 = 0; memcpy(&len1, value.begin() + offset, sizeof(uint32_t)); offset += sizeof(uint32_t); - uint32_t len2 = 0; + state uint32_t len2 = 0; memcpy(&len2, value.begin() + offset, sizeof(uint32_t)); offset += sizeof(uint32_t); ASSERT(offset + len1 + len2 <= value.size() && isValidMutationType(type)); - MutationRef logValue; - Arena tempArena; + state MutationRef logValue; + state Arena tempArena; logValue.type = type; logValue.param1 = value.substr(offset, len1); offset += len1; logValue.param2 = value.substr(offset, len2); offset += len2; + state Optional encryptedLogValue = Optional(); + + // Decrypt mutation ref if encrypted + if (logValue.isEncrypted()) { + encryptedLogValue = logValue; + Reference const> dbInfo = cx->clientInfo; + TextAndHeaderCipherKeys cipherKeys = + wait(getEncryptCipherKeys(dbInfo, *logValue.encryptionHeader(), BlobCipherMetrics::BACKUP)); + logValue = logValue.decrypt(cipherKeys, tempArena, BlobCipherMetrics::BACKUP); + } + ASSERT(!logValue.isEncrypted()); + MutationRef originalLogValue = logValue; if (logValue.type == MutationRef::ClearRange) { KeyRangeRef range(logValue.param1, logValue.param2); @@ -311,7 +329,7 @@ void decodeBackupLogValue(Arena& arena, for (auto r : ranges) { if (version > r.value() && r.value() != invalidVersion) { KeyRef minKey = std::min(r.range().end, range.end); - if (minKey == (removePrefix == StringRef() ? normalKeys.end : strinc(removePrefix))) { + if (minKey == (removePrefix == StringRef() ? allKeys.end : strinc(removePrefix))) { logValue.param1 = std::max(r.range().begin, range.begin); if (removePrefix.size()) { logValue.param1 = logValue.param1.removePrefix(removePrefix); @@ -319,9 +337,9 @@ void decodeBackupLogValue(Arena& arena, if (addPrefix.size()) { logValue.param1 = logValue.param1.withPrefix(addPrefix, tempArena); } - logValue.param2 = addPrefix == StringRef() ? normalKeys.end : strinc(addPrefix, tempArena); - result.push_back_deep(arena, logValue); - mutationSize += logValue.expectedSize(); + logValue.param2 = addPrefix == StringRef() ? 
allKeys.end : strinc(addPrefix, tempArena); + result->push_back_deep(*arena, logValue); + *mutationSize += logValue.expectedSize(); } else { logValue.param1 = std::max(r.range().begin, range.begin); logValue.param2 = minKey; @@ -333,8 +351,13 @@ void decodeBackupLogValue(Arena& arena, logValue.param1 = logValue.param1.withPrefix(addPrefix, tempArena); logValue.param2 = logValue.param2.withPrefix(addPrefix, tempArena); } - result.push_back_deep(arena, logValue); - mutationSize += logValue.expectedSize(); + result->push_back_deep(*arena, logValue); + *mutationSize += logValue.expectedSize(); + } + if (originalLogValue.param1 == logValue.param1 && originalLogValue.param2 == logValue.param2) { + encryptedResult->push_back_deep(*arena, encryptedLogValue); + } else { + encryptedResult->push_back_deep(*arena, Optional()); } } } @@ -348,8 +371,15 @@ void decodeBackupLogValue(Arena& arena, if (addPrefix.size()) { logValue.param1 = logValue.param1.withPrefix(addPrefix, tempArena); } - result.push_back_deep(arena, logValue); - mutationSize += logValue.expectedSize(); + result->push_back_deep(*arena, logValue); + *mutationSize += logValue.expectedSize(); + // If we did not remove/add prefixes to the mutation then keep the original encrypted mutation so we + // do not have to re-encrypt unnecessarily + if (originalLogValue.param1 == logValue.param1 && originalLogValue.param2 == logValue.param2) { + encryptedResult->push_back_deep(*arena, encryptedLogValue); + } else { + encryptedResult->push_back_deep(*arena, Optional()); + } } } @@ -374,6 +404,7 @@ void decodeBackupLogValue(Arena& arena, .detail("Value", value); throw; } + return Void(); } static double lastErrorTime = 0; @@ -414,7 +445,7 @@ ACTOR Future readCommitted(Database cx, loop { try { state GetRangeLimits limits(GetRangeLimits::ROW_LIMIT_UNLIMITED, - (g_network->isSimulated() && !g_simulator.speedUpSimulation) + (g_network->isSimulated() && !g_simulator->speedUpSimulation) ? CLIENT_KNOBS->BACKUP_SIMULATED_LIMIT_BYTES : CLIENT_KNOBS->BACKUP_GET_RANGE_LIMIT_BYTES); @@ -493,7 +524,7 @@ ACTOR Future readCommitted(Database cx, loop { try { state GetRangeLimits limits(GetRangeLimits::ROW_LIMIT_UNLIMITED, - (g_network->isSimulated() && !g_simulator.speedUpSimulation) + (g_network->isSimulated() && !g_simulator->speedUpSimulation) ? 
CLIENT_KNOBS->BACKUP_SIMULATED_LIMIT_BYTES : CLIENT_KNOBS->BACKUP_GET_RANGE_LIMIT_BYTES); @@ -614,21 +645,24 @@ ACTOR Future dumpData(Database cx, state int mutationSize = 0; loop { try { - RCGroup group = waitNext(results.getFuture()); + state RCGroup group = waitNext(results.getFuture()); lock->release(group.items.expectedSize()); BinaryWriter bw(Unversioned()); for (int i = 0; i < group.items.size(); ++i) { bw.serializeBytes(group.items[i].value); } - decodeBackupLogValue(req.arena, - req.transaction.mutations, - mutationSize, - bw.toValue(), - addPrefix, - removePrefix, - group.groupKey, - keyVersion); + Standalone value = bw.toValue(); + wait(decodeBackupLogValue(&req.arena, + &req.transaction.mutations, + &req.transaction.encryptedMutations, + &mutationSize, + value, + addPrefix, + removePrefix, + group.groupKey, + keyVersion, + cx)); newBeginVersion = group.groupKey + 1; if (mutationSize >= CLIENT_KNOBS->BACKUP_LOG_WRITE_BATCH_MAX_SIZE) { break; @@ -652,8 +686,10 @@ ACTOR Future dumpData(Database cx, Key rangeEnd = getApplyKey(newBeginVersion, uid); req.transaction.mutations.push_back_deep(req.arena, MutationRef(MutationRef::SetValue, applyBegin, versionKey)); + req.transaction.encryptedMutations.push_back_deep(req.arena, Optional()); req.transaction.write_conflict_ranges.push_back_deep(req.arena, singleKeyRange(applyBegin)); req.transaction.mutations.push_back_deep(req.arena, MutationRef(MutationRef::ClearRange, rangeBegin, rangeEnd)); + req.transaction.encryptedMutations.push_back_deep(req.arena, Optional()); req.transaction.write_conflict_ranges.push_back_deep(req.arena, singleKeyRange(rangeBegin)); // The commit request contains no read conflict ranges, so regardless of what read version we @@ -968,10 +1004,9 @@ ACTOR Future cleanupLogMutations(Database cx, Value destUidValue, bool del .get(BackupAgentBase::keySourceStates) .get(currLogUid) .pack(DatabaseBackupAgent::keyStateStatus)); - state Future> foundBackupKey = - tr->get(Subspace(currLogUid.withPrefix(LiteralStringRef("uid->config/")) - .withPrefix(fileBackupPrefixRange.begin)) - .pack(LiteralStringRef("stateEnum"))); + state Future> foundBackupKey = tr->get( + Subspace(currLogUid.withPrefix("uid->config/"_sr).withPrefix(fileBackupPrefixRange.begin)) + .pack("stateEnum"_sr)); wait(success(foundDRKey) && success(foundBackupKey)); if (foundDRKey.get().present() && foundBackupKey.get().present()) { @@ -1165,3 +1200,38 @@ Standalone BackupAgentBase::getCurrentTime() { } std::string const BackupAgentBase::defaultTagName = "default"; + +void addDefaultBackupRanges(Standalone>& backupKeys) { + backupKeys.push_back_deep(backupKeys.arena(), normalKeys); + + for (auto& r : getSystemBackupRanges()) { + backupKeys.push_back_deep(backupKeys.arena(), r); + } +} + +VectorRef const& getSystemBackupRanges() { + static Standalone> systemBackupRanges; + if (systemBackupRanges.empty()) { + systemBackupRanges.push_back_deep(systemBackupRanges.arena(), prefixRange(TenantMetadata::subspace())); + systemBackupRanges.push_back_deep(systemBackupRanges.arena(), + singleKeyRange(MetaclusterMetadata::metaclusterRegistration().key)); + } + + return systemBackupRanges; +} + +KeyRangeMap const& systemBackupMutationMask() { + static KeyRangeMap mask; + if (mask.size() == 1) { + for (auto r : getSystemBackupRanges()) { + mask.insert(r, true); + } + } + + return mask; +} + +KeyRangeRef const& getDefaultBackupSharedRange() { + static KeyRangeRef defaultSharedRange(""_sr, ""_sr); + return defaultSharedRange; +} diff --git a/fdbclient/BackupContainer.actor.cpp 
b/fdbclient/BackupContainer.actor.cpp index 9cf2d0c0c3..c318a6591d 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -288,11 +288,46 @@ Reference IBackupContainer::openContainer(const std::string& u #ifdef BUILD_AZURE_BACKUP else if (u.startsWith("azure://"_sr)) { u.eat("azure://"_sr); - auto accountName = u.eat("@"_sr).toString(); - auto endpoint = u.eat("/"_sr).toString(); - auto containerName = u.eat("/"_sr).toString(); - r = makeReference( - endpoint, accountName, containerName, encryptionKeyFileName); + auto address = u.eat("/"_sr); + if (address.endsWith(std::string(azure::storage_lite::constants::default_endpoint_suffix))) { + CODE_PROBE(true, "Azure backup url with standard azure storage account endpoint"); + // ..core.windows.net/ + auto endPoint = address.toString(); + auto accountName = address.eat("."_sr).toString(); + auto containerName = u.eat("/"_sr).toString(); + r = makeReference( + endPoint, accountName, containerName, encryptionKeyFileName); + } else { + // resolve the network address if necessary + std::string endpoint(address.toString()); + Optional parsedAddress = NetworkAddress::parseOptional(endpoint); + if (!parsedAddress.present()) { + try { + auto hostname = Hostname::parse(endpoint); + auto resolvedAddress = hostname.resolveBlocking(); + if (resolvedAddress.present()) { + CODE_PROBE(true, "Azure backup url with hostname in the endpoint"); + parsedAddress = resolvedAddress.get(); + } + } catch (Error& e) { + TraceEvent(SevError, "InvalidAzureBackupUrl").error(e).detail("Endpoint", endpoint); + throw backup_invalid_url(); + } + } + if (!parsedAddress.present()) { + TraceEvent(SevError, "InvalidAzureBackupUrl").detail("Endpoint", endpoint); + throw backup_invalid_url(); + } + auto accountName = u.eat("/"_sr).toString(); + // Avoid including ":tls" and "(fromHostname)" + // note: the endpoint needs to contain the account name + // so either ".blob.core.windows.net" or ":/" + endpoint = + fmt::format("{}/{}", formatIpPort(parsedAddress.get().ip, parsedAddress.get().port), accountName); + auto containerName = u.eat("/"_sr).toString(); + r = makeReference( + endpoint, accountName, containerName, encryptionKeyFileName); + } } #endif else { @@ -393,21 +428,21 @@ ACTOR Future timeKeeperVersionFromDatetime(std::string datetime, Databa try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - state std::vector> results = + state KeyBackedRangeResult> rangeResult = wait(versionMap.getRange(tr, 0, time, 1, Snapshot::False, Reverse::True)); - if (results.size() != 1) { + if (rangeResult.results.size() != 1) { // No key less than time was found in the database // Look for a key >= time. - wait(store(results, versionMap.getRange(tr, time, std::numeric_limits::max(), 1))); + wait(store(rangeResult, versionMap.getRange(tr, time, std::numeric_limits::max(), 1))); - if (results.size() != 1) { + if (rangeResult.results.size() != 1) { fprintf(stderr, "ERROR: Unable to calculate a version for given date/time.\n"); throw backup_error(); } } // Adjust version found by the delta between time and the time found and min with 0. 
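+ // Illustrative arithmetic for the conversion below (assuming the default knob value
+ // CORE_VERSIONSPERSECOND of 1e6, i.e. one million versions per second): the timekeeper
+ // map stores periodic (seconds, version) samples, so a requested time of 1003s against
+ // a nearest sample of (1000s, 5'000'000'000) is estimated as
+ //   5'000'000'000 + (1003 - 1000) * 1'000'000 = 5'003'000'000.
+ // The max() with 0 clamps estimates that would otherwise fall before version 0.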
- auto& result = results[0]; + auto& result = rangeResult.results[0]; return std::max(0, result.second + (time - result.first) * CLIENT_KNOBS->CORE_VERSIONSPERSECOND); } catch (Error& e) { @@ -432,21 +467,21 @@ ACTOR Future> timeKeeperEpochsFromVersion(Version v, Reference mid = (min + max + 1) / 2; // ceiling // Find the highest time < mid - state std::vector> results = + state KeyBackedRangeResult> rangeResult = wait(versionMap.getRange(tr, min, mid, 1, Snapshot::False, Reverse::True)); - if (results.size() != 1) { + if (rangeResult.results.size() != 1) { if (mid == min) { // There aren't any records having a version < v, so just look for any record having a time < now // and base a result on it - wait(store(results, versionMap.getRange(tr, 0, (int64_t)now(), 1))); + wait(store(rangeResult, versionMap.getRange(tr, 0, (int64_t)now(), 1))); - if (results.size() != 1) { + if (rangeResult.results.size() != 1) { // There aren't any timekeeper records to base a result on so return nothing return Optional(); } - found = results[0]; + found = rangeResult.results[0]; break; } @@ -454,7 +489,7 @@ ACTOR Future> timeKeeperEpochsFromVersion(Version v, Reference continue; } - found = results[0]; + found = rangeResult.results[0]; if (v < found.second) { max = found.first; diff --git a/fdbclient/BackupContainerFileSystem.actor.cpp b/fdbclient/BackupContainerFileSystem.actor.cpp index b413bae0c8..5ad037d993 100644 --- a/fdbclient/BackupContainerFileSystem.actor.cpp +++ b/fdbclient/BackupContainerFileSystem.actor.cpp @@ -906,6 +906,7 @@ public: ACTOR static Future> getRestoreSet(Reference bc, Version targetVersion, VectorRef keyRangesFilter, + Optional cx, bool logsOnly = false, Version beginVersion = invalidVersion) { for (const auto& range : keyRangesFilter) { @@ -982,7 +983,7 @@ public: restorable.ranges.end(), [file = rit->first](const RangeFile f) { return f.fileName == file; }); ASSERT(it != restorable.ranges.end()); - KeyRange result = wait(bc->getSnapshotFileKeyRange(*it)); + KeyRange result = wait(bc->getSnapshotFileKeyRange(*it, cx)); ASSERT(rit->second.begin <= result.begin && rit->second.end >= result.end); } } @@ -1349,7 +1350,9 @@ Future BackupContainerFileSystem::expireData(Version expireEndVersion, Reference::addRef(this), expireEndVersion, force, progress, restorableBeginVersion); } -ACTOR static Future getSnapshotFileKeyRange_impl(Reference bc, RangeFile file) { +ACTOR static Future getSnapshotFileKeyRange_impl(Reference bc, + RangeFile file, + Optional cx) { state int readFileRetries = 0; state bool beginKeySet = false; state Key beginKey; @@ -1361,7 +1364,8 @@ ACTOR static Future getSnapshotFileKeyRange_impl(Reference(file.blockSize, file.fileSize - j); - Standalone> blockData = wait(fileBackup::decodeRangeFileBlock(inFile, j, len)); + Standalone> blockData = + wait(fileBackup::decodeRangeFileBlock(inFile, j, len, cx)); if (!beginKeySet) { beginKey = blockData.front().key; beginKeySet = true; @@ -1434,17 +1438,18 @@ ACTOR static Future> readVersionProperty(Reference BackupContainerFileSystem::getSnapshotFileKeyRange(const RangeFile& file) { +Future BackupContainerFileSystem::getSnapshotFileKeyRange(const RangeFile& file, Optional cx) { ASSERT(g_network->isSimulated()); - return getSnapshotFileKeyRange_impl(Reference::addRef(this), file); + return getSnapshotFileKeyRange_impl(Reference::addRef(this), file, cx); } Future> BackupContainerFileSystem::getRestoreSet(Version targetVersion, + Optional cx, VectorRef keyRangesFilter, bool logsOnly, Version beginVersion) { return 
BackupContainerFileSystemImpl::getRestoreSet( - Reference::addRef(this), targetVersion, keyRangesFilter, logsOnly, beginVersion); + Reference::addRef(this), targetVersion, keyRangesFilter, cx, logsOnly, beginVersion); } Future> BackupContainerFileSystem::VersionProperty::get() { @@ -1523,11 +1528,46 @@ Reference BackupContainerFileSystem::openContainerFS( #ifdef BUILD_AZURE_BACKUP else if (u.startsWith("azure://"_sr)) { u.eat("azure://"_sr); - auto accountName = u.eat("@"_sr).toString(); - auto endpoint = u.eat("/"_sr).toString(); - auto containerName = u.eat("/"_sr).toString(); - r = makeReference( - endpoint, accountName, containerName, encryptionKeyFileName); + auto address = u.eat("/"_sr); + if (address.endsWith(std::string(azure::storage_lite::constants::default_endpoint_suffix))) { + CODE_PROBE(true, "Azure backup url with standard azure storage account endpoint"); + // ..core.windows.net/ + auto endPoint = address.toString(); + auto accountName = address.eat("."_sr).toString(); + auto containerName = u.eat("/"_sr).toString(); + r = makeReference( + endPoint, accountName, containerName, encryptionKeyFileName); + } else { + // resolve the network address if necessary + std::string endpoint(address.toString()); + Optional parsedAddress = NetworkAddress::parseOptional(endpoint); + if (!parsedAddress.present()) { + try { + auto hostname = Hostname::parse(endpoint); + auto resolvedAddress = hostname.resolveBlocking(); + if (resolvedAddress.present()) { + CODE_PROBE(true, "Azure backup url with hostname in the endpoint"); + parsedAddress = resolvedAddress.get(); + } + } catch (Error& e) { + TraceEvent(SevError, "InvalidAzureBackupUrl").error(e).detail("Endpoint", endpoint); + throw backup_invalid_url(); + } + } + if (!parsedAddress.present()) { + TraceEvent(SevError, "InvalidAzureBackupUrl").detail("Endpoint", endpoint); + throw backup_invalid_url(); + } + auto accountName = u.eat("/"_sr).toString(); + // Avoid including ":tls" and "(fromHostname)" + // note: the endpoint needs to contain the account name + // so either ".blob.core.windows.net" or ":/" + endpoint = + fmt::format("{}/{}", formatIpPort(parsedAddress.get().ip, parsedAddress.get().port), accountName); + auto containerName = u.eat("/"_sr).toString(); + r = makeReference( + endpoint, accountName, containerName, encryptionKeyFileName); + } } #endif else { @@ -1631,7 +1671,8 @@ ACTOR static Future testWriteSnapshotFile(Reference file, Key ACTOR Future testBackupContainer(std::string url, Optional proxy, - Optional encryptionKeyFileName) { + Optional encryptionKeyFileName, + Optional cx) { state FlowLock lock(100e6); if (encryptionKeyFileName.present()) { @@ -1662,7 +1703,7 @@ ACTOR Future testBackupContainer(std::string url, // List of sizes to use to test edge cases on underlying file implementations state std::vector fileSizes = { 0 }; - if (StringRef(url).startsWith(LiteralStringRef("blob"))) { + if (StringRef(url).startsWith("blob"_sr)) { fileSizes.push_back(CLIENT_KNOBS->BLOBSTORE_MULTIPART_MIN_PART_SIZE); fileSizes.push_back(CLIENT_KNOBS->BLOBSTORE_MULTIPART_MIN_PART_SIZE + 10); } @@ -1670,8 +1711,8 @@ ACTOR Future testBackupContainer(std::string url, loop { state Version logStart = v; state int kvfiles = deterministicRandom()->randomInt(0, 3); - state Key begin = LiteralStringRef(""); - state Key end = LiteralStringRef(""); + state Key begin = ""_sr; + state Key end = ""_sr; state int blockSize = 3 * sizeof(uint32_t) + begin.size() + end.size() + 8; while (kvfiles > 0) { @@ -1738,13 +1779,13 @@ ACTOR Future 
testBackupContainer(std::string url, for (; i < listing.snapshots.size(); ++i) { { // Ensure we can still restore to the latest version - Optional rest = wait(c->getRestoreSet(desc.maxRestorableVersion.get())); + Optional rest = wait(c->getRestoreSet(desc.maxRestorableVersion.get(), cx)); ASSERT(rest.present()); } { // Ensure we can restore to the end version of snapshot i - Optional rest = wait(c->getRestoreSet(listing.snapshots[i].endVersion)); + Optional rest = wait(c->getRestoreSet(listing.snapshots[i].endVersion, cx)); ASSERT(rest.present()); } @@ -1785,14 +1826,16 @@ ACTOR Future testBackupContainer(std::string url, } TEST_CASE("/backup/containers/localdir/unencrypted") { - wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {}, {})); + wait(testBackupContainer( + format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {}, {}, {})); return Void(); } TEST_CASE("/backup/containers/localdir/encrypted") { wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {}, - format("%s/test_encryption_key", params.getDataDir().c_str()))); + format("%s/test_encryption_key", params.getDataDir().c_str()), + {})); return Void(); } @@ -1800,7 +1843,7 @@ TEST_CASE("/backup/containers/url") { if (!g_network->isSimulated()) { const char* url = getenv("FDB_TEST_BACKUP_URL"); ASSERT(url != nullptr); - wait(testBackupContainer(url, {}, {})); + wait(testBackupContainer(url, {}, {}, {})); } return Void(); } diff --git a/fdbclient/BackupContainerLocalDirectory.actor.cpp b/fdbclient/BackupContainerLocalDirectory.actor.cpp index 528910dabc..51abc24678 100644 --- a/fdbclient/BackupContainerLocalDirectory.actor.cpp +++ b/fdbclient/BackupContainerLocalDirectory.actor.cpp @@ -103,16 +103,15 @@ ACTOR static Future listFiles_impl(st // Remove .lnk files from results, they are a side effect of a backup that was *read* during simulation. See // openFile() above for more info on why they are created. if (g_network->isSimulated()) - files.erase( - std::remove_if(files.begin(), - files.end(), - [](std::string const& f) { return StringRef(f).endsWith(LiteralStringRef(".lnk")); }), - files.end()); + files.erase(std::remove_if(files.begin(), + files.end(), + [](std::string const& f) { return StringRef(f).endsWith(".lnk"_sr); }), + files.end()); for (const auto& f : files) { // Hide .part or .temp files. StringRef s(f); - if (!s.endsWith(LiteralStringRef(".part")) && !s.endsWith(LiteralStringRef(".temp"))) + if (!s.endsWith(".part"_sr) && !s.endsWith(".temp"_sr)) results.push_back({ f.substr(m_path.size() + 1), ::fileSize(f) }); } @@ -227,10 +226,10 @@ Future> BackupContainerLocalDirectory::readFile(const std: throw file_not_found(); } - if (g_simulator.getCurrentProcess()->uid == UID()) { + if (g_simulator->getCurrentProcess()->uid == UID()) { TraceEvent(SevError, "BackupContainerReadFileOnUnsetProcessID").log(); } - std::string uniquePath = fullPath + "." + g_simulator.getCurrentProcess()->uid.toString() + ".lnk"; + std::string uniquePath = fullPath + "." 
+ g_simulator->getCurrentProcess()->uid.toString() + ".lnk"; unlink(uniquePath.c_str()); ASSERT(symlink(basename(path).c_str(), uniquePath.c_str()) == 0); fullPath = uniquePath; diff --git a/fdbclient/BlobCipher.cpp b/fdbclient/BlobCipher.cpp new file mode 100644 index 0000000000..09a8bbd5ee --- /dev/null +++ b/fdbclient/BlobCipher.cpp @@ -0,0 +1,1782 @@ +/* + * BlobCipher.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/BlobCipher.h" + +#include "fdbclient/Knobs.h" +#include "flow/Arena.h" +#include "flow/EncryptUtils.h" +#include "flow/Knobs.h" +#include "flow/Error.h" +#include "flow/FastRef.h" +#include "flow/IRandom.h" +#include "flow/ITrace.h" +#include "flow/Platform.h" +#include "flow/flow.h" +#include "flow/network.h" +#include "flow/Trace.h" +#include "flow/UnitTest.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifndef _WIN32 +#include +#else +#include +#endif + +#define BLOB_CIPHER_DEBUG false + +// BlobCipherMetrics methods + +BlobCipherMetrics::CounterSet::CounterSet(CounterCollection& cc, std::string name) + : encryptCPUTimeNS(name + "EncryptCPUTimeNS", cc), decryptCPUTimeNS(name + "DecryptCPUTimeNS", cc), + getCipherKeysLatency(name + "GetCipherKeysLatency", + UID(), + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE), + getLatestCipherKeysLatency(name + "GetLatestCipherKeysLatency", + UID(), + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE) {} + +BlobCipherMetrics::BlobCipherMetrics() + : cc("BlobCipher"), cipherKeyCacheHit("CipherKeyCacheHit", cc), cipherKeyCacheMiss("CipherKeyCacheMiss", cc), + cipherKeyCacheExpired("CipherKeyCacheExpired", cc), latestCipherKeyCacheHit("LatestCipherKeyCacheHit", cc), + latestCipherKeyCacheMiss("LatestCipherKeyCacheMiss", cc), + latestCipherKeyCacheNeedsRefresh("LatestCipherKeyCacheNeedsRefresh", cc), + getCipherKeysLatency("GetCipherKeysLatency", + UID(), + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE), + getLatestCipherKeysLatency("GetLatestCipherKeysLatency", + UID(), + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, + FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE), + counterSets({ CounterSet(cc, "TLog"), + CounterSet(cc, "KVMemory"), + CounterSet(cc, "KVRedwood"), + CounterSet(cc, "BlobGranule"), + CounterSet(cc, "Backup"), + CounterSet(cc, "Test") }) { + specialCounter(cc, "CacheSize", []() { return BlobCipherKeyCache::getInstance()->getSize(); }); + traceFuture = traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, &cc); +} + +// BlobCipherKey class methods + +BlobCipherKey::BlobCipherKey(const EncryptCipherDomainId& domainId, + const EncryptCipherBaseKeyId& baseCiphId, + const uint8_t* baseCiph, + int baseCiphLen, + 
const int64_t refreshAt, + const int64_t expireAt) { + // Salt generated is used while applying HMAC Key derivation, hence, not using crypto-secure hash algorithm is ok. + // Further, 'deterministic' salt generation is used to preserve simulation determinism properties. + EncryptCipherRandomSalt salt; + if (g_network->isSimulated()) { + salt = deterministicRandom()->randomUInt64(); + } else { + salt = nondeterministicRandom()->randomUInt64(); + } + + // Support two type of CipherKeys: 'revocable' & 'non-revocable' ciphers. + // In all cases, either cipherKey never expires i.e. refreshAt == infinite, or, refreshAt needs <= expireAt + // timestamp. + ASSERT(refreshAt == std::numeric_limits::max() || (refreshAt <= expireAt)); + + initKey(domainId, baseCiph, baseCiphLen, baseCiphId, salt, refreshAt, expireAt); +} + +BlobCipherKey::BlobCipherKey(const EncryptCipherDomainId& domainId, + const EncryptCipherBaseKeyId& baseCiphId, + const uint8_t* baseCiph, + int baseCiphLen, + const EncryptCipherRandomSalt& salt, + const int64_t refreshAt, + const int64_t expireAt) { + initKey(domainId, baseCiph, baseCiphLen, baseCiphId, salt, refreshAt, expireAt); +} + +void BlobCipherKey::initKey(const EncryptCipherDomainId& domainId, + const uint8_t* baseCiph, + int baseCiphLen, + const EncryptCipherBaseKeyId& baseCiphId, + const EncryptCipherRandomSalt& salt, + const int64_t refreshAt, + const int64_t expireAt) { + // Set the base encryption key properties + baseCipher = std::make_unique(AES_256_KEY_LENGTH); + memset(baseCipher.get(), 0, AES_256_KEY_LENGTH); + memcpy(baseCipher.get(), baseCiph, std::min(baseCiphLen, AES_256_KEY_LENGTH)); + baseCipherLen = baseCiphLen; + baseCipherId = baseCiphId; + // Set the encryption domain for the base encryption key + encryptDomainId = domainId; + randomSalt = salt; + // derive the encryption key + cipher = std::make_unique(AES_256_KEY_LENGTH); + memset(cipher.get(), 0, AES_256_KEY_LENGTH); + applyHmacSha256Derivation(); + // update cipher 'refresh' and 'expire' TS + refreshAtTS = refreshAt; + expireAtTS = expireAt; + +#if BLOB_CIPHER_DEBUG + TraceEvent(SevDebug, "BlobCipherKeyInit") + .detail("DomainId", domainId) + .detail("BaseCipherId", baseCipherId) + .detail("BaseCipherLen", baseCipherLen) + .detail("RandomSalt", randomSalt) + .detail("RefreshAt", refreshAtTS) + .detail("ExpireAtTS", expireAtTS); +#endif +} + +void BlobCipherKey::applyHmacSha256Derivation() { + Arena arena; + uint8_t buf[baseCipherLen + sizeof(EncryptCipherRandomSalt)]; + memcpy(&buf[0], baseCipher.get(), baseCipherLen); + memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(EncryptCipherRandomSalt)); + HmacSha256DigestGen hmacGen(baseCipher.get(), baseCipherLen); + unsigned int digestLen = hmacGen.digest( + { { &buf[0], baseCipherLen + sizeof(EncryptCipherRandomSalt) } }, cipher.get(), AUTH_TOKEN_HMAC_SHA_SIZE); + if (digestLen < AES_256_KEY_LENGTH) { + memcpy(cipher.get() + digestLen, buf, AES_256_KEY_LENGTH - digestLen); + } +} + +void BlobCipherKey::reset() { + memset(baseCipher.get(), 0, baseCipherLen); + memset(cipher.get(), 0, AES_256_KEY_LENGTH); +} + +// BlobKeyIdCache class methods + +BlobCipherKeyIdCache::BlobCipherKeyIdCache(EncryptCipherDomainId dId, size_t* sizeStat) + : domainId(dId), latestBaseCipherKeyId(), latestRandomSalt(), sizeStat(sizeStat) { + ASSERT(sizeStat != nullptr); + TraceEvent(SevInfo, "BlobCipherKeyIdCacheInit").detail("DomainId", domainId); +} + +BlobCipherKeyIdCacheKey BlobCipherKeyIdCache::getCacheKey(const EncryptCipherBaseKeyId& baseCipherKeyId, + const 
EncryptCipherRandomSalt& salt) { + if (baseCipherKeyId == INVALID_ENCRYPT_CIPHER_KEY_ID || salt == INVALID_ENCRYPT_RANDOM_SALT) { + throw encrypt_invalid_id(); + } + return std::make_pair(baseCipherKeyId, salt); +} + +Reference BlobCipherKeyIdCache::getLatestCipherKey() { + if (!latestBaseCipherKeyId.present()) { + return Reference(); + } + ASSERT_NE(latestBaseCipherKeyId.get(), INVALID_ENCRYPT_CIPHER_KEY_ID); + ASSERT(latestRandomSalt.present()); + ASSERT_NE(latestRandomSalt.get(), INVALID_ENCRYPT_RANDOM_SALT); + + return getCipherByBaseCipherId(latestBaseCipherKeyId.get(), latestRandomSalt.get()); +} + +Reference BlobCipherKeyIdCache::getCipherByBaseCipherId(const EncryptCipherBaseKeyId& baseCipherKeyId, + const EncryptCipherRandomSalt& salt) { + BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(getCacheKey(baseCipherKeyId, salt)); + if (itr == keyIdCache.end()) { + return Reference(); + } + return itr->second; +} + +Reference BlobCipherKeyIdCache::insertBaseCipherKey(const EncryptCipherBaseKeyId& baseCipherId, + const uint8_t* baseCipher, + int baseCipherLen, + const int64_t refreshAt, + const int64_t expireAt) { + ASSERT_GT(baseCipherId, INVALID_ENCRYPT_CIPHER_KEY_ID); + + // BaseCipherKeys are immutable, given the routine invocation updates 'latestCipher', + // ensure no key-tampering is done + Reference latestCipherKey = getLatestCipherKey(); + if (latestCipherKey.isValid() && latestCipherKey->getBaseCipherId() == baseCipherId) { + if (memcmp(latestCipherKey->rawBaseCipher(), baseCipher, baseCipherLen) == 0) { +#if BLOB_CIPHER_DEBUG + TraceEvent(SevDebug, "InsertBaseCipherKeyAlreadyPresent") + .detail("BaseCipherKeyId", baseCipherId) + .detail("DomainId", domainId); +#endif + + // Key is already present; nothing more to do. + return latestCipherKey; + } else { + TraceEvent(SevInfo, "BlobCipherUpdatetBaseCipherKey") + .detail("BaseCipherKeyId", baseCipherId) + .detail("DomainId", domainId); + throw encrypt_update_cipher(); + } + } + + TraceEvent(SevInfo, "BlobCipherKeyInsertBaseCipherKeyLatest") + .detail("DomainId", domainId) + .detail("BaseCipherId", baseCipherId) + .detail("RefreshAt", refreshAt) + .detail("ExpireAt", expireAt); + + Reference cipherKey = + makeReference(domainId, baseCipherId, baseCipher, baseCipherLen, refreshAt, expireAt); + BlobCipherKeyIdCacheKey cacheKey = getCacheKey(cipherKey->getBaseCipherId(), cipherKey->getSalt()); + keyIdCache.emplace(cacheKey, cipherKey); + + // Update the latest BaseCipherKeyId for the given encryption domain + latestBaseCipherKeyId = baseCipherId; + latestRandomSalt = cipherKey->getSalt(); + + (*sizeStat)++; + return cipherKey; +} + +Reference BlobCipherKeyIdCache::insertBaseCipherKey(const EncryptCipherBaseKeyId& baseCipherId, + const uint8_t* baseCipher, + int baseCipherLen, + const EncryptCipherRandomSalt& salt, + const int64_t refreshAt, + const int64_t expireAt) { + ASSERT_NE(baseCipherId, INVALID_ENCRYPT_CIPHER_KEY_ID); + ASSERT_NE(salt, INVALID_ENCRYPT_RANDOM_SALT); + + BlobCipherKeyIdCacheKey cacheKey = getCacheKey(baseCipherId, salt); + + // BaseCipherKeys are immutable, ensure that cached value doesn't get updated. + BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(cacheKey); + if (itr != keyIdCache.end()) { + if (memcmp(itr->second->rawBaseCipher(), baseCipher, baseCipherLen) == 0) { +#if BLOB_CIPHER_DEBUG + TraceEvent(SevDebug, "InsertBaseCipherKeyAlreadyPresent") + .detail("BaseCipherKeyId", baseCipherId) + .detail("DomainId", domainId); +#endif + + // Key is already present; nothing more to do. 
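+ // (The (baseCipherId, salt) pair is treated as immutable here: re-inserting identical
+ // key bytes is a harmless no-op, while different bytes for the same pair are rejected
+ // with encrypt_update_cipher below.)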
+ return itr->second; + } else { + TraceEvent(SevInfo, "BlobCipherUpdateBaseCipherKey") + .detail("BaseCipherKeyId", baseCipherId) + .detail("DomainId", domainId); + throw encrypt_update_cipher(); + } + } + + TraceEvent(SevInfo, "BlobCipherKeyInsertBaseCipherKey") + .detail("DomainId", domainId) + .detail("BaseCipherId", baseCipherId) + .detail("Salt", salt) + .detail("RefreshAt", refreshAt) + .detail("ExpireAt", expireAt); + + Reference cipherKey = + makeReference(domainId, baseCipherId, baseCipher, baseCipherLen, salt, refreshAt, expireAt); + keyIdCache.emplace(cacheKey, cipherKey); + (*sizeStat)++; + return cipherKey; +} + +void BlobCipherKeyIdCache::cleanup() { + for (auto& keyItr : keyIdCache) { + keyItr.second->reset(); + } + + keyIdCache.clear(); +} + +std::vector> BlobCipherKeyIdCache::getAllCipherKeys() { + std::vector> cipherKeys; + for (auto& keyItr : keyIdCache) { + cipherKeys.push_back(keyItr.second); + } + return cipherKeys; +} + +// BlobCipherKeyCache class methods + +Reference BlobCipherKeyCache::insertCipherKey(const EncryptCipherDomainId& domainId, + const EncryptCipherBaseKeyId& baseCipherId, + const uint8_t* baseCipher, + int baseCipherLen, + const int64_t refreshAt, + const int64_t expireAt) { + if (domainId == INVALID_ENCRYPT_DOMAIN_ID || baseCipherId == INVALID_ENCRYPT_CIPHER_KEY_ID) { + throw encrypt_invalid_id(); + } + + Reference cipherKey; + + try { + auto domainItr = domainCacheMap.find(domainId); + if (domainItr == domainCacheMap.end()) { + // Add mapping to track new encryption domain + Reference keyIdCache = makeReference(domainId, &size); + cipherKey = keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen, refreshAt, expireAt); + domainCacheMap.emplace(domainId, keyIdCache); + } else { + // Track new baseCipher keys + Reference keyIdCache = domainItr->second; + cipherKey = keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen, refreshAt, expireAt); + } + } catch (Error& e) { + TraceEvent(SevWarn, "BlobCipherInsertCipherKeyFailed") + .detail("BaseCipherKeyId", baseCipherId) + .detail("DomainId", domainId); + throw; + } + return cipherKey; +} + +Reference BlobCipherKeyCache::insertCipherKey(const EncryptCipherDomainId& domainId, + const EncryptCipherBaseKeyId& baseCipherId, + const uint8_t* baseCipher, + int baseCipherLen, + const EncryptCipherRandomSalt& salt, + const int64_t refreshAt, + const int64_t expireAt) { + if (domainId == INVALID_ENCRYPT_DOMAIN_ID || baseCipherId == INVALID_ENCRYPT_CIPHER_KEY_ID || + salt == INVALID_ENCRYPT_RANDOM_SALT) { + throw encrypt_invalid_id(); + } + + Reference cipherKey; + try { + auto domainItr = domainCacheMap.find(domainId); + if (domainItr == domainCacheMap.end()) { + // Add mapping to track new encryption domain + Reference keyIdCache = makeReference(domainId, &size); + cipherKey = + keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen, salt, refreshAt, expireAt); + domainCacheMap.emplace(domainId, keyIdCache); + } else { + // Track new baseCipher keys + Reference keyIdCache = domainItr->second; + cipherKey = + keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen, salt, refreshAt, expireAt); + } + } catch (Error& e) { + TraceEvent(SevWarn, "BlobCipherInsertCipherKey_Failed") + .detail("BaseCipherKeyId", baseCipherId) + .detail("DomainId", domainId) + .detail("Salt", salt); + throw; + } + return cipherKey; +} + +Reference BlobCipherKeyCache::getLatestCipherKey(const EncryptCipherDomainId& domainId) { + if (domainId == INVALID_ENCRYPT_DOMAIN_ID) { 
+ TraceEvent(SevWarn, "BlobCipherGetLatestCipherKeyInvalidID").detail("DomainId", domainId); + throw encrypt_invalid_id(); + } + auto domainItr = domainCacheMap.find(domainId); + if (domainItr == domainCacheMap.end()) { + TraceEvent(SevInfo, "BlobCipherGetLatestCipherKeyDomainNotFound").detail("DomainId", domainId); + return Reference(); + } + + Reference keyIdCache = domainItr->second; + Reference cipherKey = keyIdCache->getLatestCipherKey(); + + // Ensure 'freshness' guarantees for the latestCipher + if (cipherKey.isValid()) { + if (cipherKey->needsRefresh()) { +#if BLOB_CIPHER_DEBUG + TraceEvent("SevDebug, BlobCipherGetLatestNeedsRefresh") + .detail("DomainId", domainId) + .detail("Now", now()) + .detail("RefreshAt", cipherKey->getRefreshAtTS()); +#endif + ++BlobCipherMetrics::getInstance()->latestCipherKeyCacheNeedsRefresh; + return Reference(); + } + ++BlobCipherMetrics::getInstance()->latestCipherKeyCacheHit; + } else { + ++BlobCipherMetrics::getInstance()->latestCipherKeyCacheMiss; + } + + return cipherKey; +} + +Reference BlobCipherKeyCache::getCipherKey(const EncryptCipherDomainId& domainId, + const EncryptCipherBaseKeyId& baseCipherId, + const EncryptCipherRandomSalt& salt) { + auto domainItr = domainCacheMap.find(domainId); + if (domainItr == domainCacheMap.end()) { + return Reference(); + } + + Reference keyIdCache = domainItr->second; + Reference cipherKey = keyIdCache->getCipherByBaseCipherId(baseCipherId, salt); + + // Ensure 'liveness' guarantees for the cipher + if (cipherKey.isValid()) { + if (cipherKey->isExpired()) { +#if BLOB_CIPHER_DEBUG + TraceEvent(SevDebug, "BlobCipherGetCipherExpired") + .detail("DomainId", domainId) + .detail("BaseCipherId", baseCipherId) + .detail("Now", now()) + .detail("ExpireAt", cipherKey->getExpireAtTS()); +#endif + ++BlobCipherMetrics::getInstance()->cipherKeyCacheExpired; + return Reference(); + } + ++BlobCipherMetrics::getInstance()->cipherKeyCacheHit; + } else { + ++BlobCipherMetrics::getInstance()->cipherKeyCacheMiss; + } + + return cipherKey; +} + +void BlobCipherKeyCache::resetEncryptDomainId(const EncryptCipherDomainId domainId) { + auto domainItr = domainCacheMap.find(domainId); + if (domainItr == domainCacheMap.end()) { + return; + } + + Reference keyIdCache = domainItr->second; + ASSERT(keyIdCache->getSize() <= size); + size -= keyIdCache->getSize(); + keyIdCache->cleanup(); + TraceEvent(SevInfo, "BlobCipherResetEncryptDomainId").detail("DomainId", domainId); +} + +void BlobCipherKeyCache::cleanup() noexcept { + Reference instance = BlobCipherKeyCache::getInstance(); + + TraceEvent(SevInfo, "BlobCipherKeyCacheCleanup").log(); + + for (auto& domainItr : instance->domainCacheMap) { + Reference keyIdCache = domainItr.second; + keyIdCache->cleanup(); + TraceEvent(SevInfo, "BlobCipherKeyCacheCleanup").detail("DomainId", domainItr.first); + } + + instance->domainCacheMap.clear(); + instance->size = 0; +} + +std::vector> BlobCipherKeyCache::getAllCiphers(const EncryptCipherDomainId& domainId) { + auto domainItr = domainCacheMap.find(domainId); + if (domainItr == domainCacheMap.end()) { + return {}; + } + + Reference keyIdCache = domainItr->second; + return keyIdCache->getAllCipherKeys(); +} + +// EncryptBlobCipherAes265Ctr class methods + +EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference tCipherKey, + Reference hCipherKey, + const uint8_t* cipherIV, + const int ivLen, + const EncryptAuthTokenMode mode, + BlobCipherMetrics::UsageType usageType) + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), 
headerCipherKey(hCipherKey), authTokenMode(mode), + usageType(usageType) { + ASSERT_EQ(ivLen, AES_256_IV_LENGTH); + authTokenAlgo = getAuthTokenAlgoFromMode(authTokenMode); + memcpy(&iv[0], cipherIV, ivLen); + init(); +} + +EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference tCipherKey, + Reference hCipherKey, + const uint8_t* cipherIV, + const int ivLen, + const EncryptAuthTokenMode mode, + const EncryptAuthTokenAlgo algo, + BlobCipherMetrics::UsageType usageType) + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode), + authTokenAlgo(algo), usageType(usageType) { + ASSERT_EQ(ivLen, AES_256_IV_LENGTH); + memcpy(&iv[0], cipherIV, ivLen); + init(); +} + +EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference tCipherKey, + Reference hCipherKey, + const EncryptAuthTokenMode mode, + BlobCipherMetrics::UsageType usageType) + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode), + usageType(usageType) { + authTokenAlgo = getAuthTokenAlgoFromMode(authTokenMode); + deterministicRandom()->randomBytes(iv, AES_256_IV_LENGTH); + init(); +} + +EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference tCipherKey, + Reference hCipherKey, + const EncryptAuthTokenMode mode, + const EncryptAuthTokenAlgo algo, + BlobCipherMetrics::UsageType usageType) + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode), + usageType(usageType) { + deterministicRandom()->randomBytes(iv, AES_256_IV_LENGTH); + init(); +} + +void EncryptBlobCipherAes265Ctr::init() { + ASSERT(textCipherKey.isValid()); + ASSERT(headerCipherKey.isValid()); + + if (!isEncryptHeaderAuthTokenDetailsValid(authTokenMode, authTokenAlgo)) { + TraceEvent(SevWarn, "InvalidAuthTokenDetails") + .detail("TokenMode", authTokenMode) + .detail("TokenAlgo", authTokenAlgo); + throw internal_error(); + } + + if (ctx == nullptr) { + throw encrypt_ops_error(); + } + if (EVP_EncryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr) != 1) { + throw encrypt_ops_error(); + } + if (EVP_EncryptInit_ex(ctx, nullptr, nullptr, textCipherKey.getPtr()->data(), iv) != 1) { + throw encrypt_ops_error(); + } +} + +Reference EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plaintext, + const int plaintextLen, + BlobCipherEncryptHeader* header, + Arena& arena) { + double startTime = 0.0; + if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) { + startTime = timer_monotonic(); + } + + memset(reinterpret_cast(header), 0, sizeof(BlobCipherEncryptHeader)); + + // Alloc buffer computation accounts for 'header authentication' generation scheme. If single-auth-token needs + // to be generated, allocate buffer sufficient to append header to the cipherText optimizing memcpy cost. 
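+ // Sizing note (based on OpenSSL's documented EVP contract rather than anything specific
+ // to this code): AES-256 in CTR mode is a stream cipher, so the ciphertext is exactly
+ // plaintextLen bytes; the extra AES_BLOCK_SIZE is headroom because EVP_EncryptUpdate()
+ // may write up to (inl + cipher_block_size - 1) bytes into its output buffer. The
+ // buffer's logical size is trimmed back to plaintextLen once EVP_EncryptFinal_ex() has run.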
+ + const int allocSize = plaintextLen + AES_BLOCK_SIZE; + Reference encryptBuf = makeReference(allocSize, arena); + uint8_t* ciphertext = encryptBuf->begin(); + + int bytes{ 0 }; + if (EVP_EncryptUpdate(ctx, ciphertext, &bytes, plaintext, plaintextLen) != 1) { + TraceEvent(SevWarn, "BlobCipherEncryptUpdateFailed") + .detail("BaseCipherId", textCipherKey->getBaseCipherId()) + .detail("EncryptDomainId", textCipherKey->getDomainId()); + throw encrypt_ops_error(); + } + + int finalBytes{ 0 }; + if (EVP_EncryptFinal_ex(ctx, ciphertext + bytes, &finalBytes) != 1) { + TraceEvent(SevWarn, "BlobCipherEncryptFinalFailed") + .detail("BaseCipherId", textCipherKey->getBaseCipherId()) + .detail("EncryptDomainId", textCipherKey->getDomainId()); + throw encrypt_ops_error(); + } + + if ((bytes + finalBytes) != plaintextLen) { + TraceEvent(SevWarn, "BlobCipherEncryptUnexpectedCipherLen") + .detail("PlaintextLen", plaintextLen) + .detail("EncryptedBufLen", bytes + finalBytes); + throw encrypt_ops_error(); + } + + // Populate encryption header flags details + header->flags.size = sizeof(BlobCipherEncryptHeader); + header->flags.headerVersion = EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION; + header->flags.encryptMode = ENCRYPT_CIPHER_MODE_AES_256_CTR; + header->flags.authTokenMode = authTokenMode; + header->flags.authTokenAlgo = authTokenAlgo; + + // Ensure encryption header authToken details sanity + ASSERT(isEncryptHeaderAuthTokenDetailsValid(authTokenMode, authTokenAlgo)); + + // Populate cipherText encryption-key details + header->cipherTextDetails.baseCipherId = textCipherKey->getBaseCipherId(); + header->cipherTextDetails.encryptDomainId = textCipherKey->getDomainId(); + header->cipherTextDetails.salt = textCipherKey->getSalt(); + // Populate header encryption-key details + // TODO: HeaderCipherKey is not necessary if AuthTokenMode == NONE + header->cipherHeaderDetails.encryptDomainId = headerCipherKey->getDomainId(); + header->cipherHeaderDetails.baseCipherId = headerCipherKey->getBaseCipherId(); + header->cipherHeaderDetails.salt = headerCipherKey->getSalt(); + + memcpy(&header->iv[0], &iv[0], AES_256_IV_LENGTH); + + if (authTokenMode == EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { + // No header 'authToken' generation needed. 
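+ // For reference, the three header-authentication shapes handled by this if/else (as
+ // implemented in the branches that follow): NONE leaves the token fields zeroed;
+ // SINGLE computes one token over {ciphertext, header} keyed with the header cipher;
+ // MULTI computes a ciphertext token keyed with the text cipher's salt plus a separate
+ // header token keyed with the header cipher.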
+ } else { + + // Populate header authToken details + if (header->flags.authTokenMode == EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) { + ASSERT_GE(allocSize, (bytes + finalBytes)); + ASSERT_GE(encryptBuf->getLogicalSize(), (bytes + finalBytes)); + + computeAuthToken({ { ciphertext, bytes + finalBytes }, + { reinterpret_cast(header), sizeof(BlobCipherEncryptHeader) } }, + headerCipherKey->rawCipher(), + AES_256_KEY_LENGTH, + &header->singleAuthToken.authToken[0], + (EncryptAuthTokenAlgo)header->flags.authTokenAlgo, + AUTH_TOKEN_MAX_SIZE); + } else { + ASSERT_EQ(header->flags.authTokenMode, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + + // TOOD: Use HMAC_SHA encyrption authentication scheme as AES_CMAC needs minimum 16 bytes cipher key + computeAuthToken({ { ciphertext, bytes + finalBytes } }, + reinterpret_cast(&header->cipherTextDetails.salt), + sizeof(EncryptCipherRandomSalt), + &header->multiAuthTokens.cipherTextAuthToken[0], + EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, + AUTH_TOKEN_MAX_SIZE); + computeAuthToken({ { reinterpret_cast(header), sizeof(BlobCipherEncryptHeader) } }, + headerCipherKey->rawCipher(), + AES_256_KEY_LENGTH, + &header->multiAuthTokens.headerAuthToken[0], + (EncryptAuthTokenAlgo)header->flags.authTokenAlgo, + AUTH_TOKEN_MAX_SIZE); + } + } + + encryptBuf->setLogicalSize(plaintextLen); + + if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) { + BlobCipherMetrics::counters(usageType).encryptCPUTimeNS += int64_t((timer_monotonic() - startTime) * 1e9); + } + + CODE_PROBE(true, "BlobCipher data encryption"); + CODE_PROBE(header->flags.authTokenAlgo == EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE, + "Encryption authentication disabled"); + CODE_PROBE(header->flags.authTokenAlgo == EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, + "HMAC_SHA Auth token generation"); + CODE_PROBE(header->flags.authTokenAlgo == EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC, + "AES_CMAC Auth token generation"); + + return encryptBuf; +} + +EncryptBlobCipherAes265Ctr::~EncryptBlobCipherAes265Ctr() { + if (ctx != nullptr) { + EVP_CIPHER_CTX_free(ctx); + } +} + +// DecryptBlobCipherAes256Ctr class methods + +DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference tCipherKey, + Reference hCipherKey, + const uint8_t* iv, + BlobCipherMetrics::UsageType usageType) + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), + headerAuthTokenValidationDone(false), authTokensValidationDone(false), usageType(usageType) { + if (ctx == nullptr) { + throw encrypt_ops_error(); + } + if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr)) { + throw encrypt_ops_error(); + } + if (!EVP_DecryptInit_ex(ctx, nullptr, nullptr, tCipherKey.getPtr()->data(), iv)) { + throw encrypt_ops_error(); + } +} + +void DecryptBlobCipherAes256Ctr::verifyHeaderAuthToken(const BlobCipherEncryptHeader& header, Arena& arena) { + if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI) { + // NoneAuthToken mode; no authToken is generated; nothing to do + // SingleAuthToken mode; verification will happen as part of decryption. 
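+ // (The MULTI-mode header token is computed over the header bytes alone, so it can be
+ // checked here without reading any ciphertext; the SINGLE-mode token mixes the
+ // ciphertext into the digest, which is why its check is deferred to verifyAuthTokens()
+ // during decrypt().)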
+ return; + } + + ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + ASSERT(isEncryptHeaderAuthTokenAlgoValid((EncryptAuthTokenAlgo)header.flags.authTokenAlgo)); + + BlobCipherEncryptHeader headerCopy; + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + memset(reinterpret_cast(&headerCopy.multiAuthTokens.headerAuthToken), 0, AUTH_TOKEN_MAX_SIZE); + uint8_t computedHeaderAuthToken[AUTH_TOKEN_MAX_SIZE]{}; + computeAuthToken({ { reinterpret_cast(&headerCopy), sizeof(BlobCipherEncryptHeader) } }, + headerCipherKey->rawCipher(), + AES_256_KEY_LENGTH, + &computedHeaderAuthToken[0], + (EncryptAuthTokenAlgo)header.flags.authTokenAlgo, + AUTH_TOKEN_MAX_SIZE); + + int authTokenSize = getEncryptHeaderAuthTokenSize(header.flags.authTokenAlgo); + ASSERT_LE(authTokenSize, AUTH_TOKEN_MAX_SIZE); + if (memcmp(&header.multiAuthTokens.headerAuthToken[0], &computedHeaderAuthToken[0], authTokenSize) != 0) { + TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeaderAuthTokenMismatch") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderMode", header.flags.encryptMode) + .detail("MultiAuthHeaderAuthToken", + StringRef(arena, &header.multiAuthTokens.headerAuthToken[0], AUTH_TOKEN_MAX_SIZE).toString()) + .detail("ComputedHeaderAuthToken", StringRef(computedHeaderAuthToken, AUTH_TOKEN_MAX_SIZE)); + throw encrypt_header_authtoken_mismatch(); + } + + headerAuthTokenValidationDone = true; +} + +void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciphertext, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + Arena& arena) { + // Header authToken not set for single auth-token mode. + ASSERT(!headerAuthTokenValidationDone); + + // prepare the payload {cipherText + encryptionHeader} + // ensure the 'authToken' is reset before computing the 'authentication token' + BlobCipherEncryptHeader headerCopy; + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + memset(reinterpret_cast(&headerCopy.singleAuthToken), 0, 2 * AUTH_TOKEN_MAX_SIZE); + uint8_t computed[AUTH_TOKEN_MAX_SIZE]; + computeAuthToken({ { ciphertext, ciphertextLen }, + { reinterpret_cast(&headerCopy), sizeof(BlobCipherEncryptHeader) } }, + headerCipherKey->rawCipher(), + AES_256_KEY_LENGTH, + &computed[0], + (EncryptAuthTokenAlgo)header.flags.authTokenAlgo, + AUTH_TOKEN_MAX_SIZE); + + int authTokenSize = getEncryptHeaderAuthTokenSize(header.flags.authTokenAlgo); + ASSERT_LE(authTokenSize, AUTH_TOKEN_MAX_SIZE); + if (memcmp(&header.singleAuthToken.authToken[0], &computed[0], authTokenSize) != 0) { + TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeaderAuthTokenMismatch") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderMode", header.flags.encryptMode) + .detail("SingleAuthToken", + StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_MAX_SIZE).toString()) + .detail("ComputedSingleAuthToken", StringRef(computed, AUTH_TOKEN_MAX_SIZE)); + throw encrypt_header_authtoken_mismatch(); + } +} + +void DecryptBlobCipherAes256Ctr::verifyHeaderMultiAuthToken(const uint8_t* ciphertext, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + Arena& arena) { + if (!headerAuthTokenValidationDone) { + verifyHeaderAuthToken(header, arena); + } + uint8_t computedCipherTextAuthToken[AUTH_TOKEN_MAX_SIZE]; + // TOOD: Use HMAC_SHA encyrption authentication scheme as AES_CMAC needs minimum 16 bytes cipher key + computeAuthToken({ { ciphertext, 
ciphertextLen } }, + reinterpret_cast(&header.cipherTextDetails.salt), + sizeof(EncryptCipherRandomSalt), + &computedCipherTextAuthToken[0], + EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, + AUTH_TOKEN_MAX_SIZE); + if (memcmp(&header.multiAuthTokens.cipherTextAuthToken[0], &computedCipherTextAuthToken[0], AUTH_TOKEN_MAX_SIZE) != + 0) { + TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeaderAuthTokenMismatch") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderMode", header.flags.encryptMode) + .detail("MultiAuthCipherTextAuthToken", + StringRef(arena, &header.multiAuthTokens.cipherTextAuthToken[0], AUTH_TOKEN_MAX_SIZE).toString()) + .detail("ComputedCipherTextAuthToken", StringRef(computedCipherTextAuthToken, AUTH_TOKEN_MAX_SIZE)); + throw encrypt_header_authtoken_mismatch(); + } +} + +void DecryptBlobCipherAes256Ctr::verifyAuthTokens(const uint8_t* ciphertext, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + Arena& arena) { + if (header.flags.authTokenMode == EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) { + verifyHeaderSingleAuthToken(ciphertext, ciphertextLen, header, arena); + } else { + ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + verifyHeaderMultiAuthToken(ciphertext, ciphertextLen, header, arena); + } + + authTokensValidationDone = true; +} + +void DecryptBlobCipherAes256Ctr::verifyEncryptHeaderMetadata(const BlobCipherEncryptHeader& header) { + // validate header flag sanity + if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION || + header.flags.encryptMode != EncryptCipherMode::ENCRYPT_CIPHER_MODE_AES_256_CTR || + !isEncryptHeaderAuthTokenModeValid((EncryptAuthTokenMode)header.flags.authTokenMode)) { + TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeader") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION) + .detail("EncryptCipherMode", header.flags.encryptMode) + .detail("ExpectedCipherMode", EncryptCipherMode::ENCRYPT_CIPHER_MODE_AES_256_CTR) + .detail("EncryptHeaderAuthTokenMode", header.flags.authTokenMode); + throw encrypt_header_metadata_mismatch(); + } +} + +Reference DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphertext, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + Arena& arena) { + double startTime = 0.0; + if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) { + startTime = timer_monotonic(); + } + + verifyEncryptHeaderMetadata(header); + + if (header.flags.authTokenMode != EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && + !headerCipherKey.isValid()) { + TraceEvent(SevWarn, "BlobCipherDecryptInvalidHeaderCipherKey") + .detail("AuthTokenMode", header.flags.authTokenMode); + throw encrypt_ops_error(); + } + + const int allocSize = ciphertextLen + AES_BLOCK_SIZE; + Reference decrypted = makeReference(allocSize, arena); + + if (header.flags.authTokenMode != EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { + verifyAuthTokens(ciphertext, ciphertextLen, header, arena); + ASSERT(authTokensValidationDone); + } + + uint8_t* plaintext = decrypted->begin(); + int bytesDecrypted{ 0 }; + if (!EVP_DecryptUpdate(ctx, plaintext, &bytesDecrypted, ciphertext, ciphertextLen)) { + TraceEvent(SevWarn, "BlobCipherDecryptUpdateFailed") + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId); + throw encrypt_ops_error(); + } + + int 
finalBlobBytes{ 0 }; + if (EVP_DecryptFinal_ex(ctx, plaintext + bytesDecrypted, &finalBlobBytes) <= 0) { + TraceEvent(SevWarn, "BlobCipherDecryptFinalFailed") + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId); + throw encrypt_ops_error(); + } + + if ((bytesDecrypted + finalBlobBytes) != ciphertextLen) { + TraceEvent(SevWarn, "BlobCipherEncryptUnexpectedPlaintextLen") + .detail("CiphertextLen", ciphertextLen) + .detail("DecryptedBufLen", bytesDecrypted + finalBlobBytes); + throw encrypt_ops_error(); + } + + decrypted->setLogicalSize(ciphertextLen); + + if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) { + BlobCipherMetrics::counters(usageType).decryptCPUTimeNS += int64_t((timer_monotonic() - startTime) * 1e9); + } + + CODE_PROBE(true, "BlobCipher data decryption"); + CODE_PROBE(header.flags.authTokenAlgo == EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE, + "Decryption authentication disabled"); + CODE_PROBE(header.flags.authTokenAlgo == EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, + "Decryption HMAC_SHA Auth token verification"); + CODE_PROBE(header.flags.authTokenAlgo == EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC, + "Decryption AES_CMAC Auth token verification"); + + return decrypted; +} + +DecryptBlobCipherAes256Ctr::~DecryptBlobCipherAes256Ctr() { + if (ctx != nullptr) { + EVP_CIPHER_CTX_free(ctx); + } +} + +// HmacSha256DigestGen class methods + +HmacSha256DigestGen::HmacSha256DigestGen(const unsigned char* key, size_t len) : ctx(HMAC_CTX_new()) { + if (!HMAC_Init_ex(ctx, key, len, EVP_sha256(), nullptr)) { + throw encrypt_ops_error(); + } +} + +HmacSha256DigestGen::~HmacSha256DigestGen() { + if (ctx != nullptr) { + HMAC_CTX_free(ctx); + } +} + +unsigned int HmacSha256DigestGen::digest(const std::vector>& payload, + unsigned char* buf, + unsigned int bufLen) { + ASSERT_EQ(bufLen, HMAC_size(ctx)); + + for (const auto& p : payload) { + if (HMAC_Update(ctx, p.first, p.second) != 1) { + throw encrypt_ops_error(); + } + } + + unsigned int digestLen = 0; + if (HMAC_Final(ctx, buf, &digestLen) != 1) { + throw encrypt_ops_error(); + } + + CODE_PROBE(true, "HMAC_SHA Digest generation"); + + return digestLen; +} + +// Aes256CtrCmacDigestGen methods +Aes256CmacDigestGen::Aes256CmacDigestGen(const unsigned char* key, size_t keylen) : ctx(CMAC_CTX_new()) { + ASSERT_EQ(keylen, AES_256_KEY_LENGTH); + + if (ctx == nullptr) { + throw encrypt_ops_error(); + } + if (!CMAC_Init(ctx, key, keylen, EVP_aes_256_cbc(), NULL)) { + throw encrypt_ops_error(); + } +} + +size_t Aes256CmacDigestGen::digest(const std::vector>& payload, + uint8_t* digest, + int digestlen) { + ASSERT(ctx != nullptr); + ASSERT_GE(digestlen, AUTH_TOKEN_AES_CMAC_SIZE); + + for (const auto& p : payload) { + if (!CMAC_Update(ctx, p.first, p.second)) { + throw encrypt_ops_error(); + } + } + size_t ret; + if (!CMAC_Final(ctx, digest, &ret)) { + throw encrypt_ops_error(); + } + + return ret; +} + +Aes256CmacDigestGen::~Aes256CmacDigestGen() { + if (ctx != nullptr) { + CMAC_CTX_free(ctx); + } +} + +void computeAuthToken(const std::vector>& payload, + const uint8_t* key, + const int keyLen, + unsigned char* digestBuf, + const EncryptAuthTokenAlgo algo, + unsigned int digestBufMaxSz) { + ASSERT_EQ(digestBufMaxSz, AUTH_TOKEN_MAX_SIZE); + ASSERT(isEncryptHeaderAuthTokenAlgoValid(algo)); + + int authTokenSz = getEncryptHeaderAuthTokenSize(algo); + ASSERT_LE(authTokenSz, AUTH_TOKEN_MAX_SIZE); + + if (algo == 
EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA) { + ASSERT_EQ(authTokenSz, AUTH_TOKEN_HMAC_SHA_SIZE); + + HmacSha256DigestGen hmacGenerator(key, keyLen); + unsigned int digestLen = hmacGenerator.digest(payload, digestBuf, authTokenSz); + + ASSERT_EQ(digestLen, authTokenSz); + } else if (algo == EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC) { + ASSERT_EQ(authTokenSz, AUTH_TOKEN_AES_CMAC_SIZE); + ASSERT_EQ(keyLen, AES_256_KEY_LENGTH); + + Aes256CmacDigestGen cmacGenerator(key, keyLen); + size_t digestLen = cmacGenerator.digest(payload, digestBuf, authTokenSz); + + ASSERT_EQ(digestLen, authTokenSz); + } else { + throw not_implemented(); + } +} + +EncryptAuthTokenMode getEncryptAuthTokenMode(const EncryptAuthTokenMode mode) { + // Override mode if authToken isn't enabled + return FLOW_KNOBS->ENCRYPT_HEADER_AUTH_TOKEN_ENABLED ? mode + : EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE; +} + +// Only used to link unit tests +void forceLinkBlobCipherTests() {} + +// Tests cases includes: +// 1. Populate cache by inserting 'baseCipher' details for new encryptionDomainIds +// 2. Random lookup for cipherKeys and content validation +// 3. Inserting of 'identical' cipherKey (already cached) more than once works as desired. +// 4. Inserting of 'non-identical' cipherKey (already cached) more than once works as desired. +// 5. Validation encryption ops (correctness): +// 5.1. Encrypt a buffer followed by decryption of the buffer, validate the contents. +// 5.2. Simulate anomalies such as: EncryptionHeader corruption, authToken mismatch / encryptionMode mismatch etc. +// 6. Cache cleanup +// 6.1 cleanup cipherKeys by given encryptDomainId +// 6.2. Cleanup all cached cipherKeys +TEST_CASE("flow/BlobCipher") { + TraceEvent("BlobCipherTestStart").log(); + + // Construct a dummy External Key Manager representation and populate with some keys + class BaseCipher : public ReferenceCounted, NonCopyable { + public: + EncryptCipherDomainId domainId; + int len; + EncryptCipherBaseKeyId keyId; + std::unique_ptr key; + int64_t refreshAt; + int64_t expireAt; + EncryptCipherRandomSalt generatedSalt; + + BaseCipher(const EncryptCipherDomainId& dId, + const EncryptCipherBaseKeyId& kId, + const int64_t rAt, + const int64_t eAt) + : domainId(dId), len(deterministicRandom()->randomInt(AES_256_KEY_LENGTH / 2, AES_256_KEY_LENGTH + 1)), + keyId(kId), key(std::make_unique(len)), refreshAt(rAt), expireAt(eAt) { + deterministicRandom()->randomBytes(key.get(), len); + } + }; + + using BaseKeyMap = std::unordered_map>; + using DomainKeyMap = std::unordered_map; + DomainKeyMap domainKeyMap; + const EncryptCipherDomainId minDomainId = 1; + const EncryptCipherDomainId maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5; + const EncryptCipherBaseKeyId minBaseCipherKeyId = 100; + const EncryptCipherBaseKeyId maxBaseCipherKeyId = + deterministicRandom()->randomInt(minBaseCipherKeyId, minBaseCipherKeyId + 50) + 15; + for (int dId = minDomainId; dId <= maxDomainId; dId++) { + for (int kId = minBaseCipherKeyId; kId <= maxBaseCipherKeyId; kId++) { + domainKeyMap[dId].emplace( + kId, + makeReference( + dId, kId, std::numeric_limits::max(), std::numeric_limits::max())); + } + } + ASSERT_EQ(domainKeyMap.size(), maxDomainId); + + Reference cipherKeyCache = BlobCipherKeyCache::getInstance(); + + // validate getLatestCipherKey return empty when there's no cipher key + TraceEvent("BlobCipherTestLatestKeyNotExists").log(); + Reference latestKeyNonexists = + 
cipherKeyCache->getLatestCipherKey(deterministicRandom()->randomInt(minDomainId, maxDomainId)); + ASSERT(!latestKeyNonexists.isValid()); + try { + cipherKeyCache->getLatestCipherKey(INVALID_ENCRYPT_DOMAIN_ID); + ASSERT(false); // shouldn't get here + } catch (Error& e) { + ASSERT_EQ(e.code(), error_code_encrypt_invalid_id); + } + + // insert BlobCipher keys into BlobCipherKeyCache map and validate + TraceEvent("BlobCipherTestInsertKeys").log(); + for (auto& domainItr : domainKeyMap) { + for (auto& baseKeyItr : domainItr.second) { + Reference baseCipher = baseKeyItr.second; + + cipherKeyCache->insertCipherKey(baseCipher->domainId, + baseCipher->keyId, + baseCipher->key.get(), + baseCipher->len, + baseCipher->refreshAt, + baseCipher->expireAt); + Reference fetchedKey = cipherKeyCache->getLatestCipherKey(baseCipher->domainId); + baseCipher->generatedSalt = fetchedKey->getSalt(); + } + } + // insert EncryptHeader BlobCipher key + Reference headerBaseCipher = makeReference( + ENCRYPT_HEADER_DOMAIN_ID, 1, std::numeric_limits::max(), std::numeric_limits::max()); + cipherKeyCache->insertCipherKey(headerBaseCipher->domainId, + headerBaseCipher->keyId, + headerBaseCipher->key.get(), + headerBaseCipher->len, + headerBaseCipher->refreshAt, + headerBaseCipher->expireAt); + + TraceEvent("BlobCipherTestInsertKeysDone").log(); + + // validate the cipherKey lookups work as desired + for (auto& domainItr : domainKeyMap) { + for (auto& baseKeyItr : domainItr.second) { + Reference baseCipher = baseKeyItr.second; + Reference cipherKey = + cipherKeyCache->getCipherKey(baseCipher->domainId, baseCipher->keyId, baseCipher->generatedSalt); + ASSERT(cipherKey.isValid()); + // validate common cipher properties - domainId, baseCipherId, baseCipherLen, rawBaseCipher + ASSERT_EQ(cipherKey->getBaseCipherId(), baseCipher->keyId); + ASSERT_EQ(cipherKey->getDomainId(), baseCipher->domainId); + ASSERT_EQ(cipherKey->getBaseCipherLen(), baseCipher->len); + // ensure that baseCipher matches with the cached information + ASSERT_EQ(std::memcmp(cipherKey->rawBaseCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()), 0); + // validate the encryption derivation + ASSERT_NE(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()), 0); + } + } + TraceEvent("BlobCipherTestLooksupDone").log(); + + // Ensure attemtping to insert existing cipherKey (identical) more than once is treated as a NOP + try { + Reference baseCipher = domainKeyMap[minDomainId][minBaseCipherKeyId]; + cipherKeyCache->insertCipherKey(baseCipher->domainId, + baseCipher->keyId, + baseCipher->key.get(), + baseCipher->len, + std::numeric_limits::max(), + std::numeric_limits::max()); + } catch (Error& e) { + throw; + } + TraceEvent("BlobCipherTestReinsertIdempotentKeyDone").log(); + + // Ensure attemtping to insert an existing cipherKey (modified) fails with appropriate error + try { + Reference baseCipher = domainKeyMap[minDomainId][minBaseCipherKeyId]; + uint8_t rawCipher[baseCipher->len]; + memcpy(rawCipher, baseCipher->key.get(), baseCipher->len); + // modify few bytes in the cipherKey + for (int i = 2; i < 5; i++) { + rawCipher[i]++; + } + cipherKeyCache->insertCipherKey(baseCipher->domainId, + baseCipher->keyId, + &rawCipher[0], + baseCipher->len, + std::numeric_limits::max(), + std::numeric_limits::max()); + } catch (Error& e) { + if (e.code() != error_code_encrypt_update_cipher) { + throw; + } + } + TraceEvent("BlobCipherTestReinsertNonIdempotentKeyDone").log(); + + // Validate Encryption ops + Reference cipherKey = 
cipherKeyCache->getLatestCipherKey(minDomainId); + Reference headerCipherKey = cipherKeyCache->getLatestCipherKey(ENCRYPT_HEADER_DOMAIN_ID); + const int bufLen = deterministicRandom()->randomInt(786, 2127) + 512; + uint8_t orgData[bufLen]; + deterministicRandom()->randomBytes(&orgData[0], bufLen); + + Arena arena; + uint8_t iv[AES_256_IV_LENGTH]; + deterministicRandom()->randomBytes(&iv[0], AES_256_IV_LENGTH); + + BlobCipherEncryptHeader headerCopy; + // validate basic encrypt followed by decrypt operation for AUTH_MODE_NONE + { + TraceEvent("NoneAuthModeStart"); + + EncryptBlobCipherAes265Ctr encryptor(cipherKey, + headerCipherKey, + iv, + AES_256_IV_LENGTH, + EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE, + BlobCipherMetrics::TEST); + BlobCipherEncryptHeader header; + Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + + ASSERT_EQ(encrypted->getLogicalSize(), bufLen); + ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0); + ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT_EQ(header.flags.encryptMode, EncryptCipherMode::ENCRYPT_CIPHER_MODE_AES_256_CTR); + ASSERT_EQ(header.flags.authTokenMode, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE); + + TraceEvent("BlobCipherTestEncryptDone") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("HeaderEncryptAuthTokenMode", header.flags.authTokenMode) + .detail("HeaderEncryptAuthTokenAlgo", header.flags.authTokenAlgo) + .detail("DomainId", header.cipherTextDetails.encryptDomainId) + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId); + + Reference tCipherKeyKey = cipherKeyCache->getCipherKey(header.cipherTextDetails.encryptDomainId, + header.cipherTextDetails.baseCipherId, + header.cipherTextDetails.salt); + ASSERT(tCipherKeyKey->isEqual(cipherKey)); + DecryptBlobCipherAes256Ctr decryptor( + tCipherKeyKey, Reference(), &header.iv[0], BlobCipherMetrics::TEST); + Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + + ASSERT_EQ(decrypted->getLogicalSize(), bufLen); + ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0); + + TraceEvent("BlobCipherTestDecryptDone"); + + // induce encryption header corruption - headerVersion corrupted + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.headerVersion += 1; + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + DecryptBlobCipherAes256Ctr decryptor( + tCipherKeyKey, Reference(), header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - encryptionMode corrupted + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.encryptMode += 1; + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + DecryptBlobCipherAes256Ctr decryptor( + tCipherKeyKey, Reference(), header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encrypted buffer payload 
corruption + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + uint8_t temp[bufLen]; + memcpy(encrypted->begin(), &temp[0], bufLen); + int tIdx = deterministicRandom()->randomInt(0, bufLen - 1); + temp[tIdx] += 1; + DecryptBlobCipherAes256Ctr decryptor( + tCipherKeyKey, Reference(), header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena); + } catch (Error& e) { + // No authToken, hence, no corruption detection supported + ASSERT(false); + } + + TraceEvent("NoneAuthModeDone"); + } + + // validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_SINGLE + // HMAC_SHA authToken algorithm + { + TraceEvent("SingleAuthModeHmacShaStart").log(); + + EncryptBlobCipherAes265Ctr encryptor(cipherKey, + headerCipherKey, + iv, + AES_256_IV_LENGTH, + EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE, + EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, + BlobCipherMetrics::TEST); + BlobCipherEncryptHeader header; + Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + + ASSERT_EQ(encrypted->getLogicalSize(), bufLen); + ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0); + ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR); + ASSERT_EQ(header.flags.authTokenMode, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + ASSERT_EQ(header.flags.authTokenAlgo, EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA); + + TraceEvent("BlobCipherTestEncryptDone") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("HeaderEncryptAuthTokenMode", header.flags.authTokenMode) + .detail("HeaderEncryptAuthTokenAlgo", header.flags.authTokenAlgo) + .detail("DomainId", header.cipherTextDetails.encryptDomainId) + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("HeaderAuthToken", + StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_HMAC_SHA_SIZE).toString()); + + Reference tCipherKeyKey = cipherKeyCache->getCipherKey(header.cipherTextDetails.encryptDomainId, + header.cipherTextDetails.baseCipherId, + header.cipherTextDetails.salt); + Reference hCipherKey = cipherKeyCache->getCipherKey(header.cipherHeaderDetails.encryptDomainId, + header.cipherHeaderDetails.baseCipherId, + header.cipherHeaderDetails.salt); + ASSERT(tCipherKeyKey->isEqual(cipherKey)); + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + + ASSERT_EQ(decrypted->getLogicalSize(), bufLen); + ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0); + + TraceEvent("BlobCipherTestDecryptDone"); + + // induce encryption header corruption - headerVersion corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.headerVersion += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - 
encryptionMode corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.encryptMode += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - authToken mismatch + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_HMAC_SHA_SIZE - 1); + headerCopy.singleAuthToken.authToken[hIdx] += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + // induce encrypted buffer payload corruption + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + uint8_t temp[bufLen]; + memcpy(encrypted->begin(), &temp[0], bufLen); + int tIdx = deterministicRandom()->randomInt(0, bufLen - 1); + temp[tIdx] += 1; + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena); + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + TraceEvent("SingleAuthModeHmacShaDone"); + } + // AES_CMAC authToken algorithm + { + TraceEvent("SingleAuthModeAesCMacStart").log(); + + EncryptBlobCipherAes265Ctr encryptor(cipherKey, + headerCipherKey, + iv, + AES_256_IV_LENGTH, + EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE, + EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC, + BlobCipherMetrics::TEST); + BlobCipherEncryptHeader header; + Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + + ASSERT_EQ(encrypted->getLogicalSize(), bufLen); + ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0); + ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR); + ASSERT_EQ(header.flags.authTokenMode, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + ASSERT_EQ(header.flags.authTokenAlgo, EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC); + + TraceEvent("BlobCipherTestEncryptDone") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("HeaderEncryptAuthTokenMode", header.flags.authTokenMode) + .detail("HeaderEncryptAuthTokenAlgo", header.flags.authTokenAlgo) + .detail("DomainId", header.cipherTextDetails.encryptDomainId) + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("HeaderAuthToken", + StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_AES_CMAC_SIZE).toString()); + + Reference tCipherKeyKey = cipherKeyCache->getCipherKey(header.cipherTextDetails.encryptDomainId, + header.cipherTextDetails.baseCipherId, + header.cipherTextDetails.salt); + Reference 
hCipherKey = cipherKeyCache->getCipherKey(header.cipherHeaderDetails.encryptDomainId, + header.cipherHeaderDetails.baseCipherId, + header.cipherHeaderDetails.salt); + ASSERT(tCipherKeyKey->isEqual(cipherKey)); + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + + ASSERT_EQ(decrypted->getLogicalSize(), bufLen); + ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0); + + TraceEvent("BlobCipherTestDecryptDone").log(); + + // induce encryption header corruption - headerVersion corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.headerVersion += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - encryptionMode corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.encryptMode += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - authToken mismatch + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_AES_CMAC_SIZE - 1); + headerCopy.singleAuthToken.authToken[hIdx] += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + // induce encrypted buffer payload corruption + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + uint8_t temp[bufLen]; + memcpy(encrypted->begin(), &temp[0], bufLen); + int tIdx = deterministicRandom()->randomInt(0, bufLen - 1); + temp[tIdx] += 1; + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena); + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + TraceEvent("SingleAuthModeAesCmacDone"); + } + + // validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_MULTI + // HMAC_SHA authToken algorithm + { + TraceEvent("MultiAuthModeHmacShaStart").log(); + + EncryptBlobCipherAes265Ctr encryptor(cipherKey, + headerCipherKey, + iv, + AES_256_IV_LENGTH, + EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI, + EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, + BlobCipherMetrics::TEST); + BlobCipherEncryptHeader 
header; + Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + + ASSERT_EQ(encrypted->getLogicalSize(), bufLen); + ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0); + ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR); + ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + ASSERT_EQ(header.flags.authTokenAlgo, EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA); + + TraceEvent("BlobCipherTestEncryptDone") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("HeaderEncryptAuthTokenMode", header.flags.authTokenMode) + .detail("HeaderEncryptAuthTokenAlgo", header.flags.authTokenAlgo) + .detail("DomainId", header.cipherTextDetails.encryptDomainId) + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("HeaderAuthToken", + StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_HMAC_SHA_SIZE).toString()); + + Reference tCipherKey = cipherKeyCache->getCipherKey(header.cipherTextDetails.encryptDomainId, + header.cipherTextDetails.baseCipherId, + header.cipherTextDetails.salt); + Reference hCipherKey = cipherKeyCache->getCipherKey(header.cipherHeaderDetails.encryptDomainId, + header.cipherHeaderDetails.baseCipherId, + header.cipherHeaderDetails.salt); + + ASSERT(tCipherKey->isEqual(cipherKey)); + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + + ASSERT_EQ(decrypted->getLogicalSize(), bufLen); + ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0); + + TraceEvent("BlobCipherTestDecryptDone").log(); + + // induce encryption header corruption - headerVersion corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.headerVersion += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - encryptionMode corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.encryptMode += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - cipherText authToken mismatch + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_HMAC_SHA_SIZE - 1); + headerCopy.multiAuthTokens.cipherTextAuthToken[hIdx] += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, 
BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + // induce encryption header corruption - header authToken mismatch + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_HMAC_SHA_SIZE - 1); + headerCopy.multiAuthTokens.headerAuthToken[hIdx] += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + uint8_t temp[bufLen]; + memcpy(encrypted->begin(), &temp[0], bufLen); + int tIdx = deterministicRandom()->randomInt(0, bufLen - 1); + temp[tIdx] += 1; + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena); + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + TraceEvent("MultiAuthModeHmacShaDone"); + } + // AES_CMAC authToken algorithm + { + TraceEvent("MultiAuthModeAesCmacStart"); + + EncryptBlobCipherAes265Ctr encryptor(cipherKey, + headerCipherKey, + iv, + AES_256_IV_LENGTH, + EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI, + EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC, + BlobCipherMetrics::TEST); + BlobCipherEncryptHeader header; + Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + + ASSERT_EQ(encrypted->getLogicalSize(), bufLen); + ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0); + ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR); + ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + ASSERT_EQ(header.flags.authTokenAlgo, EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC); + + TraceEvent("BlobCipherTestEncryptDone") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("HeaderEncryptAuthTokenMode", header.flags.authTokenMode) + .detail("HeaderEncryptAuthTokenAlgo", header.flags.authTokenAlgo) + .detail("DomainId", header.cipherTextDetails.encryptDomainId) + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("HeaderAuthToken", + StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_AES_CMAC_SIZE).toString()); + + Reference tCipherKey = cipherKeyCache->getCipherKey(header.cipherTextDetails.encryptDomainId, + header.cipherTextDetails.baseCipherId, + header.cipherTextDetails.salt); + Reference hCipherKey = cipherKeyCache->getCipherKey(header.cipherHeaderDetails.encryptDomainId, + header.cipherHeaderDetails.baseCipherId, + header.cipherHeaderDetails.salt); + + ASSERT(tCipherKey->isEqual(cipherKey)); + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + + 
ASSERT_EQ(decrypted->getLogicalSize(), bufLen); + ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0); + + TraceEvent("BlobCipherTestDecryptDone").log(); + + // induce encryption header corruption - headerVersion corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.headerVersion += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - encryptionMode corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.encryptMode += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - cipherText authToken mismatch + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_AES_CMAC_SIZE - 1); + headerCopy.multiAuthTokens.cipherTextAuthToken[hIdx] += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + // induce encryption header corruption - header authToken mismatch + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_AES_CMAC_SIZE - 1); + headerCopy.multiAuthTokens.headerAuthToken[hIdx] += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + uint8_t temp[bufLen]; + memcpy(encrypted->begin(), &temp[0], bufLen); + int tIdx = deterministicRandom()->randomInt(0, bufLen - 1); + temp[tIdx] += 1; + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); + decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena); + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + TraceEvent("MultiAuthModeAesCmacDone"); + } + + // Validate dropping encryptDomainId cached keys + const EncryptCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId); + 
cipherKeyCache->resetEncryptDomainId(candidate); + std::vector> cachedKeys = cipherKeyCache->getAllCiphers(candidate); + ASSERT(cachedKeys.empty()); + + // Validate dropping all cached cipherKeys + cipherKeyCache->cleanup(); + for (int dId = minDomainId; dId < maxDomainId; dId++) { + std::vector> cachedKeys = cipherKeyCache->getAllCiphers(dId); + ASSERT(cachedKeys.empty()); + } + + TraceEvent("BlobCipherTestDone"); + return Void(); +} diff --git a/fdbclient/BlobGranuleCommon.cpp b/fdbclient/BlobGranuleCommon.cpp new file mode 100644 index 0000000000..44f32bcb25 --- /dev/null +++ b/fdbclient/BlobGranuleCommon.cpp @@ -0,0 +1,45 @@ +/* + * BlobGranuleCommon.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/BlobGranuleCommon.h" + +BlobGranuleSummaryRef summarizeGranuleChunk(Arena& ar, const BlobGranuleChunkRef& chunk) { + BlobGranuleSummaryRef summary; + ASSERT(chunk.snapshotFile.present()); + ASSERT(chunk.snapshotVersion != invalidVersion); + ASSERT(chunk.includedVersion >= chunk.snapshotVersion); + ASSERT(chunk.newDeltas.empty()); + + if (chunk.tenantPrefix.present()) { + summary.keyRange = KeyRangeRef(ar, chunk.keyRange.removePrefix(chunk.tenantPrefix.get())); + } else { + summary.keyRange = KeyRangeRef(ar, chunk.keyRange); + } + + summary.snapshotVersion = chunk.snapshotVersion; + summary.snapshotSize = chunk.snapshotFile.get().length; + summary.deltaVersion = chunk.includedVersion; + summary.deltaSize = 0; + for (auto& it : chunk.deltaFiles) { + summary.deltaSize += it.length; + } + + return summary; +} \ No newline at end of file diff --git a/fdbclient/BlobGranuleFiles.cpp b/fdbclient/BlobGranuleFiles.cpp index 4250ee7815..3747824437 100644 --- a/fdbclient/BlobGranuleFiles.cpp +++ b/fdbclient/BlobGranuleFiles.cpp @@ -20,15 +20,19 @@ #include "fdbclient/BlobGranuleFiles.h" +#include "fdbclient/BlobCipher.h" #include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/ClientKnobs.h" +#include "fdbclient/CommitTransaction.h" #include "fdbclient/Knobs.h" #include "fdbclient/SystemData.h" // for allKeys unit test - could remove -#include "flow/BlobCipher.h" +#include "flow/Arena.h" #include "flow/CompressionUtils.h" #include "flow/DeterministicRandom.h" +#include "flow/EncryptUtils.h" #include "flow/IRandom.h" +#include "flow/Knobs.h" #include "flow/Trace.h" #include "flow/serialize.h" #include "flow/UnitTest.h" @@ -37,11 +41,12 @@ #include "fmt/format.h" #include +#include // for perf microbenchmark +#include #include #define BG_READ_DEBUG false - -// FIXME: implement actual proper file format for this +#define BG_FILES_TEST_DEBUG false // Implements granule file parsing and materialization with normal c++ functions (non-actors) so that this can be used // outside the FDB network thread. 
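Illustrative sketch (editorial aside, not part of the patch): the BlobCipher changes above are easiest to follow as a single encrypt/decrypt round trip. The fragment below mirrors the flow/BlobCipher unit test; it assumes a BlobCipherKeyCache already populated via insertCipherKey for one text-cipher domain and for ENCRYPT_HEADER_DOMAIN_ID, and it uses auto where the diff rendering elided template arguments.

void blobCipherRoundTripSketch(Arena& arena, const uint8_t* data, const int dataLen) {
	// Look up the latest cipher keys: one for a (hypothetical) data domain, one for the header domain.
	auto cipherKeyCache = BlobCipherKeyCache::getInstance();
	auto textCipherKey = cipherKeyCache->getLatestCipherKey(1 /* hypothetical encrypt domain id */);
	auto headerCipherKey = cipherKeyCache->getLatestCipherKey(ENCRYPT_HEADER_DOMAIN_ID);

	uint8_t iv[AES_256_IV_LENGTH];
	deterministicRandom()->randomBytes(&iv[0], AES_256_IV_LENGTH);

	// Encrypt with AES-256-CTR; SINGLE auth-token mode computes one HMAC_SHA token over
	// { ciphertext, header } and stores it in header.singleAuthToken.
	EncryptBlobCipherAes265Ctr encryptor(textCipherKey,
	                                     headerCipherKey,
	                                     &iv[0],
	                                     AES_256_IV_LENGTH,
	                                     EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE,
	                                     EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA,
	                                     BlobCipherMetrics::TEST);
	BlobCipherEncryptHeader header;
	auto encrypted = encryptor.encrypt(data, dataLen, &header, arena);

	// Decrypt: header metadata is validated and the auth token recomputed and compared before the
	// payload is decrypted; corruption surfaces as encrypt_header_metadata_mismatch or
	// encrypt_header_authtoken_mismatch, as exercised by the test above.
	DecryptBlobCipherAes256Ctr decryptor(textCipherKey, headerCipherKey, &header.iv[0], BlobCipherMetrics::TEST);
	auto decrypted = decryptor.decrypt(encrypted->begin(), dataLen, header, arena);
	ASSERT_EQ(memcmp(decrypted->begin(), data, dataLen), 0);
}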
@@ -57,6 +62,111 @@ uint16_t MIN_SUPPORTED_BG_FORMAT_VERSION = 1; const uint8_t SNAPSHOT_FILE_TYPE = 'S'; const uint8_t DELTA_FILE_TYPE = 'D'; +// Deltas in key order + +// For key-ordered delta files, the format for both sets and range clears is that you store boundaries ordered by key. +// Each boundary has a corresponding key, zero or more versioned updates (ValueAndVersionRef), and optionally a clear +// from keyAfter(key) to the next boundary, at a version. +// A streaming merge is more efficient than applying deltas one by one to restore to a later version from the snapshot. +// The concept of this versioned mutation boundaries is repurposed directly from a prior version of redwood, back when +// it supported versioned data. +struct ValueAndVersionRef { + Version version; + MutationRef::Type op; // only set/clear + ValueRef value; // only present for set + + ValueAndVersionRef() {} + // create clear + explicit ValueAndVersionRef(Version version) : version(version), op(MutationRef::Type::ClearRange) {} + // create set + explicit ValueAndVersionRef(Version version, ValueRef value) + : version(version), op(MutationRef::Type::SetValue), value(value) {} + ValueAndVersionRef(Arena& arena, const ValueAndVersionRef& copyFrom) + : version(copyFrom.version), op(copyFrom.op), value(arena, copyFrom.value) {} + + bool isSet() const { return op == MutationRef::SetValue; } + bool isClear() const { return op == MutationRef::ClearRange; } + + int totalSize() const { return sizeof(ValueAndVersionRef) + value.size(); } + int expectedSize() const { return value.size(); } + + struct OrderByVersion { + bool operator()(ValueAndVersionRef const& a, ValueAndVersionRef const& b) const { + return a.version < b.version; + } + }; + + template + void serialize(Ar& ar) { + serializer(ar, version, op, value); + } +}; + +// Effectively the single DeltaBoundaryRef reduced to one update, but also with the key and clear after information. +// Sometimes at a given version, the boundary may only be necessary to represent a clear version after this key, or just +// an update/clear to this key, or both. 
+struct ParsedDeltaBoundaryRef { + KeyRef key; + MutationRef::Type op; // SetValue, ClearRange, or NoOp + ValueRef value; // null unless op == SetValue + bool clearAfter; + + // op constructor + ParsedDeltaBoundaryRef() {} + explicit ParsedDeltaBoundaryRef(KeyRef key, bool clearAfter, const ValueAndVersionRef& valueAndVersion) + : key(key), op(valueAndVersion.op), value(valueAndVersion.value), clearAfter(clearAfter) {} + // noop constructor + explicit ParsedDeltaBoundaryRef(KeyRef key, bool clearAfter) + : key(key), op(MutationRef::Type::NoOp), clearAfter(clearAfter) {} + // from snapshot set constructor + explicit ParsedDeltaBoundaryRef(const KeyValueRef& kv) + : key(kv.key), op(MutationRef::Type::SetValue), value(kv.value), clearAfter(false) {} + + ParsedDeltaBoundaryRef(Arena& arena, const ParsedDeltaBoundaryRef& copyFrom) + : key(arena, copyFrom.key), op(copyFrom.op), clearAfter(copyFrom.clearAfter) { + if (copyFrom.isSet()) { + value = StringRef(arena, copyFrom.value); + } + } + + bool isSet() const { return op == MutationRef::SetValue; } + bool isClear() const { return op == MutationRef::ClearRange; } + bool isNoOp() const { return op == MutationRef::NoOp; } + bool redundant(bool prevClearAfter) const { return op == MutationRef::Type::NoOp && clearAfter == prevClearAfter; } +}; + +struct DeltaBoundaryRef { + // key + KeyRef key; + // updates to exactly this key + VectorRef values; + // clear version from keyAfter(key) up to the next boundary + Optional clearVersion; + + DeltaBoundaryRef() {} + DeltaBoundaryRef(Arena& ar, const DeltaBoundaryRef& copyFrom) + : key(ar, copyFrom.key), values(ar, copyFrom.values), clearVersion(copyFrom.clearVersion) {} + + int totalSize() { return sizeof(DeltaBoundaryRef) + key.expectedSize() + values.expectedSize(); } + int expectedSize() const { return key.expectedSize() + values.expectedSize(); } + + template + void serialize(Ar& ar) { + serializer(ar, key, values, clearVersion); + } +}; + +struct GranuleSortedDeltas { + constexpr static FileIdentifier file_identifier = 8183903; + + VectorRef boundaries; + + template + void serialize(Ar& ar) { + serializer(ar, boundaries); + } +}; + struct ChildBlockPointerRef { StringRef key; uint32_t offset; @@ -87,16 +197,21 @@ namespace { BlobGranuleFileEncryptionKeys getEncryptBlobCipherKey(const BlobGranuleCipherKeysCtx cipherKeysCtx) { BlobGranuleFileEncryptionKeys eKeys; + // Cipher key reconstructed is 'never' inserted into BlobCipherKey cache, choose 'neverExpire' eKeys.textCipherKey = makeReference(cipherKeysCtx.textCipherKey.encryptDomainId, cipherKeysCtx.textCipherKey.baseCipherId, cipherKeysCtx.textCipherKey.baseCipher.begin(), cipherKeysCtx.textCipherKey.baseCipher.size(), - cipherKeysCtx.textCipherKey.salt); + cipherKeysCtx.textCipherKey.salt, + std::numeric_limits::max(), + std::numeric_limits::max()); eKeys.headerCipherKey = makeReference(cipherKeysCtx.headerCipherKey.encryptDomainId, cipherKeysCtx.headerCipherKey.baseCipherId, cipherKeysCtx.headerCipherKey.baseCipher.begin(), cipherKeysCtx.headerCipherKey.baseCipher.size(), - cipherKeysCtx.headerCipherKey.salt); + cipherKeysCtx.headerCipherKey.salt, + std::numeric_limits::max(), + std::numeric_limits::max()); return eKeys; } @@ -172,12 +287,14 @@ struct IndexBlockRef { TraceEvent(SevDebug, "IndexBlockEncrypt_Before").detail("Chksum", chksum); } - EncryptBlobCipherAes265Ctr encryptor(eKeys.textCipherKey, - eKeys.headerCipherKey, - cipherKeysCtx.ivRef.begin(), - AES_256_IV_LENGTH, - ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); - Value serializedBuff = 
ObjectWriter::toValue(block, Unversioned()); + EncryptBlobCipherAes265Ctr encryptor( + eKeys.textCipherKey, + eKeys.headerCipherKey, + cipherKeysCtx.ivRef.begin(), + AES_256_IV_LENGTH, + getEncryptAuthTokenMode(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE), + BlobCipherMetrics::BLOB_GRANULE); + Value serializedBuff = ObjectWriter::toValue(block, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); BlobCipherEncryptHeader header; buffer = encryptor.encrypt(serializedBuff.contents().begin(), serializedBuff.contents().size(), &header, arena) ->toStringRef(); @@ -204,7 +321,8 @@ struct IndexBlockRef { validateEncryptionHeaderDetails(eKeys, header, cipherKeysCtx.ivRef); - DecryptBlobCipherAes256Ctr decryptor(eKeys.textCipherKey, eKeys.headerCipherKey, cipherKeysCtx.ivRef.begin()); + DecryptBlobCipherAes256Ctr decryptor( + eKeys.textCipherKey, eKeys.headerCipherKey, cipherKeysCtx.ivRef.begin(), BlobCipherMetrics::BLOB_GRANULE); StringRef decrypted = decryptor.decrypt(idxRef.buffer.begin(), idxRef.buffer.size(), header, arena)->toStringRef(); @@ -213,20 +331,22 @@ struct IndexBlockRef { TraceEvent(SevDebug, "IndexBlockEncrypt_After").detail("Chksum", chksum); } - // TODO: Add version? - ObjectReader dataReader(decrypted.begin(), Unversioned()); + ObjectReader dataReader(decrypted.begin(), IncludeVersion()); dataReader.deserialize(FileIdentifierFor::value, idxRef.block, arena); } void init(Optional cipherKeysCtx, Arena& arena) { if (encryptHeaderRef.present()) { + CODE_PROBE(true, "reading encrypted chunked file"); ASSERT(cipherKeysCtx.present()); + decrypt(cipherKeysCtx.get(), *this, arena); } else { - TraceEvent("IndexBlockSize").detail("Sz", buffer.size()); + if (BG_ENCRYPT_COMPRESS_DEBUG) { + TraceEvent("IndexBlockSize").detail("Sz", buffer.size()); + } - // TODO: Add version? - ObjectReader dataReader(buffer.begin(), Unversioned()); + ObjectReader dataReader(buffer.begin(), IncludeVersion()); dataReader.deserialize(FileIdentifierFor::value, block, arena); } } @@ -242,10 +362,15 @@ struct IndexBlockRef { encrypt(cipherKeysCtx.get(), arena); } else { encryptHeaderRef.reset(); - buffer = StringRef(arena, ObjectWriter::toValue(block, Unversioned()).contents()); + buffer = StringRef( + arena, ObjectWriter::toValue(block, IncludeVersion(ProtocolVersion::withBlobGranuleFile())).contents()); } - TraceEvent(SevDebug, "IndexBlockSize").detail("Sz", buffer.size()).detail("Encrypted", cipherKeysCtx.present()); + if (BG_ENCRYPT_COMPRESS_DEBUG) { + TraceEvent(SevDebug, "IndexBlockSize") + .detail("Sz", buffer.size()) + .detail("Encrypted", cipherKeysCtx.present()); + } } template @@ -259,8 +384,8 @@ struct IndexBlockRef { // Encryption: A 'chunk' gets encrypted before getting persisted if enabled. Encryption header is persisted along with // the chunk data to assist decryption on reads. // -// Compression: A 'chunk' gets compressed before getting persisted if enabled. Compression filter (algoritm) infomration -// is persisted as part of 'chunk metadata' to assist decompression on reads. +// Compression: A 'chunk' gets compressed before getting persisted if enabled. Compression filter (algorithm) +// information is persisted as part of 'chunk metadata' to assist decompression on reads. 
struct IndexBlobGranuleFileChunkRef { constexpr static FileIdentifier file_identifier = 2814019; @@ -286,11 +411,13 @@ struct IndexBlobGranuleFileChunkRef { TraceEvent(SevDebug, "BlobChunkEncrypt_Before").detail("Chksum", chksum); } - EncryptBlobCipherAes265Ctr encryptor(eKeys.textCipherKey, - eKeys.headerCipherKey, - cipherKeysCtx.ivRef.begin(), - AES_256_IV_LENGTH, - ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + EncryptBlobCipherAes265Ctr encryptor( + eKeys.textCipherKey, + eKeys.headerCipherKey, + cipherKeysCtx.ivRef.begin(), + AES_256_IV_LENGTH, + getEncryptAuthTokenMode(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE), + BlobCipherMetrics::BLOB_GRANULE); BlobCipherEncryptHeader header; chunkRef.buffer = encryptor.encrypt(chunkRef.buffer.begin(), chunkRef.buffer.size(), &header, arena)->toStringRef(); @@ -319,7 +446,8 @@ struct IndexBlobGranuleFileChunkRef { validateEncryptionHeaderDetails(eKeys, header, cipherKeysCtx.ivRef); - DecryptBlobCipherAes256Ctr decryptor(eKeys.textCipherKey, eKeys.headerCipherKey, cipherKeysCtx.ivRef.begin()); + DecryptBlobCipherAes256Ctr decryptor( + eKeys.textCipherKey, eKeys.headerCipherKey, cipherKeysCtx.ivRef.begin(), BlobCipherMetrics::BLOB_GRANULE); StringRef decrypted = decryptor.decrypt(chunkRef.buffer.begin(), chunkRef.buffer.size(), header, arena)->toStringRef(); @@ -336,7 +464,10 @@ struct IndexBlobGranuleFileChunkRef { const CompressionFilter compFilter, Arena& arena) { chunkRef.compressionFilter = compFilter; - chunkRef.buffer = CompressionUtils::compress(chunkRef.compressionFilter.get(), chunk.contents(), arena); + chunkRef.buffer = CompressionUtils::compress(chunkRef.compressionFilter.get(), + chunk.contents(), + CompressionUtils::getDefaultCompressionLevel(compFilter), + arena); if (BG_ENCRYPT_COMPRESS_DEBUG) { XXH64_hash_t chunkChksum = XXH3_64bits(chunk.contents().begin(), chunk.contents().size()); @@ -378,19 +509,18 @@ struct IndexBlobGranuleFileChunkRef { : CompressionUtils::toString(CompressionFilter::NONE)); } - // TODO: Add version? - return ObjectWriter::toValue(chunkRef, Unversioned()); + return ObjectWriter::toValue(chunkRef, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); } static IndexBlobGranuleFileChunkRef fromBytes(Optional cipherKeysCtx, StringRef buffer, Arena& arena) { IndexBlobGranuleFileChunkRef chunkRef; - // TODO: Add version? - ObjectReader dataReader(buffer.begin(), Unversioned()); + ObjectReader dataReader(buffer.begin(), IncludeVersion()); dataReader.deserialize(FileIdentifierFor::value, chunkRef, arena); if (chunkRef.encryptHeaderRef.present()) { + CODE_PROBE(true, "reading encrypted file chunk"); ASSERT(cipherKeysCtx.present()); chunkRef.chunkBytes = IndexBlobGranuleFileChunkRef::decrypt(cipherKeysCtx.get(), chunkRef, arena); } else { @@ -398,6 +528,7 @@ struct IndexBlobGranuleFileChunkRef { } if (chunkRef.compressionFilter.present()) { + CODE_PROBE(true, "reading compressed file chunk"); chunkRef.chunkBytes = IndexBlobGranuleFileChunkRef::decompress(chunkRef, arena); } else if (!chunkRef.chunkBytes.present()) { // 'Encryption' & 'Compression' aren't enabled. 
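Illustrative sketch (editorial aside, not part of the patch): the write and read paths for a single granule file chunk, as refactored above. It is written as if it lived inside BlobGranuleFiles.cpp (where IndexBlobGranuleFileChunkRef is defined) and assumes the caller supplies serializedRows (an ObjectWriter-serialized data block), an optional compressFilter, and an optional cipherKeysCtx.

static void granuleChunkRoundTripSketch(Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx,
                                        Optional<CompressionFilter> compressFilter,
                                        const Value& serializedRows,
                                        Arena& arena) {
	// Write path: optionally compress, then optionally encrypt; the compression filter and (when
	// encrypted) the BlobCipherEncryptHeader travel inside the chunk wrapper, which is now
	// serialized with IncludeVersion(ProtocolVersion::withBlobGranuleFile()) rather than Unversioned().
	Value chunkBytes = IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serializedRows, arena);

	// Read path: deserialize the wrapper, then decrypt if encryptHeaderRef is present and decompress
	// if compressionFilter is present, recovering the original serialized rows.
	IndexBlobGranuleFileChunkRef chunkRef = IndexBlobGranuleFileChunkRef::fromBytes(cipherKeysCtx, chunkBytes, arena);
	ASSERT(chunkRef.chunkBytes.present());
	auto restoredRows = chunkRef.chunkBytes.get(); // byte-equal to serializedRows.contents()
	(void)restoredRows;
}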
@@ -441,9 +572,9 @@ struct IndexedBlobGranuleFile { // Non-serialized member fields StringRef fileBytes; - void init(const Optional cipherKeysCtx) { + void init(uint8_t fType, const Optional cipherKeysCtx) { formatVersion = LATEST_BG_FORMAT_VERSION; - fileType = SNAPSHOT_FILE_TYPE; + fileType = fType; chunkStartOffset = -1; } @@ -459,8 +590,7 @@ struct IndexedBlobGranuleFile { // parse index block at head of file Arena arena; IndexedBlobGranuleFile file; - // TODO: version? - ObjectReader dataReader(fileBytes.begin(), Unversioned()); + ObjectReader dataReader(fileBytes.begin(), IncludeVersion()); dataReader.deserialize(FileIdentifierFor::value, file, arena); file.init(fileBytes, arena, cipherKeysCtx); @@ -521,8 +651,7 @@ struct IndexedBlobGranuleFile { IndexBlobGranuleFileChunkRef::fromBytes(cipherKeysCtx, childData, childArena); ChildType child; - // TODO: version? - ObjectReader dataReader(chunkRef.chunkBytes.get().begin(), Unversioned()); + ObjectReader dataReader(chunkRef.chunkBytes.get().begin(), IncludeVersion()); dataReader.deserialize(FileIdentifierFor::value, child, childArena); // TODO implement some sort of decrypted+decompressed+deserialized cache, if this object gets reused? @@ -542,29 +671,68 @@ struct IndexedBlobGranuleFile { Value serializeIndexBlock(Standalone& file, Optional cipherKeysCtx) { file.indexBlockRef.finalize(cipherKeysCtx, file.arena()); - // TODO: version? - Value serialized = ObjectWriter::toValue(file, Unversioned()); + Value serialized = ObjectWriter::toValue(file, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); file.chunkStartOffset = serialized.contents().size(); if (BG_ENCRYPT_COMPRESS_DEBUG) { TraceEvent(SevDebug, "SerializeIndexBlock").detail("StartOffset", file.chunkStartOffset); } - return ObjectWriter::toValue(file, Unversioned()); + return ObjectWriter::toValue(file, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); } -// TODO: this should probably be in actor file with yields? +Value serializeFileFromChunks(Standalone& file, + Optional cipherKeysCtx, + std::vector& chunks, + int previousChunkBytes) { + Value indexBlockBytes = serializeIndexBlock(file, cipherKeysCtx); + int32_t indexSize = indexBlockBytes.size(); + chunks[0] = indexBlockBytes; + + // TODO: write this directly to stream to avoid extra copy? + Arena ret; + + size_t size = indexSize + previousChunkBytes; + uint8_t* buffer = new (ret) uint8_t[size]; + uint8_t* bufferStart = buffer; + + int idx = 0; + for (auto& it : chunks) { + if (BG_ENCRYPT_COMPRESS_DEBUG) { + TraceEvent(SevDebug, "SerializeFile") + .detail("ChunkIdx", idx++) + .detail("Size", it.size()) + .detail("Offset", buffer - bufferStart); + } + buffer = it.copyTo(buffer); + } + ASSERT(size == buffer - bufferStart); + + return Standalone(StringRef(bufferStart, size), ret); +} + +// TODO: this should probably be in actor file with yields? - move writing logic to separate actor file in server? 
// TODO: optimize memory copying // TODO: sanity check no oversized files -Value serializeChunkedSnapshot(Standalone snapshot, - int chunkCount, +Value serializeChunkedSnapshot(const Standalone& fileNameRef, + const Standalone& snapshot, + int targetChunkBytes, Optional compressFilter, Optional cipherKeysCtx) { + + if (BG_ENCRYPT_COMPRESS_DEBUG) { + TraceEvent(SevDebug, "SerializeChunkedSnapshot") + .detail("FileName", fileNameRef.toString()) + .detail("Encrypted", cipherKeysCtx.present()) + .detail("Compressed", compressFilter.present()); + } + + CODE_PROBE(compressFilter.present(), "serializing compressed snapshot file"); + CODE_PROBE(cipherKeysCtx.present(), "serializing encrypted snapshot file"); Standalone file; - file.init(cipherKeysCtx); + file.init(SNAPSHOT_FILE_TYPE, cipherKeysCtx); - size_t targetChunkBytes = snapshot.expectedSize() / chunkCount; size_t currentChunkBytesEstimate = 0; size_t previousChunkBytes = 0; @@ -572,7 +740,6 @@ Value serializeChunkedSnapshot(Standalone snapshot, chunks.push_back(Value()); // dummy value for index block Standalone currentChunk; - // fmt::print("Chunk index:\n"); for (int i = 0; i < snapshot.size(); i++) { // TODO REMOVE sanity check if (i > 0) { @@ -583,8 +750,8 @@ Value serializeChunkedSnapshot(Standalone snapshot, currentChunkBytesEstimate += snapshot[i].expectedSize(); if (currentChunkBytesEstimate >= targetChunkBytes || i == snapshot.size() - 1) { - // TODO: protocol version - Value serialized = ObjectWriter::toValue(currentChunk, Unversioned()); + Value serialized = + ObjectWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); Value chunkBytes = IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena()); chunks.push_back(chunkBytes); @@ -613,40 +780,24 @@ Value serializeChunkedSnapshot(Standalone snapshot, file.arena(), keyAfter(snapshot.back().key), previousChunkBytes); } - Value indexBlockBytes = serializeIndexBlock(file, cipherKeysCtx); - int32_t indexSize = indexBlockBytes.size(); - chunks[0] = indexBlockBytes; - - // TODO: write this directly to stream to avoid extra copy? 
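// A minimal sketch of the chunking policy serializeChunkedSnapshot() now uses: instead of a fixed chunk
// count, rows are accumulated until the estimated chunk size reaches targetChunkBytes (or the last row is
// reached), then the chunk is flushed and its first key plus the cumulative byte offset of all prior chunks
// are recorded as an index entry. Plain std:: types stand in for the FDB Arena/VectorRef machinery, and the
// "serialization" of a row is just key + value appended back to back.
#include <string>
#include <utility>
#include <vector>

struct IndexEntry {
    std::string firstKey;
    size_t offset; // total bytes of all previously flushed chunks
};

static std::vector<IndexEntry> chunkSnapshot(const std::vector<std::pair<std::string, std::string>>& sortedRows,
                                             size_t targetChunkBytes,
                                             std::vector<std::string>& chunksOut) {
    std::vector<IndexEntry> index;
    std::string currentChunk;
    std::string currentFirstKey;
    size_t previousChunkBytes = 0;
    for (size_t i = 0; i < sortedRows.size(); i++) {
        if (currentChunk.empty()) {
            currentFirstKey = sortedRows[i].first;
        }
        currentChunk += sortedRows[i].first;
        currentChunk += sortedRows[i].second;
        if (currentChunk.size() >= targetChunkBytes || i == sortedRows.size() - 1) {
            index.push_back({ currentFirstKey, previousChunkBytes });
            previousChunkBytes += currentChunk.size();
            chunksOut.push_back(std::move(currentChunk));
            currentChunk.clear(); // moved-from; make its state explicit before reuse
        }
    }
    // The real file also appends a sentinel entry (keyAfter(lastKey), total bytes) so each chunk's
    // length can be derived from the next entry's offset.
    return index;
}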
- Arena ret; - - size_t size = indexSize + previousChunkBytes; - uint8_t* buffer = new (ret) uint8_t[size]; - - previousChunkBytes = 0; - int idx = 0; - for (auto& it : chunks) { - if (BG_ENCRYPT_COMPRESS_DEBUG) { - TraceEvent(SevDebug, "SerializeSnapshot") - .detail("ChunkIdx", idx++) - .detail("Size", it.size()) - .detail("Offset", previousChunkBytes); - } - - memcpy(buffer + previousChunkBytes, it.begin(), it.size()); - previousChunkBytes += it.size(); - } - ASSERT(size == previousChunkBytes); - - return Standalone(StringRef(buffer, size), ret); + return serializeFileFromChunks(file, cipherKeysCtx, chunks, previousChunkBytes); } // TODO: use redwood prefix trick to optimize cpu comparison -static Arena loadSnapshotFile(const StringRef& snapshotData, - KeyRangeRef keyRange, - std::map& dataMap, - Optional cipherKeysCtx) { - Arena rootArena; +static Standalone> loadSnapshotFile( + const Standalone& fileName, + const StringRef& snapshotData, + const KeyRangeRef& keyRange, + Optional cipherKeysCtx) { + Standalone> results; + + if (BG_ENCRYPT_COMPRESS_DEBUG) { + TraceEvent(SevDebug, "LoadChunkedSnapshot") + .detail("FileName", fileName.toString()) + .detail("RangeBegin", keyRange.begin.printable()) + .detail("RangeEnd", keyRange.end.printable()) + .detail("Encrypted", cipherKeysCtx.present()); + } Standalone file = IndexedBlobGranuleFile::fromFileBytes(snapshotData, cipherKeysCtx); @@ -655,21 +806,25 @@ static Arena loadSnapshotFile(const StringRef& snapshotData, // empty snapshot file if (file.indexBlockRef.block.children.empty()) { - return rootArena; + return results; } ASSERT(file.indexBlockRef.block.children.size() >= 2); - // TODO: refactor this out of delta tree - // int commonPrefixLen = commonPrefixLength(index.dataBlockOffsets.front().first, - // index.dataBlockOffsets.back().first); - // find range of blocks needed to read ChildBlockPointerRef* currentBlock = file.findStartBlock(keyRange.begin); - // FIXME: optimize cpu comparisons here in first/last partial blocks, doing entire blocks at once based on - // comparison, and using shared prefix for key comparison - while (currentBlock != (file.indexBlockRef.block.children.end() - 1) && keyRange.end > currentBlock->key) { + if (currentBlock == (file.indexBlockRef.block.children.end() - 1) || keyRange.end <= currentBlock->key) { + return results; + } + + bool lastBlock = false; + + // FIXME: shared prefix for key comparison + while (!lastBlock) { + auto nextBlock = currentBlock; + nextBlock++; + lastBlock = (nextBlock == (file.indexBlockRef.block.children.end() - 1)) || (keyRange.end <= nextBlock->key); Standalone dataBlock = file.getChild(currentBlock, cipherKeysCtx, file.chunkStartOffset); ASSERT(!dataBlock.empty()); @@ -677,21 +832,332 @@ static Arena loadSnapshotFile(const StringRef& snapshotData, bool anyRows = false; for (auto& entry : dataBlock) { - if (entry.key >= keyRange.begin && entry.key < keyRange.end) { - dataMap.insert({ entry.key, entry.value }); + if (!results.empty() && !lastBlock) { + // no key comparisons needed + results.emplace_back(results.arena(), entry); anyRows = true; + } else if ((!results.empty() || entry.key >= keyRange.begin) && (!lastBlock || entry.key < keyRange.end)) { + results.emplace_back(results.arena(), entry); + anyRows = true; + } else if (!results.empty() && lastBlock) { + break; } } if (anyRows) { - rootArena.dependsOn(dataBlock.arena()); + results.arena().dependsOn(dataBlock.arena()); } currentBlock++; } - return rootArena; + return results; } -static void applyDelta(KeyRangeRef keyRange, 
MutationRef m, std::map& dataMap) { +typedef std::map> SortedDeltasT; + +// FIXME: optimize all of this with common prefix comparison stuff +SortedDeltasT::iterator insertMutationBoundary(SortedDeltasT& deltasByKey, const KeyRef& boundary) { + // Find the first split point in buffer that is >= key + auto it = deltasByKey.lower_bound(boundary); + + // Since the map contains fileRange already, we had to have found something + ASSERT(it != deltasByKey.end()); + if (it->first == boundary) { + return it; + } + + // new boundary, using find as insert hint + it = deltasByKey.insert(it, { boundary, Standalone() }); + + // look back at previous entry to see if this boundary is already cleared to at a prior version + ASSERT(it != deltasByKey.begin()); + auto itPrev = it; + --itPrev; + + if (itPrev->second.clearVersion.present()) { + it->second.clearVersion = itPrev->second.clearVersion; + it->second.values.push_back(it->second.arena(), ValueAndVersionRef(it->second.clearVersion.get())); + } + + return it; +} + +void updateMutationBoundary(Standalone& boundary, const ValueAndVersionRef& update) { + if (update.isSet()) { + if (boundary.values.empty() || boundary.values.back().version < update.version) { + // duplicate same set even if it's the same as the last one, so beginVersion reads still get updates + boundary.values.push_back(boundary.arena(), update); + } else { + CODE_PROBE(true, "multiple boundary updates at same version (set)"); + // preserve inter-mutation order by replacing this one + boundary.values.back() = update; + } + } else { + if (boundary.values.empty() || + (boundary.values.back().isSet() && boundary.values.back().version < update.version)) { + // don't duplicate single-key clears in order if previous was also a clear, since it's a no-op when starting + // with beginVersion + boundary.values.push_back(boundary.arena(), update); + } else if (!boundary.values.empty() && boundary.values.back().version == update.version) { + CODE_PROBE(true, "multiple boundary updates at same version (clear)"); + if (boundary.values.back().isSet()) { + // if the last 2 updates were clear @ v1 and set @ v2, and we now have a clear at v2, just pop off the + // set and leave the previous clear. 
Otherwise, just set the last set to a clear + if (boundary.values.size() >= 2 && boundary.values[boundary.values.size() - 2].isClear()) { + CODE_PROBE(true, "clear then set/clear at same version optimization"); + boundary.values.pop_back(); + } else { + boundary.values.back() = update; + } + } // else we have 2 consecutive clears at this version, no-op + } + } +} + +void insertSortedDelta(const MutationRef& m, + const Version version, + const KeyRangeRef& fileRange, + SortedDeltasT& deltasByKey) { + // TODO REMOVE validation + ASSERT(fileRange.contains(m.param1)); + if (m.type == MutationRef::ClearRange) { + ASSERT(m.param2 <= fileRange.end); + // handle single key clear more efficiently + if (equalsKeyAfter(m.param1, m.param2)) { + SortedDeltasT::iterator key = insertMutationBoundary(deltasByKey, m.param1); + updateMutationBoundary(key->second, ValueAndVersionRef(version)); + } else { + // Update each boundary in the cleared range + SortedDeltasT::iterator begin = insertMutationBoundary(deltasByKey, m.param1); + SortedDeltasT::iterator end = insertMutationBoundary(deltasByKey, m.param2); + while (begin != end) { + // Set the rangeClearedVersion if not set + if (!begin->second.clearVersion.present()) { + begin->second.clearVersion = version; + } + + // Add a clear to values if it's empty or the last item is not a clear + if (begin->second.values.empty() || begin->second.values.back().isSet()) { + updateMutationBoundary(begin->second, ValueAndVersionRef(version)); + } + ++begin; + } + } + } else { + Standalone& bound = insertMutationBoundary(deltasByKey, m.param1)->second; + updateMutationBoundary(bound, ValueAndVersionRef(version, m.param2)); + } +} + +// TODO: investigate more cpu-efficient sorting methods. Potential options: +// 1) Replace std::map with ART mutation buffer +// 2) sort updates and clear endpoints by (key, version), and keep track of active clears. 
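// A minimal sketch (std::map and plain structs, not the SortedDeltasT typedef above) of the
// boundary-inheritance rule insertMutationBoundary() implements: boundaries are the sorted keys at which
// the delta state changes, and a newly inserted boundary inherits the clearVersion of the boundary to its
// left, so a range clear that is later split by a point write still covers the new key.
#include <cassert>
#include <cstdint>
#include <iterator>
#include <map>
#include <optional>
#include <string>

struct Boundary {
    std::optional<int64_t> clearVersion; // version at which a range clear covers keys >= this boundary
};

using BoundaryMap = std::map<std::string, Boundary>;

static BoundaryMap::iterator insertBoundary(BoundaryMap& boundaries, const std::string& key) {
    auto it = boundaries.lower_bound(key);
    assert(it != boundaries.end()); // map is seeded with the file range end
    if (it->first == key) {
        return it; // boundary already exists
    }
    it = boundaries.insert(it, { key, Boundary{} }); // lower_bound result doubles as the insert hint
    assert(it != boundaries.begin()); // map is seeded with the file range begin
    auto prev = std::prev(it);
    it->second.clearVersion = prev->second.clearVersion; // inherit any active range clear
    return it;
}

// Usage: seed with the file range, clear part of it, then split the cleared range with a later write.
int main() {
    BoundaryMap boundaries;
    boundaries.emplace("a", Boundary{});
    boundaries.emplace("z", Boundary{});
    insertBoundary(boundaries, "b")->second.clearVersion = 10; // clear [b, z) @ version 10
    auto mid = insertBoundary(boundaries, "m"); // later point write splits the cleared range
    assert(mid->second.clearVersion == 10); // the new boundary is still covered by the clear
    return 0;
}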
+void sortDeltasByKey(const Standalone& deltasByVersion, + const KeyRangeRef& fileRange, + SortedDeltasT& deltasByKey) { + if (deltasByVersion.empty()) { + return; + } + if (deltasByKey.empty()) { + deltasByKey.insert({ fileRange.begin, Standalone() }); + deltasByKey.insert({ fileRange.end, Standalone() }); + } + for (auto& it : deltasByVersion) { + for (auto& m : it.mutations) { + insertSortedDelta(m, it.version, fileRange, deltasByKey); + } + } + + // TODO: could do a scan through map and coalesce clears (if any boundaries with exactly 1 mutation (clear) and same + // clearVersion as previous guy) +} + +// FIXME: Could maybe reduce duplicated code between this and chunkedSnapshot for chunking +Value serializeChunkedDeltaFile(const Standalone& fileNameRef, + const Standalone& deltas, + const KeyRangeRef& fileRange, + int chunkSize, + Optional compressFilter, + Optional cipherKeysCtx) { + if (BG_ENCRYPT_COMPRESS_DEBUG) { + TraceEvent(SevDebug, "SerializeChunkedDelta") + .detail("Filename", fileNameRef.toString()) + .detail("RangeBegin", fileRange.begin.printable()) + .detail("RangeEnd", fileRange.end.printable()) + .detail("Encrypted", cipherKeysCtx.present()) + .detail("Compressed", compressFilter.present()); + } + + CODE_PROBE(compressFilter.present(), "serializing compressed delta file"); + CODE_PROBE(cipherKeysCtx.present(), "serializing encrypted delta file"); + Standalone file; + + file.init(DELTA_FILE_TYPE, cipherKeysCtx); + + // build in-memory version of boundaries - TODO separate functions + SortedDeltasT boundaries; + sortDeltasByKey(deltas, fileRange, boundaries); + + std::vector chunks; + chunks.push_back(Value()); // dummy value for index block + + Standalone currentChunk; + size_t currentChunkBytesEstimate = 0; + size_t previousChunkBytes = 0; + + // TODO REMOVE - for validation + KeyRef lastKey; + int i = 0; + for (auto& it : boundaries) { + // TODO REMOVE sanity check + if (i > 0) { + ASSERT(lastKey < it.first); + } + lastKey = it.first; + it.second.key = it.first; + + currentChunk.boundaries.push_back_deep(currentChunk.arena(), it.second); + currentChunkBytesEstimate += it.second.totalSize(); + + if (currentChunkBytesEstimate >= chunkSize || i == boundaries.size() - 1) { + Value serialized = + ObjectWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); + Value chunkBytes = + IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena()); + chunks.push_back(chunkBytes); + + // TODO remove validation + if (!file.indexBlockRef.block.children.empty()) { + ASSERT(file.indexBlockRef.block.children.back().key < currentChunk.boundaries.begin()->key); + } + file.indexBlockRef.block.children.emplace_back_deep( + file.arena(), currentChunk.boundaries.begin()->key, previousChunkBytes); + + if (BG_ENCRYPT_COMPRESS_DEBUG) { + TraceEvent(SevDebug, "ChunkSize") + .detail("ChunkBytes", chunkBytes.size()) + .detail("PrvChunkBytes", previousChunkBytes); + } + + previousChunkBytes += chunkBytes.size(); + currentChunkBytesEstimate = 0; + currentChunk = Standalone(); + } + i++; + } + ASSERT(currentChunk.boundaries.empty()); + if (!deltas.empty()) { + file.indexBlockRef.block.children.emplace_back_deep(file.arena(), fileRange.end, previousChunkBytes); + } + + return serializeFileFromChunks(file, cipherKeysCtx, chunks, previousChunkBytes); +} + +ParsedDeltaBoundaryRef deltaAtVersion(const DeltaBoundaryRef& delta, Version beginVersion, Version readVersion) { + bool clearAfter = delta.clearVersion.present() && readVersion >= 
delta.clearVersion.get() && + beginVersion <= delta.clearVersion.get(); + if (delta.values.empty()) { + return ParsedDeltaBoundaryRef(delta.key, clearAfter); + } + auto valueAtVersion = std::lower_bound(delta.values.begin(), + delta.values.end(), + ValueAndVersionRef(readVersion), + ValueAndVersionRef::OrderByVersion()); + if (valueAtVersion == delta.values.begin() && readVersion < valueAtVersion->version) { + // deltas are all higher than read version + return ParsedDeltaBoundaryRef(delta.key, clearAfter); + } + // lower_bound() found version >= readVersion, so if we're at the end or it's not equal, go back one + if (valueAtVersion == delta.values.end() || valueAtVersion->version > readVersion) { + valueAtVersion--; + } + ASSERT(readVersion >= valueAtVersion->version); + // now, handle beginVersion (if update < beginVersion, it's a noop) + if (valueAtVersion->version < beginVersion) { + return ParsedDeltaBoundaryRef(delta.key, clearAfter); + } else { + return ParsedDeltaBoundaryRef(delta.key, clearAfter, *valueAtVersion); + } +} + +// The arena owns the BoundaryDeltaRef struct data but the StringRef pointers point to data in deltaData, to avoid extra +// copying +Standalone> loadChunkedDeltaFile(const Standalone& fileNameRef, + const StringRef& deltaData, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + Optional cipherKeysCtx, + bool& startClear) { + Standalone> deltas; + Standalone file = IndexedBlobGranuleFile::fromFileBytes(deltaData, cipherKeysCtx); + + ASSERT(file.fileType == DELTA_FILE_TYPE); + ASSERT(file.chunkStartOffset > 0); + + // empty delta file + if (file.indexBlockRef.block.children.empty()) { + return deltas; + } + + ASSERT(file.indexBlockRef.block.children.size() >= 2); + + // find range of blocks needed to read + ChildBlockPointerRef* currentBlock = file.findStartBlock(keyRange.begin); + + if (currentBlock == (file.indexBlockRef.block.children.end() - 1) || keyRange.end <= currentBlock->key) { + // empty, done + return deltas; + } + + // FIXME: shared prefix for key comparison + // FIXME: could cpu optimize first block a bit more by seeking right to start + bool lastBlock = false; + bool prevClearAfter = false; + while (!lastBlock) { + auto nextBlock = currentBlock; + nextBlock++; + lastBlock = (nextBlock == file.indexBlockRef.block.children.end() - 1) || keyRange.end <= nextBlock->key; + + Standalone deltaBlock = + file.getChild(currentBlock, cipherKeysCtx, file.chunkStartOffset); + ASSERT(!deltaBlock.boundaries.empty()); + ASSERT(currentBlock->key == deltaBlock.boundaries.front().key); + + // TODO refactor this into function to share with memory deltas + bool blockMemoryUsed = false; + + for (auto& entry : deltaBlock.boundaries) { + ParsedDeltaBoundaryRef boundary = deltaAtVersion(entry, beginVersion, readVersion); + if (deltas.empty() && entry.key < keyRange.begin) { + startClear = boundary.clearAfter; + prevClearAfter = boundary.clearAfter; + } else if (!lastBlock || entry.key < keyRange.end) { + if (!boundary.redundant(prevClearAfter)) { + deltas.push_back(deltas.arena(), boundary); + blockMemoryUsed = true; + prevClearAfter = boundary.clearAfter; + } + } else { + // TODO REMOVE validation + ASSERT(lastBlock); + break; + } + } + if (blockMemoryUsed) { + deltas.arena().dependsOn(deltaBlock.arena()); + } + currentBlock++; + } + + // TODO REMOVE eventually? 
order sanity check for parsed deltas + for (int i = 0; i < deltas.size() - 1; i++) { + ASSERT(deltas[i].key < deltas[i + 1].key); + } + + return deltas; +} + +static void applyDelta(const KeyRangeRef& keyRange, const MutationRef& m, std::map& dataMap) { if (m.type == MutationRef::ClearRange) { if (m.param2 <= keyRange.begin || m.param1 >= keyRange.end) { return; @@ -728,12 +1194,12 @@ static void applyDelta(KeyRangeRef keyRange, MutationRef m, std::map& dataMap) { +static void applyDeltasByVersion(const GranuleDeltas& deltas, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + Version& lastFileEndVersion, + std::map& dataMap) { if (deltas.empty()) { return; } @@ -768,32 +1234,186 @@ static void applyDeltas(const GranuleDeltas& deltas, lastFileEndVersion = deltas.back().version; } -static Arena loadDeltaFile(StringRef deltaData, - KeyRangeRef keyRange, - Version beginVersion, - Version readVersion, - Version& lastFileEndVersion, - std::map& dataMap) { - Arena parseArena; - GranuleDeltas deltas; - ObjectReader reader(deltaData.begin(), Unversioned()); - reader.deserialize(FileIdentifierFor::value, deltas, parseArena); +// TODO: could optimize this slightly to avoid tracking multiple updates for the same key at all since it's always then +// collapsed to the last one +Standalone> sortMemoryDeltas(const GranuleDeltas& memoryDeltas, + const KeyRangeRef& granuleRange, + const KeyRangeRef& readRange, + Version beginVersion, + Version readVersion) { + ASSERT(!memoryDeltas.empty()); - if (BG_READ_DEBUG) { - fmt::print("Parsed {} deltas from file\n", deltas.size()); + // filter by request range first + SortedDeltasT versionedBoundaries; + if (versionedBoundaries.empty()) { + versionedBoundaries.insert({ readRange.begin, Standalone() }); + versionedBoundaries.insert({ readRange.end, Standalone() }); } - - // TODO REMOVE sanity check - for (int i = 0; i < deltas.size() - 1; i++) { - if (deltas[i].version > deltas[i + 1].version) { - fmt::print( - "BG VERSION ORDER VIOLATION IN DELTA FILE: '{0}', '{1}'\n", deltas[i].version, deltas[i + 1].version); + for (auto& it : memoryDeltas) { + for (auto& m : it.mutations) { + if (m.type == MutationRef::ClearRange) { + if (m.param2 > readRange.begin && m.param1 < readRange.end) { + KeyRangeRef clearRangeClipped = readRange & KeyRangeRef(m.param1, m.param2); + MutationRef clearClipped( + MutationRef::Type::ClearRange, clearRangeClipped.begin, clearRangeClipped.end); + insertSortedDelta(clearClipped, it.version, granuleRange, versionedBoundaries); + } + } else { + ASSERT(m.type == MutationRef::SetValue); + if (readRange.contains(m.param1)) { + insertSortedDelta(m, it.version, granuleRange, versionedBoundaries); + } + } } - ASSERT(deltas[i].version <= deltas[i + 1].version); } - applyDeltas(deltas, keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap); - return parseArena; + // parse and collapse based on version + bool prevClearAfter = false; + Standalone> deltas; + + // remove extra ranges inserted from clears that partially overlap read range + auto itBegin = versionedBoundaries.begin(); + while (itBegin->first < readRange.begin) { + ++itBegin; + } + auto itEnd = versionedBoundaries.end(); + itEnd--; + while (itEnd->first > readRange.end) { + itEnd--; + } + itEnd++; + + while (itBegin != itEnd) { + itBegin->second.key = itBegin->first; + ParsedDeltaBoundaryRef boundary = deltaAtVersion(itBegin->second, beginVersion, readVersion); + if (!boundary.redundant(prevClearAfter)) { + deltas.push_back_deep(deltas.arena(), boundary); 
+ prevClearAfter = boundary.clearAfter; + } + ++itBegin; + } + + return deltas; +} + +// does a sorted merge of the delta streams. +// In terms of write precedence, streams[i] < streams[i+1] +// Handles range clears by tracking the active clears when they start +struct MergeStreamNext { + KeyRef key; + int16_t streamIdx; + int dataIdx; +}; + +// the sort order is logically lower by key, and then higher by streamIdx +// because a priority queue is backwards, we invert that +struct OrderForPriorityQueue { + int commonPrefixLen; + OrderForPriorityQueue(int commonPrefixLen) : commonPrefixLen(commonPrefixLen) {} + + bool operator()(MergeStreamNext const& a, MergeStreamNext const& b) const { + int keyCmp = a.key.compareSuffix(b.key, commonPrefixLen); + if (keyCmp != 0) { + return keyCmp > 0; // reverse + } + return a.streamIdx < b.streamIdx; + } +}; + +typedef std::priority_queue, OrderForPriorityQueue> MergePQ; + +static RangeResult mergeDeltaStreams(const BlobGranuleChunkRef& chunk, + const std::vector>>& streams, + const std::vector startClears) { + ASSERT(streams.size() < std::numeric_limits::max()); + ASSERT(startClears.size() == streams.size()); + + int prefixLen = commonPrefixLength(chunk.keyRange.begin, chunk.keyRange.end); + + // next element for each stream + MergePQ next = MergePQ(OrderForPriorityQueue(prefixLen)); + + // efficiently find the highest stream's active clear + std::set> activeClears; + int16_t maxActiveClear = -1; + + // check if a given stream is actively clearing + bool clearActive[streams.size()]; + for (int16_t i = 0; i < streams.size(); i++) { + clearActive[i] = startClears[i]; + if (startClears[i]) { + activeClears.insert(i); + maxActiveClear = i; + } + if (streams[i].empty()) { + // single clear that entirely encases partial read bounds + ASSERT(clearActive[i]); + } else { + MergeStreamNext item; + item.key = streams[i][0].key; + item.streamIdx = i; + item.dataIdx = 0; + next.push(item); + } + } + + RangeResult result; + std::vector cur; + cur.reserve(streams.size()); + while (!next.empty()) { + cur.clear(); + cur.push_back(next.top()); + next.pop(); + + // next.top().key == cur.front().key but with suffix comparison + while (!next.empty() && cur.front().key.compareSuffix(next.top().key, prefixLen) == 0) { + cur.push_back(next.top()); + next.pop(); + } + + // un-set clears and find latest value for key (if present) + bool foundValue = false; + for (auto& it : cur) { + auto& v = streams[it.streamIdx][it.dataIdx]; + if (clearActive[it.streamIdx]) { + clearActive[it.streamIdx] = false; + activeClears.erase(it.streamIdx); + if (it.streamIdx == maxActiveClear) { + // re-get max active clear + maxActiveClear = activeClears.empty() ? -1 : *activeClears.begin(); + } + } + + // find value for this key (if any) + if (!foundValue && !v.isNoOp()) { + foundValue = true; + // if it's a clear, or maxActiveClear is higher, no value for this key + if (v.isSet() && maxActiveClear < it.streamIdx) { + KeyRef finalKey = + chunk.tenantPrefix.present() ? v.key.removePrefix(chunk.tenantPrefix.get()) : v.key; + result.push_back_deep(result.arena(), KeyValueRef(finalKey, v.value)); + } + } + } + + // advance streams and start clearAfter + for (auto& it : cur) { + if (streams[it.streamIdx][it.dataIdx].clearAfter) { + clearActive[it.streamIdx] = true; + activeClears.insert(it.streamIdx); + maxActiveClear = std::max(maxActiveClear, it.streamIdx); + } + // TODO: implement skipping if large clear!! 
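// A minimal sketch of the k-way merge mergeDeltaStreams() performs above, ignoring range clears and the
// shared-prefix comparison optimization: each input stream is sorted by key, streams with a higher index
// take write precedence, and a priority queue pops the smallest key first, with the highest-precedence
// stream winning ties. Plain std:: containers stand in for the FDB VectorRef types.
#include <cstddef>
#include <queue>
#include <string>
#include <utility>
#include <vector>

struct StreamCursor {
    std::string key;
    size_t streamIdx;
    size_t dataIdx;
};

// std::priority_queue pops the *largest* element, so the comparator is inverted to get the smallest key
// (and, among equal keys, the highest stream index) on top.
struct CursorOrder {
    bool operator()(const StreamCursor& a, const StreamCursor& b) const {
        if (a.key != b.key) {
            return a.key > b.key;
        }
        return a.streamIdx < b.streamIdx;
    }
};

using KV = std::pair<std::string, std::string>;

static std::vector<KV> mergeStreams(const std::vector<std::vector<KV>>& streams) {
    std::priority_queue<StreamCursor, std::vector<StreamCursor>, CursorOrder> next;
    for (size_t i = 0; i < streams.size(); i++) {
        if (!streams[i].empty()) {
            next.push({ streams[i][0].first, i, 0 });
        }
    }
    std::vector<KV> result;
    while (!next.empty()) {
        std::vector<StreamCursor> cur;
        cur.push_back(next.top());
        next.pop();
        // gather every stream positioned at the same key; cur.front() has the highest precedence
        while (!next.empty() && next.top().key == cur.front().key) {
            cur.push_back(next.top());
            next.pop();
        }
        const StreamCursor& winner = cur.front();
        result.push_back(streams[winner.streamIdx][winner.dataIdx]);
        // advance every cursor that was positioned at this key
        for (auto& c : cur) {
            c.dataIdx++;
            if (c.dataIdx < streams[c.streamIdx].size()) {
                c.key = streams[c.streamIdx][c.dataIdx].first;
                next.push(c);
            }
        }
    }
    return result;
}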
+ // if (maxClearIdx > it.streamIdx) - skip + it.dataIdx++; + if (it.dataIdx < streams[it.streamIdx].size()) { + it.key = streams[it.streamIdx][it.dataIdx].key; + next.push(it); + } + } + } + + return result; } RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, @@ -811,8 +1431,6 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, // FIXME: probably some threshold of a small percentage of the data is actually changed, where it makes sense to // just to dependsOn instead of copy, to use a little extra memory footprint to help cpu? Arena arena; - std::map dataMap; - Version lastFileEndVersion = invalidVersion; KeyRange requestRange; if (chunk.tenantPrefix.present()) { requestRange = keyRange.withPrefix(chunk.tenantPrefix.get()); @@ -820,41 +1438,79 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, requestRange = keyRange; } + std::vector>> streams; + std::vector startClears; + // +1 for possible snapshot, +1 for possible memory deltas + streams.reserve(chunk.deltaFiles.size() + 2); + if (snapshotData.present()) { - Arena snapshotArena = loadSnapshotFile(snapshotData.get(), requestRange, dataMap, chunk.cipherKeysCtx); - arena.dependsOn(snapshotArena); + ASSERT(chunk.snapshotFile.present()); + Standalone> snapshotRows = + loadSnapshotFile(chunk.snapshotFile.get().filename, + snapshotData.get(), + requestRange, + chunk.snapshotFile.get().cipherKeysCtx); + if (!snapshotRows.empty()) { + streams.push_back(snapshotRows); + startClears.push_back(false); + arena.dependsOn(streams.back().arena()); + } } if (BG_READ_DEBUG) { fmt::print("Applying {} delta files\n", chunk.deltaFiles.size()); } for (int deltaIdx = 0; deltaIdx < chunk.deltaFiles.size(); deltaIdx++) { - Arena deltaArena = loadDeltaFile( - deltaFileData[deltaIdx], requestRange, beginVersion, readVersion, lastFileEndVersion, dataMap); - arena.dependsOn(deltaArena); + bool startClear = false; + auto deltaRows = loadChunkedDeltaFile(chunk.deltaFiles[deltaIdx].filename, + deltaFileData[deltaIdx], + requestRange, + beginVersion, + readVersion, + chunk.deltaFiles[deltaIdx].cipherKeysCtx, + startClear); + if (startClear || !deltaRows.empty()) { + streams.push_back(deltaRows); + startClears.push_back(startClear); + arena.dependsOn(streams.back().arena()); + } + arena.dependsOn(deltaRows.arena()); } if (BG_READ_DEBUG) { fmt::print("Applying {} memory deltas\n", chunk.newDeltas.size()); } - applyDeltas(chunk.newDeltas, requestRange, beginVersion, readVersion, lastFileEndVersion, dataMap); - - RangeResult ret; - for (auto& it : dataMap) { - ret.push_back_deep( - ret.arena(), - KeyValueRef(chunk.tenantPrefix.present() ? 
it.first.removePrefix(chunk.tenantPrefix.get()) : it.first, - it.second)); + if (!chunk.newDeltas.empty()) { + // TODO REMOVE validation + ASSERT(beginVersion <= chunk.newDeltas.front().version); + ASSERT(readVersion >= chunk.newDeltas.back().version); + auto memoryRows = sortMemoryDeltas(chunk.newDeltas, chunk.keyRange, requestRange, beginVersion, readVersion); + if (!memoryRows.empty()) { + streams.push_back(memoryRows); + startClears.push_back(false); + arena.dependsOn(streams.back().arena()); + } } - return ret; + return mergeDeltaStreams(chunk, streams, startClears); } +struct GranuleLoadFreeHandle : NonCopyable, ReferenceCounted { + const ReadBlobGranuleContext* granuleContext; + int64_t loadId; + + GranuleLoadFreeHandle(const ReadBlobGranuleContext* granuleContext, int64_t loadId) + : granuleContext(granuleContext), loadId(loadId) {} + + ~GranuleLoadFreeHandle() { granuleContext->free_load_f(loadId, granuleContext->userContext); } +}; + struct GranuleLoadIds { Optional snapshotId; std::vector deltaIds; + std::vector> freeHandles; }; -static void startLoad(const ReadBlobGranuleContext granuleContext, +static void startLoad(const ReadBlobGranuleContext* granuleContext, const BlobGranuleChunkRef& chunk, GranuleLoadIds& loadIds) { @@ -864,12 +1520,13 @@ static void startLoad(const ReadBlobGranuleContext granuleContext, // FIXME: remove when we implement file multiplexing ASSERT(chunk.snapshotFile.get().offset == 0); ASSERT(chunk.snapshotFile.get().length == chunk.snapshotFile.get().fullFileLength); - loadIds.snapshotId = granuleContext.start_load_f(snapshotFname.c_str(), - snapshotFname.size(), - chunk.snapshotFile.get().offset, - chunk.snapshotFile.get().length, - chunk.snapshotFile.get().fullFileLength, - granuleContext.userContext); + loadIds.snapshotId = granuleContext->start_load_f(snapshotFname.c_str(), + snapshotFname.size(), + chunk.snapshotFile.get().offset, + chunk.snapshotFile.get().length, + chunk.snapshotFile.get().fullFileLength, + granuleContext->userContext); + loadIds.freeHandles.push_back(makeReference(granuleContext, loadIds.snapshotId.get())); } loadIds.deltaIds.reserve(chunk.deltaFiles.size()); for (int deltaFileIdx = 0; deltaFileIdx < chunk.deltaFiles.size(); deltaFileIdx++) { @@ -877,13 +1534,14 @@ static void startLoad(const ReadBlobGranuleContext granuleContext, // FIXME: remove when we implement file multiplexing ASSERT(chunk.deltaFiles[deltaFileIdx].offset == 0); ASSERT(chunk.deltaFiles[deltaFileIdx].length == chunk.deltaFiles[deltaFileIdx].fullFileLength); - int64_t deltaLoadId = granuleContext.start_load_f(deltaFName.c_str(), - deltaFName.size(), - chunk.deltaFiles[deltaFileIdx].offset, - chunk.deltaFiles[deltaFileIdx].length, - chunk.deltaFiles[deltaFileIdx].fullFileLength, - granuleContext.userContext); + int64_t deltaLoadId = granuleContext->start_load_f(deltaFName.c_str(), + deltaFName.size(), + chunk.deltaFiles[deltaFileIdx].offset, + chunk.deltaFiles[deltaFileIdx].length, + chunk.deltaFiles[deltaFileIdx].fullFileLength, + granuleContext->userContext); loadIds.deltaIds.push_back(deltaLoadId); + loadIds.freeHandles.push_back(makeReference(granuleContext, deltaLoadId)); } } @@ -891,7 +1549,8 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone loadAndMaterializeBlobGranules(const Standalone 1 - for (int i = 0; i < parallelism - 1 && i < files.size(); i++) { - startLoad(granuleContext, files[i], loadIds[i]); - } + int64_t inputBytes = 0; + int64_t outputBytes = 0; try { + // Kick off first file reads if parallelism > 1 + for (int i = 0; i < 
parallelism - 1 && i < files.size(); i++) { + startLoad(&granuleContext, files[i], loadIds[i]); + } RangeResult results; for (int chunkIdx = 0; chunkIdx < files.size(); chunkIdx++) { // Kick off files for this granule if parallelism == 1, or future granule if parallelism > 1 if (chunkIdx + parallelism - 1 < files.size()) { - startLoad(granuleContext, files[chunkIdx + parallelism - 1], loadIds[chunkIdx + parallelism - 1]); + startLoad(&granuleContext, files[chunkIdx + parallelism - 1], loadIds[chunkIdx + parallelism - 1]); } RangeResult chunkRows; @@ -926,9 +1586,11 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone(blob_granule_file_load_error()); } + inputBytes += snapshotData.get().size(); } - StringRef deltaData[files[chunkIdx].deltaFiles.size()]; + // +1 to avoid UBSAN variable length array of size zero + StringRef deltaData[files[chunkIdx].deltaFiles.size() + 1]; for (int i = 0; i < files[chunkIdx].deltaFiles.size(); i++) { deltaData[i] = StringRef(granuleContext.get_load_f(loadIds[chunkIdx].deltaIds[i], granuleContext.userContext), @@ -937,22 +1599,25 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone(blob_granule_file_load_error()); } + inputBytes += deltaData[i].size(); } + inputBytes += files[chunkIdx].newDeltas.expectedSize(); + // materialize rows from chunk chunkRows = materializeBlobGranule(files[chunkIdx], keyRange, beginVersion, readVersion, snapshotData, deltaData); + outputBytes += chunkRows.expectedSize(); + results.arena().dependsOn(chunkRows.arena()); results.append(results.arena(), chunkRows.begin(), chunkRows.size()); - if (loadIds[chunkIdx].snapshotId.present()) { - granuleContext.free_load_f(loadIds[chunkIdx].snapshotId.get(), granuleContext.userContext); - } - for (int i = 0; i < loadIds[chunkIdx].deltaIds.size(); i++) { - granuleContext.free_load_f(loadIds[chunkIdx].deltaIds[i], granuleContext.userContext); - } + // free once done by forcing FreeHandles to trigger + loadIds[chunkIdx].freeHandles.clear(); } + stats.inputBytes = inputBytes; + stats.outputBytes = outputBytes; return ErrorOr(results); } catch (Error& e) { return ErrorOr(e); @@ -976,7 +1641,7 @@ const EncryptCipherRandomSalt encryptSalt = deterministicRandom()->randomUInt64( Standalone getBaseCipher() { Standalone baseCipher = makeString(AES_256_KEY_LENGTH); - generateRandomData(mutateString(baseCipher), baseCipher.size()); + deterministicRandom()->randomBytes(mutateString(baseCipher), baseCipher.size()); return baseCipher; } @@ -996,7 +1661,7 @@ BlobGranuleCipherKeysCtx getCipherKeysCtx(Arena& arena) { cipherKeysCtx.headerCipherKey.baseCipher = StringRef(arena, encryptBaseCipher); cipherKeysCtx.ivRef = makeString(AES_256_IV_LENGTH, arena); - generateRandomData(mutateString(cipherKeysCtx.ivRef), AES_256_IV_LENGTH); + deterministicRandom()->randomBytes(mutateString(cipherKeysCtx.ivRef), AES_256_IV_LENGTH); return cipherKeysCtx; } @@ -1007,23 +1672,13 @@ TEST_CASE("/blobgranule/files/applyDelta") { printf("Testing blob granule delta applying\n"); Arena a; - // do this 2 phase arena creation of string refs instead of LiteralStringRef because there is no char* StringRef - // constructor, and valgrind might complain if the stringref data isn't in the arena - std::string sk_a = "A"; - std::string sk_ab = "AB"; - std::string sk_b = "B"; - std::string sk_c = "C"; - std::string sk_z = "Z"; - std::string sval1 = "1"; - std::string sval2 = "2"; - - StringRef k_a = StringRef(a, sk_a); - StringRef k_ab = StringRef(a, sk_ab); - StringRef k_b = StringRef(a, sk_b); - StringRef k_c = StringRef(a, sk_c); 
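// A minimal sketch of the RAII pattern GranuleLoadFreeHandle introduces above: each started load is wrapped
// in a handle whose destructor invokes the context's free callback, so every load is released exactly once
// even when materialization throws. std::shared_ptr and std::function stand in for FDB's ReferenceCounted
// machinery and the ReadBlobGranuleContext callback pointers.
#include <cstdint>
#include <functional>
#include <memory>
#include <utility>
#include <vector>

struct LoadFreeHandle {
    std::function<void(int64_t)> freeLoad; // e.g. wraps granuleContext->free_load_f(loadId, userContext)
    int64_t loadId;

    LoadFreeHandle(std::function<void(int64_t)> freeLoad, int64_t loadId)
      : freeLoad(std::move(freeLoad)), loadId(loadId) {}
    LoadFreeHandle(const LoadFreeHandle&) = delete;
    LoadFreeHandle& operator=(const LoadFreeHandle&) = delete;
    ~LoadFreeHandle() { freeLoad(loadId); }
};

struct LoadIds {
    std::vector<int64_t> deltaIds;
    std::vector<std::shared_ptr<LoadFreeHandle>> freeHandles; // cleared explicitly once a chunk is materialized
};

// Usage: push a handle for every started load. Whether the read succeeds, returns an error, or throws,
// destroying (or clearing) freeHandles releases each outstanding load exactly once.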
- StringRef k_z = StringRef(a, sk_z); - StringRef val1 = StringRef(a, sval1); - StringRef val2 = StringRef(a, sval2); + StringRef k_a = StringRef(a, "A"_sr); + StringRef k_ab = StringRef(a, "AB"_sr); + StringRef k_b = StringRef(a, "B"_sr); + StringRef k_c = StringRef(a, "C"_sr); + StringRef k_z = StringRef(a, "Z"_sr); + StringRef val1 = StringRef(a, "1"_sr); + StringRef val2 = StringRef(a, "2"_sr); std::map data; data.insert({ k_a, val1 }); @@ -1150,92 +1805,440 @@ TEST_CASE("/blobgranule/files/applyDelta") { return Void(); } -// picks a number between 2^minExp and 2^maxExp, but uniformly distributed over exponential buckets 2^n an 2^n+1 -int randomExp(int minExp, int maxExp) { - if (minExp == maxExp) { // N=2, case - return 1 << minExp; +void checkDeltaAtVersion(const ParsedDeltaBoundaryRef& expected, + const DeltaBoundaryRef& boundary, + Version beginVersion, + Version readVersion) { + ParsedDeltaBoundaryRef actual = deltaAtVersion(boundary, beginVersion, readVersion); + ASSERT(expected.clearAfter == actual.clearAfter); + ASSERT(expected.op == actual.op); + if (expected.isSet()) { + ASSERT(expected.value == actual.value); + } else { + ASSERT(actual.value.empty()); } - int val = 1 << deterministicRandom()->randomInt(minExp, maxExp); - ASSERT(val > 0); - return deterministicRandom()->randomInt(val, val * 2); } -void checkEmpty(const Value& serialized, Key begin, Key end, Optional cipherKeysCtx) { - std::map result; - Arena ar = loadSnapshotFile(serialized, KeyRangeRef(begin, end), result, cipherKeysCtx); +TEST_CASE("/blobgranule/files/deltaAtVersion") { + Arena ar; + std::string keyStr = "k"; + std::string aStr = "a"; + + KeyRef key(ar, keyStr); + ValueAndVersionRef vv_a_3(3, ValueRef(ar, aStr)); + ValueAndVersionRef vv_clear_5(5); + + ParsedDeltaBoundaryRef resultEmpty(key, false); + ParsedDeltaBoundaryRef resultEmptyWithClear(key, true); + ParsedDeltaBoundaryRef resultSetA(key, false, vv_a_3); + ParsedDeltaBoundaryRef resultClearA(key, true, vv_clear_5); + + // test empty boundary ref + DeltaBoundaryRef boundaryEmpty; + boundaryEmpty.key = key; + checkDeltaAtVersion(resultEmpty, boundaryEmpty, 0, 2); + + // test empty boundary with clear + DeltaBoundaryRef boundaryEmptyWithClear; + boundaryEmptyWithClear.key = key; + boundaryEmptyWithClear.clearVersion = 5; + + // higher read version includes clear + checkDeltaAtVersion(resultEmptyWithClear, boundaryEmptyWithClear, 0, 5); + checkDeltaAtVersion(resultEmptyWithClear, boundaryEmptyWithClear, 0, 10); + checkDeltaAtVersion(resultEmptyWithClear, boundaryEmptyWithClear, 2, 5); + checkDeltaAtVersion(resultEmptyWithClear, boundaryEmptyWithClear, 2, 10); + checkDeltaAtVersion(resultEmptyWithClear, boundaryEmptyWithClear, 5, 10); + checkDeltaAtVersion(resultEmptyWithClear, boundaryEmptyWithClear, 5, 5); + + // lower read version does not include clear + checkDeltaAtVersion(resultEmpty, boundaryEmptyWithClear, 0, 4); + checkDeltaAtVersion(resultEmpty, boundaryEmptyWithClear, 3, 4); + + // higher read version but also higher beginVersion does not include clear + checkDeltaAtVersion(resultEmpty, boundaryEmptyWithClear, 6, 10); + + // check values + DeltaBoundaryRef fullBoundary; + fullBoundary.key = key; + fullBoundary.values.push_back(ar, vv_a_3); + fullBoundary.values.push_back(ar, vv_clear_5); + fullBoundary.clearVersion = 5; + + checkDeltaAtVersion(resultEmpty, fullBoundary, 0, 2); + checkDeltaAtVersion(resultEmpty, fullBoundary, 6, 10); + checkDeltaAtVersion(resultEmpty, fullBoundary, 4, 4); + + checkDeltaAtVersion(resultSetA, fullBoundary, 0, 
3); + checkDeltaAtVersion(resultSetA, fullBoundary, 3, 4); + + checkDeltaAtVersion(resultClearA, fullBoundary, 0, 5); + checkDeltaAtVersion(resultClearA, fullBoundary, 0, 10); + checkDeltaAtVersion(resultClearA, fullBoundary, 3, 5); + checkDeltaAtVersion(resultClearA, fullBoundary, 4, 5); + + return Void(); +} + +void checkSnapshotEmpty(const Value& serialized, Key begin, Key end, Optional cipherKeysCtx) { + Standalone fileNameRef = StringRef(); + Standalone> result = + loadSnapshotFile(fileNameRef, serialized, KeyRangeRef(begin, end), cipherKeysCtx); ASSERT(result.empty()); } // endIdx is exclusive -void checkRead(const Standalone& snapshot, - const Value& serialized, - int beginIdx, - int endIdx, - Optional cipherKeysCtx) { +void checkSnapshotRead(const Standalone& fileNameRef, + const Standalone& snapshot, + const Value& serialized, + int beginIdx, + int endIdx, + Optional cipherKeysCtx) { ASSERT(beginIdx < endIdx); ASSERT(endIdx <= snapshot.size()); - std::map result; KeyRef beginKey = snapshot[beginIdx].key; Key endKey = endIdx == snapshot.size() ? keyAfter(snapshot.back().key) : snapshot[endIdx].key; KeyRangeRef range(beginKey, endKey); - Arena ar = loadSnapshotFile(serialized, range, result, cipherKeysCtx); + fmt::print("Reading [{0} - {1})\n", beginKey.printable(), endKey.printable()); + + Standalone> result = + loadSnapshotFile(fileNameRef, serialized, range, cipherKeysCtx); if (result.size() != endIdx - beginIdx) { fmt::print("Read {0} rows != {1}\n", result.size(), endIdx - beginIdx); } + + if (BG_FILES_TEST_DEBUG) { + fmt::print("Expected Data {0}:\n", result.size()); + for (auto& it : result) { + fmt::print(" {0}=\n", it.key.printable()); + } + fmt::print("Actual Data {0}:\n", endIdx - beginIdx); + for (int i = beginIdx; i < endIdx; i++) { + fmt::print(" {0}=\n", snapshot[i].key.printable()); + } + } + ASSERT(result.size() == endIdx - beginIdx); for (auto& it : result) { - if (it.first != snapshot[beginIdx].key) { - fmt::print("Key {0} != {1}\n", it.first.printable(), snapshot[beginIdx].key.printable()); + ASSERT(it.isSet()); + if (it.key != snapshot[beginIdx].key) { + fmt::print("Key {0} != {1}\n", it.key.printable(), snapshot[beginIdx].key.printable()); } - ASSERT(it.first == snapshot[beginIdx].key); - if (it.first != snapshot[beginIdx].key) { + ASSERT(it.key == snapshot[beginIdx].key); + if (it.key != snapshot[beginIdx].key) { fmt::print("Value {0} != {1} for Key {2}\n", - it.second.printable(), + it.value.printable(), snapshot[beginIdx].value.printable(), - it.first.printable()); + it.key.printable()); } - ASSERT(it.second == snapshot[beginIdx].value); + ASSERT(it.value == snapshot[beginIdx].value); beginIdx++; } } -TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") { - // snapshot files are likely to have a non-trivial shared prefix since they're for a small contiguous key range - std::string sharedPrefix = deterministicRandom()->randomUniqueID().toString(); - int uidSize = sharedPrefix.size(); - int sharedPrefixLen = deterministicRandom()->randomInt(0, uidSize); - int targetKeyLength = deterministicRandom()->randomInt(4, uidSize); - sharedPrefix = sharedPrefix.substr(0, sharedPrefixLen) + "_"; +namespace { - int targetValueLen = randomExp(0, 12); - int targetChunks = randomExp(0, 9); - int targetDataBytes = randomExp(0, 25); +size_t uidSize = 32; - std::unordered_set usedKeys; - Standalone data; - int totalDataBytes = 0; - while (totalDataBytes < targetDataBytes) { - int keySize = deterministicRandom()->randomInt(targetKeyLength / 2, targetKeyLength * 3 / 2); - 
keySize = std::min(keySize, uidSize); - std::string key = sharedPrefix + deterministicRandom()->randomUniqueID().toString().substr(0, keySize); - if (usedKeys.insert(key).second) { - int valueSize = deterministicRandom()->randomInt(targetValueLen / 2, targetValueLen * 3 / 2); - std::string value = deterministicRandom()->randomUniqueID().toString(); - if (value.size() > valueSize) { - value = value.substr(0, valueSize); - } - if (value.size() < valueSize) { - value += std::string(valueSize - value.size(), 'x'); - } +struct KeyValueGen { + Arena ar; + std::string sharedPrefix; + int targetKeyLength; + int targetValueLength; + std::set usedKeys; + std::vector usedKeysList; + double clearFrequency; + double clearUnsetFrequency; + double updateExistingKeyFrequency; + int minVersionIncrease; + int maxVersionIncrease; + int targetMutationsPerDelta; + KeyRange allRange; - data.push_back_deep(data.arena(), KeyValueRef(KeyRef(key), ValueRef(value))); - totalDataBytes += key.size() + value.size(); + Version version = 0; + + // encryption/compression settings + // TODO: possibly different cipher keys or meta context per file? + Optional cipherKeys; + Optional compressFilter; + + KeyValueGen() { + sharedPrefix = deterministicRandom()->randomUniqueID().toString(); + ASSERT(sharedPrefix.size() == uidSize); + int sharedPrefixLen = deterministicRandom()->randomInt(0, uidSize); + targetKeyLength = deterministicRandom()->randomInt(4, uidSize); + sharedPrefix = sharedPrefix.substr(0, sharedPrefixLen) + "_"; + targetValueLength = deterministicRandom()->randomExp(0, 12); + allRange = KeyRangeRef(StringRef(sharedPrefix), + sharedPrefix.size() == 0 ? "\xff"_sr : strinc(StringRef(sharedPrefix))); + + if (deterministicRandom()->coinflip()) { + clearFrequency = 0.0; + clearUnsetFrequency = 0.0; + } else { + clearFrequency = deterministicRandom()->random01() / 2; + // clearing an unset value has no effect on the results, we mostly just want to make sure the format doesn't + // barf + clearUnsetFrequency = deterministicRandom()->random01() / 10; + } + if (deterministicRandom()->random01() < 0.2) { + // no updates, only new writes + updateExistingKeyFrequency = 0.0; + } else { + updateExistingKeyFrequency = deterministicRandom()->random01(); + } + if (deterministicRandom()->coinflip()) { + // sequential versions + minVersionIncrease = 1; + maxVersionIncrease = 2; + } else { + minVersionIncrease = deterministicRandom()->randomExp(0, 25); + maxVersionIncrease = minVersionIncrease + deterministicRandom()->randomExp(0, 25); + } + if (deterministicRandom()->coinflip()) { + targetMutationsPerDelta = 1; + } else { + targetMutationsPerDelta = deterministicRandom()->randomExp(1, 5); + } + + if (deterministicRandom()->coinflip()) { + cipherKeys = getCipherKeysCtx(ar); + } + if (deterministicRandom()->coinflip()) { + compressFilter = CompressionUtils::getRandomFilter(); } } + Optional newKey() { + for (int nAttempt = 0; nAttempt < 1000; nAttempt++) { + size_t keySize = deterministicRandom()->randomInt(targetKeyLength / 2, targetKeyLength * 3 / 2); + keySize = std::min(keySize, uidSize); + std::string key = sharedPrefix + deterministicRandom()->randomUniqueID().toString().substr(0, keySize); + if (usedKeys.insert(key).second) { + StringRef k(ar, key); + usedKeysList.push_back(k); + return k; + } + } + return {}; + } + + StringRef value() { + int valueSize = deterministicRandom()->randomInt(targetValueLength / 2, targetValueLength * 3 / 2); + std::string value = deterministicRandom()->randomUniqueID().toString(); + if (value.size() 
> valueSize) { + value = value.substr(0, valueSize); + } + if (value.size() < valueSize) { + // repeated string so it's compressible + value += std::string(valueSize - value.size(), 'x'); + } + return StringRef(ar, value); + } + + KeyRef randomUsedKey() const { return usedKeysList[deterministicRandom()->randomInt(0, usedKeysList.size())]; } + + KeyRange randomKeyRange() const { + ASSERT(!usedKeysList.empty()); + Key begin = randomUsedKey(); + if (deterministicRandom()->coinflip()) { + begin = keyAfter(begin); + } + if (usedKeysList.size() == 1) { + return KeyRange(KeyRangeRef(begin, keyAfter(begin))); + } else { + Key end = begin; + while (end == begin) { + end = randomUsedKey(); + } + if (deterministicRandom()->coinflip()) { + end = keyAfter(end); + } + if (begin < end) { + return KeyRangeRef(begin, end); + } else { + return KeyRangeRef(end, begin); + } + } + } + + StringRef keyForUpdate(double probUseExisting) { + if (!usedKeysList.empty() && deterministicRandom()->random01() < probUseExisting) { + return randomUsedKey(); + } else { + auto k = newKey(); + if (k.present()) { + return k.get(); + } else { + // use existing key instead + ASSERT(!usedKeysList.empty()); + return randomUsedKey(); + } + } + } + + Version nextVersion() { + Version jump = deterministicRandom()->randomInt(minVersionIncrease, maxVersionIncrease); + version += jump; + return version; + } + + MutationRef newMutation() { + if (deterministicRandom()->random01() < clearFrequency) { + // The algorithm for generating clears of varying sizes is, to generate clear sizes based on an exponential + // distribution, such that the expected value of the clear size is 2. + int clearWidth = 1; + while (clearWidth < usedKeys.size() && deterministicRandom()->coinflip()) { + clearWidth *= 2; + } + bool clearPastEnd = deterministicRandom()->coinflip(); + if (clearPastEnd) { + clearWidth--; + } + StringRef begin = keyForUpdate(1.0 - clearUnsetFrequency); + std::string beginStr = begin.toString(); + auto it = usedKeys.find(beginStr); + ASSERT(it != usedKeys.end()); + while (it != usedKeys.end() && clearWidth > 0) { + it++; + clearWidth--; + } + if (it == usedKeys.end()) { + it--; + clearPastEnd = true; + } + std::string endKey = *it; + if (clearPastEnd) { + Key end = keyAfter(StringRef(ar, endKey)); + ar.dependsOn(end.arena()); + return MutationRef(MutationRef::ClearRange, begin, end); + } else { + // clear up to end + return MutationRef(MutationRef::ClearRange, begin, StringRef(ar, endKey)); + } + + } else { + return MutationRef(MutationRef::SetValue, keyForUpdate(updateExistingKeyFrequency), value()); + } + } + + MutationsAndVersionRef newDelta() { + Version v = nextVersion(); + int mutationCount = deterministicRandom()->randomInt(1, targetMutationsPerDelta * 2); + MutationsAndVersionRef ret(v, v); + for (int i = 0; i < mutationCount; i++) { + ret.mutations.push_back(ar, newMutation()); + } + return ret; + } +}; + +} // namespace + +Standalone genSnapshot(KeyValueGen& kvGen, int targetDataBytes) { + Standalone data; + int totalDataBytes = 0; + while (totalDataBytes < targetDataBytes) { + Optional key = kvGen.newKey(); + if (!key.present()) { + break; + } + StringRef value = kvGen.value(); + + data.push_back_deep(data.arena(), KeyValueRef(KeyRef(key.get()), ValueRef(value))); + totalDataBytes += key.get().size() + value.size(); + } + std::sort(data.begin(), data.end(), KeyValueRef::OrderByKey()); + return data; +} + +Standalone genDeltas(KeyValueGen& kvGen, int targetBytes) { + Standalone data; + int totalDataBytes = 0; + while 
(totalDataBytes < targetBytes) { + data.push_back(data.arena(), kvGen.newDelta()); + totalDataBytes += data.back().expectedSize(); + } + return data; +} + +TEST_CASE("/blobgranule/files/validateEncryptionCompression") { + KeyValueGen kvGen; + + int targetSnapshotChunks = deterministicRandom()->randomExp(0, 9); + int targetDeltaChunks = deterministicRandom()->randomExp(0, 8); + int targetDataBytes = deterministicRandom()->randomExp(12, 25); + int targetSnapshotBytes = (int)(deterministicRandom()->randomInt(0, targetDataBytes)); + int targetDeltaBytes = targetDataBytes - targetSnapshotBytes; + + int targetSnapshotChunkSize = targetSnapshotBytes / targetSnapshotChunks; + int targetDeltaChunkSize = targetDeltaBytes / targetDeltaChunks; + + Standalone snapshotData = genSnapshot(kvGen, targetSnapshotBytes); + Standalone deltaData = genDeltas(kvGen, targetDeltaBytes); + fmt::print("{0} snapshot rows and {1} deltas\n", snapshotData.size(), deltaData.size()); + + Standalone fileNameRef = StringRef(); + + Arena ar; + BlobGranuleCipherKeysCtx cipherKeys = getCipherKeysCtx(ar); + std::vector encryptionModes = { false, true }; + std::vector> compressionModes; + compressionModes.insert( + compressionModes.end(), CompressionUtils::supportedFilters.begin(), CompressionUtils::supportedFilters.end()); + + std::vector snapshotValues; + for (bool encryptionMode : encryptionModes) { + Optional keys = encryptionMode ? cipherKeys : Optional(); + for (auto& compressionMode : compressionModes) { + Value v = + serializeChunkedSnapshot(fileNameRef, snapshotData, targetSnapshotChunkSize, compressionMode, keys); + fmt::print("snapshot({0}, {1}): {2}\n", + encryptionMode, + compressionMode.present() ? CompressionUtils::toString(compressionMode.get()) : "", + v.size()); + for (auto& v2 : snapshotValues) { + ASSERT(v != v2); + } + snapshotValues.push_back(v); + } + } + fmt::print("Validated {0} encryption/compression combos for snapshot\n", snapshotValues.size()); + + std::vector deltaValues; + for (bool encryptionMode : encryptionModes) { + Optional keys = encryptionMode ? cipherKeys : Optional(); + for (auto& compressionMode : compressionModes) { + Value v = serializeChunkedDeltaFile( + fileNameRef, deltaData, kvGen.allRange, targetDeltaChunkSize, compressionMode, keys); + fmt::print("delta({0}, {1}): {2}\n", + encryptionMode, + compressionMode.present() ? 
CompressionUtils::toString(compressionMode.get()) : "", + v.size()); + for (auto& v2 : deltaValues) { + ASSERT(v != v2); + } + deltaValues.push_back(v); + } + } + fmt::print("Validated {0} encryption/compression combos for delta\n", deltaValues.size()); + + return Void(); +} + +TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") { + // snapshot files are likely to have a non-trivial shared prefix since they're for a small contiguous key range + KeyValueGen kvGen; + + int targetChunks = deterministicRandom()->randomExp(0, 9); + int targetDataBytes = deterministicRandom()->randomExp(0, 25); + int targetChunkSize = targetDataBytes / targetChunks; + Standalone fnameRef = StringRef(std::string("test")); + + Standalone data = genSnapshot(kvGen, targetDataBytes); int maxExp = 0; while (1 << maxExp < data.size()) { @@ -1248,24 +2251,10 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") { ASSERT(data[i].key < data[i + 1].key); } - fmt::print( - "Constructing snapshot with {0} rows, {1} bytes, and {2} chunks\n", data.size(), totalDataBytes, targetChunks); + fmt::print("Constructing snapshot with {0} rows, {1} chunks\n", data.size(), targetChunks); - Optional cipherKeysCtx = Optional(); - Arena arena; - if (deterministicRandom()->coinflip()) { - cipherKeysCtx = getCipherKeysCtx(arena); - } - - Optional compressFilter; - if (deterministicRandom()->coinflip()) { -#ifdef ZLIB_LIB_SUPPORTED - compressFilter = CompressionFilter::GZIP; -#else - compressFilter = CompressionFilter::NONE; -#endif - } - Value serialized = serializeChunkedSnapshot(data, targetChunks, compressFilter, cipherKeysCtx); + Value serialized = + serializeChunkedSnapshot(fnameRef, data, targetChunkSize, kvGen.compressFilter, kvGen.cipherKeys); fmt::print("Snapshot serialized! {0} bytes\n", serialized.size()); @@ -1276,31 +2265,851 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") { fmt::print("Initial read starting\n"); - checkRead(data, serialized, 0, data.size(), cipherKeysCtx); + checkSnapshotRead(fnameRef, data, serialized, 0, data.size(), kvGen.cipherKeys); fmt::print("Initial read complete\n"); if (data.size() > 1) { for (int i = 0; i < std::min(100, data.size() * 2); i++) { - int width = randomExp(0, maxExp); + int width = deterministicRandom()->randomExp(0, maxExp); ASSERT(width <= data.size()); int start = deterministicRandom()->randomInt(0, data.size() - width); - checkRead(data, serialized, start, start + width, cipherKeysCtx); + checkSnapshotRead(fnameRef, data, serialized, start, start + width, kvGen.cipherKeys); } fmt::print("Doing empty checks\n"); int randomIdx = deterministicRandom()->randomInt(0, data.size() - 1); - checkEmpty(serialized, keyAfter(data[randomIdx].key), data[randomIdx + 1].key, cipherKeysCtx); + checkSnapshotEmpty(serialized, keyAfter(data[randomIdx].key), data[randomIdx + 1].key, kvGen.cipherKeys); } else { fmt::print("Doing empty checks\n"); } - checkEmpty(serialized, normalKeys.begin, data.front().key, cipherKeysCtx); - checkEmpty(serialized, normalKeys.begin, LiteralStringRef("\x00"), cipherKeysCtx); - checkEmpty(serialized, keyAfter(data.back().key), normalKeys.end, cipherKeysCtx); - checkEmpty(serialized, LiteralStringRef("\xfe"), normalKeys.end, cipherKeysCtx); + checkSnapshotEmpty(serialized, normalKeys.begin, data.front().key, kvGen.cipherKeys); + checkSnapshotEmpty(serialized, normalKeys.begin, "\x00"_sr, kvGen.cipherKeys); + checkSnapshotEmpty(serialized, keyAfter(data.back().key), normalKeys.end, kvGen.cipherKeys); + checkSnapshotEmpty(serialized, "\xfe"_sr, normalKeys.end, 
kvGen.cipherKeys); fmt::print("Snapshot format test done!\n"); return Void(); } + +void checkDeltaRead(const KeyValueGen& kvGen, + const KeyRangeRef& range, + Version beginVersion, + Version readVersion, + const Standalone& data, + StringRef* serialized) { + // expected answer + std::map expectedData; + Version lastFileEndVersion = 0; + + fmt::print("Delta Read [{0} - {1}) @ {2} - {3}\n", + range.begin.printable(), + range.end.printable(), + beginVersion, + readVersion); + + applyDeltasByVersion(data, range, beginVersion, readVersion, lastFileEndVersion, expectedData); + + // actual answer + std::string filename = randomBGFilename( + deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUniqueID(), readVersion, ".delta"); + Standalone chunk; + chunk.deltaFiles.emplace_back_deep( + chunk.arena(), filename, 0, serialized->size(), serialized->size(), kvGen.cipherKeys); + chunk.keyRange = kvGen.allRange; + chunk.includedVersion = readVersion; + chunk.snapshotVersion = invalidVersion; + + RangeResult actualData = materializeBlobGranule(chunk, range, beginVersion, readVersion, {}, serialized); + + if (expectedData.size() != actualData.size()) { + fmt::print("Expected Data {0}:\n", expectedData.size()); + /*for (auto& it : expectedData) { + fmt::print(" {0}=\n", it.first.printable()); + }*/ + fmt::print("Actual Data {0}:\n", actualData.size()); + /*for (auto& it : actualData) { + fmt::print(" {0}=\n", it.key.printable()); + }*/ + } + + ASSERT(expectedData.size() == actualData.size()); + int i = 0; + for (auto& it : expectedData) { + ASSERT(it.first == actualData[i].key); + ASSERT(it.second == actualData[i].value); + i++; + } +} + +static std::tuple randomizeKeyAndVersions(const KeyValueGen& kvGen, + const Standalone data) { + // either randomize just keyrange, just version range, or both + double rand = deterministicRandom()->randomInt(0, 3); + bool randomizeKeyRange = rand == 0 || rand == 2; + bool randomizeVersionRange = rand == 1 || rand == 2; + KeyRange readRange = kvGen.allRange; + Version beginVersion = 0; + Version readVersion = data.back().version; + + if (randomizeKeyRange) { + readRange = kvGen.randomKeyRange(); + } + + if (randomizeVersionRange) { + if (deterministicRandom()->coinflip()) { + beginVersion = 0; + } else { + beginVersion = data[deterministicRandom()->randomInt(0, data.size())].version; + beginVersion += deterministicRandom()->randomInt(0, 3) - 1; // randomize between -1, 0, and +1 + } + readVersion = data[deterministicRandom()->randomInt(0, data.size())].version; + readVersion += deterministicRandom()->randomInt(0, 3) - 1; // randomize between -1, 0, and +1 + if (readVersion < beginVersion) { + std::swap(beginVersion, readVersion); + } + } + + return { readRange, beginVersion, readVersion }; +} + +TEST_CASE("/blobgranule/files/deltaFormatUnitTest") { + KeyValueGen kvGen; + Standalone fileNameRef = StringRef(std::string("test")); + + int targetChunks = deterministicRandom()->randomExp(0, 8); + int targetDataBytes = deterministicRandom()->randomExp(0, 21); + int targetChunkSize = targetDataBytes / targetChunks; + + Standalone data = genDeltas(kvGen, targetDataBytes); + + fmt::print("Deltas ({0})\n", data.size()); + /*for (auto& it : data) { + fmt::print(" {0}) ({1})\n", it.version, it.mutations.size()); + for (auto& it2 : it.mutations) { + if (it2.type == MutationRef::Type::SetValue) { + fmt::print(" {0}=\n", it2.param1.printable()); + } else { + fmt::print(" {0} - {1}\n", it2.param1.printable(), it2.param2.printable()); + } + } + }*/ + Value serialized = 
serializeChunkedDeltaFile( + fileNameRef, data, kvGen.allRange, targetChunkSize, kvGen.compressFilter, kvGen.cipherKeys); + + // check whole file + checkDeltaRead(kvGen, kvGen.allRange, 0, data.back().version, data, &serialized); + + for (int i = 0; i < std::min((size_t)100, kvGen.usedKeysList.size() * data.size()); i++) { + auto params = randomizeKeyAndVersions(kvGen, data); + checkDeltaRead(kvGen, std::get<0>(params), std::get<1>(params), std::get<2>(params), data, &serialized); + } + + return Void(); +} + +void checkGranuleRead(const KeyValueGen& kvGen, + const KeyRangeRef& range, + Version beginVersion, + Version readVersion, + const Standalone& snapshotData, + const Standalone& deltaData, + const Value& serializedSnapshot, + const std::vector>& serializedDeltas, + const Standalone& inMemoryDeltas) { + // expected answer + std::map expectedData; + if (beginVersion == 0) { + for (auto& it : snapshotData) { + if (range.contains(it.key)) { + expectedData.insert({ it.key, it.value }); + } + } + } + Version lastFileEndVersion = 0; + applyDeltasByVersion(deltaData, range, beginVersion, readVersion, lastFileEndVersion, expectedData); + + // actual answer + Standalone chunk; + if (beginVersion == 0) { + std::string snapshotFilename = randomBGFilename( + deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUniqueID(), 0, ".snapshot"); + chunk.snapshotFile = BlobFilePointerRef( + chunk.arena(), snapshotFilename, 0, serializedSnapshot.size(), serializedSnapshot.size(), kvGen.cipherKeys); + } + int deltaIdx = 0; + while (deltaIdx < serializedDeltas.size() && serializedDeltas[deltaIdx].first < beginVersion) { + deltaIdx++; + } + std::vector deltaPtrsVector; + while (deltaIdx < serializedDeltas.size()) { + std::string deltaFilename = randomBGFilename( + deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUniqueID(), readVersion, ".delta"); + size_t fsize = serializedDeltas[deltaIdx].second.size(); + chunk.deltaFiles.emplace_back_deep(chunk.arena(), deltaFilename, 0, fsize, fsize, kvGen.cipherKeys); + deltaPtrsVector.push_back(serializedDeltas[deltaIdx].second); + + if (serializedDeltas[deltaIdx].first >= readVersion) { + break; + } + deltaIdx++; + } + StringRef deltaPtrs[deltaPtrsVector.size()]; + for (int i = 0; i < deltaPtrsVector.size(); i++) { + deltaPtrs[i] = deltaPtrsVector[i]; + } + + // add in memory deltas + chunk.arena().dependsOn(inMemoryDeltas.arena()); + for (auto& it : inMemoryDeltas) { + if (beginVersion <= it.version && it.version <= readVersion) { + chunk.newDeltas.push_back(chunk.arena(), it); + } + } + + chunk.keyRange = kvGen.allRange; + chunk.includedVersion = readVersion; + chunk.snapshotVersion = (beginVersion == 0) ? 
0 : invalidVersion; + + Optional snapshotPtr; + if (beginVersion == 0) { + snapshotPtr = serializedSnapshot; + } + RangeResult actualData = materializeBlobGranule(chunk, range, beginVersion, readVersion, snapshotPtr, deltaPtrs); + + if (expectedData.size() != actualData.size()) { + fmt::print("Expected Size {0} != Actual Size {1}\n", expectedData.size(), actualData.size()); + } + if (BG_FILES_TEST_DEBUG) { + fmt::print("Expected Data {0}:\n", expectedData.size()); + for (auto& it : expectedData) { + fmt::print(" {0}=\n", it.first.printable()); + } + fmt::print("Actual Data {0}:\n", actualData.size()); + for (auto& it : actualData) { + fmt::print(" {0}=\n", it.key.printable()); + } + } + + ASSERT(expectedData.size() == actualData.size()); + int i = 0; + for (auto& it : expectedData) { + if (it.first != actualData[i].key) { + fmt::print("expected {0} != actual {1}\n", it.first.printable(), actualData[i].key.printable()); + } + ASSERT(it.first == actualData[i].key); + ASSERT(it.second == actualData[i].value); + i++; + } +} + +TEST_CASE("/blobgranule/files/granuleReadUnitTest") { + KeyValueGen kvGen; + Standalone fileNameRef = StringRef(std::string("testSnap")); + + int targetSnapshotChunks = deterministicRandom()->randomExp(0, 9); + int targetDeltaChunks = deterministicRandom()->randomExp(0, 8); + int targetDataBytes = deterministicRandom()->randomExp(12, 25); + int targetSnapshotBytes = (int)(deterministicRandom()->randomInt(0, targetDataBytes)); + int targetDeltaBytes = targetDataBytes - targetSnapshotBytes; + + if (BG_FILES_TEST_DEBUG) { + fmt::print("Snapshot Chunks: {0}\nDelta Chunks: {1}\nSnapshot Bytes: {2}\nDelta Bytes: {3}\n", + targetSnapshotChunks, + targetDeltaChunks, + targetSnapshotBytes, + targetDeltaBytes); + } + + int targetSnapshotChunkSize = targetSnapshotBytes / targetSnapshotChunks; + int targetDeltaChunkSize = targetDeltaBytes / targetDeltaChunks; + + Standalone snapshotData = genSnapshot(kvGen, targetSnapshotBytes); + if (BG_FILES_TEST_DEBUG) { + fmt::print("Snapshot data: {0}\n", snapshotData.size()); + for (auto& it : snapshotData) { + fmt::print(" {0}=\n", it.key.printable()); + } + } + Standalone deltaData = genDeltas(kvGen, targetDeltaBytes); + fmt::print("{0} snapshot rows and {1} deltas\n", snapshotData.size(), deltaData.size()); + + if (BG_FILES_TEST_DEBUG) { + fmt::print("Delta data: {0}\n", deltaData.size()); + for (auto& it : deltaData) { + fmt::print(" {0}) ({1})\n", it.version, it.mutations.size()); + for (auto& it2 : it.mutations) { + if (it2.type == MutationRef::Type::SetValue) { + fmt::print(" {0}=\n", it2.param1.printable()); + } else { + fmt::print(" {0} - {1}\n", it2.param1.printable(), it2.param2.printable()); + } + } + } + } + + Value serializedSnapshot = serializeChunkedSnapshot( + fileNameRef, snapshotData, targetSnapshotChunkSize, kvGen.compressFilter, kvGen.cipherKeys); + + // split deltas up across multiple files + int deltaFiles = std::min(deltaData.size(), deterministicRandom()->randomInt(1, 21)); + int deltasPerFile = deltaData.size() / deltaFiles + 1; + std::vector> serializedDeltaFiles; + Standalone inMemoryDeltas; + serializedDeltaFiles.reserve(deltaFiles); + for (int i = 0; i < deltaFiles; i++) { + Standalone fileData; + int j; + for (j = i * deltasPerFile; j < (i + 1) * deltasPerFile && j < deltaData.size(); j++) { + fileData.push_back_deep(fileData.arena(), deltaData[j]); + } + if (!fileData.empty()) { + if (j == deltaData.size() && deterministicRandom()->coinflip()) { + // if it's the last set of deltas, sometimes make them the memory 
deltas instead + fmt::print("Memory Deltas {0} - {1}\n", fileData.front().version, fileData.back().version); + inMemoryDeltas = fileData; + } else { + fmt::print("Delta file {0} - {1}\n", fileData.front().version, fileData.back().version); + Standalone fileNameRef = StringRef("delta" + std::to_string(i)); + Value serializedDelta = serializeChunkedDeltaFile(fileNameRef, + fileData, + kvGen.allRange, + targetDeltaChunkSize, + kvGen.compressFilter, + kvGen.cipherKeys); + serializedDeltaFiles.emplace_back(fileData.back().version, serializedDelta); + } + } + } + + fmt::print("Full test\n"); + checkGranuleRead(kvGen, + kvGen.allRange, + 0, + deltaData.back().version, + snapshotData, + deltaData, + serializedSnapshot, + serializedDeltaFiles, + inMemoryDeltas); + + // prevent overflow by doing min before multiply + int maxRuns = 100; + int snapshotAndDeltaSize = 5 + std::min(maxRuns, snapshotData.size()) * std::min(maxRuns, deltaData.size()); + int lim = std::min(maxRuns, snapshotAndDeltaSize); + for (int i = 0; i < lim; i++) { + auto params = randomizeKeyAndVersions(kvGen, deltaData); + fmt::print("Partial test {0}: [{1} - {2}) @ {3} - {4}\n", + i, + std::get<0>(params).begin.printable(), + std::get<0>(params).end.printable(), + std::get<1>(params), + std::get<2>(params)); + checkGranuleRead(kvGen, + std::get<0>(params), + std::get<1>(params), + std::get<2>(params), + snapshotData, + deltaData, + serializedSnapshot, + serializedDeltaFiles, + inMemoryDeltas); + } + + return Void(); +} + +// performance micro-benchmarks + +struct FileSet { + std::tuple> snapshotFile; + std::vector>> deltaFiles; + Key commonPrefix; + KeyRange range; +}; + +std::pair parseFilename(const std::string& fname) { + auto dotPos = fname.find("."); + ASSERT(dotPos > 0); + std::string type = fname.substr(dotPos + 1); + ASSERT(type == "snapshot" || type == "delta"); + auto lastUnderscorePos = fname.rfind("_"); + ASSERT('V' == fname[lastUnderscorePos + 1]); + std::string versionString = fname.substr(lastUnderscorePos + 2, dotPos); + Version version = std::stoll(versionString); + return { type, version }; +} + +Value loadFileData(std::string filename) { + std::ifstream input(filename, std::ios::binary); + ASSERT(input.good()); + + // copies all data into buffer + std::vector buffer(std::istreambuf_iterator(input), {}); + Value v(StringRef(&buffer[0], buffer.size())); + fmt::print("Loaded {0} file bytes from {1}\n", v.size(), filename); + + input.close(); + return v; +} + +struct CommonPrefixStats { + // for computing common prefix details and stats + Key key; + int len = -1; + int64_t totalKeySize = 0; + int totalKeys = 0; + int minKeySize = 1000000000; + int maxKeySize = 0; + + void addKey(const KeyRef& k) { + if (len == -1) { + key = k; + len = k.size(); + } else { + len = std::min(len, commonPrefixLength(k, key)); + } + totalKeys++; + totalKeySize += k.size(); + minKeySize = std::min(minKeySize, k.size()); + maxKeySize = std::max(maxKeySize, k.size()); + } + + Key done() { + ASSERT(len >= 0); + fmt::print("Common prefix: {0}\nCommon Prefix Length: {1}\nAverage Key Size: {2}\nMin Key Size: {3}, Max Key " + "Size: {4}\n", + key.substr(0, len).printable(), + len, + totalKeySize / totalKeys, + minKeySize, + maxKeySize); + return key.substr(0, len); + } +}; + +FileSet loadFileSet(std::string basePath, const std::vector& filenames) { + FileSet files; + CommonPrefixStats stats; + for (int i = 0; i < filenames.size(); i++) { + auto parts = parseFilename(filenames[i]); + std::string type = parts.first; + Version version = 
parts.second; + if (type == "snapshot") { + std::string fpath = basePath + filenames[i]; + Value data = loadFileData(fpath); + + Arena arena; + GranuleSnapshot file; + ObjectReader dataReader(data.begin(), Unversioned()); + dataReader.deserialize(FileIdentifierFor::value, file, arena); + Standalone parsed(file, arena); + + fmt::print("Loaded {0} rows from snapshot file\n", parsed.size()); + files.snapshotFile = { filenames[i], version, data, parsed }; + + for (auto& it : parsed) { + stats.addKey(it.key); + } + } else { + std::string fpath = basePath + filenames[i]; + Value data = loadFileData(fpath); + + Arena arena; + GranuleDeltas file; + ObjectReader dataReader(data.begin(), Unversioned()); + dataReader.deserialize(FileIdentifierFor::value, file, arena); + Standalone parsed(file, arena); + + fmt::print("Loaded {0} deltas from delta file\n", parsed.size()); + files.deltaFiles.push_back({ filenames[i], version, data, parsed }); + + for (auto& it : parsed) { + for (auto& it2 : it.mutations) { + stats.addKey(it2.param1); + if (it2.type == MutationRef::Type::ClearRange) { + stats.addKey(it2.param2); + } + } + } + } + } + + files.commonPrefix = stats.done(); + if (files.commonPrefix.size() == 0) { + files.range = normalKeys; + } else { + files.range = KeyRangeRef(files.commonPrefix, strinc(files.commonPrefix)); + } + fmt::print("Range: [{0} - {1})\n", files.range.begin.printable(), files.range.end.printable()); + + return files; +} + +int WRITE_RUNS = 5; + +std::pair doSnapshotWriteBench(const Standalone& data, + bool chunked, + Optional cipherKeys, + Optional compressionFilter) { + Standalone fileNameRef = StringRef(); + int64_t serializedBytes = 0; + double elapsed = -timer_monotonic(); + for (int runI = 0; runI < WRITE_RUNS; runI++) { + if (!chunked) { + serializedBytes = ObjectWriter::toValue(data, Unversioned()).size(); + } else { + serializedBytes = + serializeChunkedSnapshot(fileNameRef, data, 64 * 1024, compressionFilter, cipherKeys).size(); + } + } + elapsed += timer_monotonic(); + elapsed /= WRITE_RUNS; + return { serializedBytes, elapsed }; +} + +std::pair doDeltaWriteBench(const Standalone& data, + const KeyRangeRef& fileRange, + bool chunked, + Optional cipherKeys, + Optional compressionFilter) { + Standalone fileNameRef = StringRef(); + int64_t serializedBytes = 0; + double elapsed = -timer_monotonic(); + for (int runI = 0; runI < WRITE_RUNS; runI++) { + if (!chunked) { + serializedBytes = ObjectWriter::toValue(data, Unversioned()).size(); + } else { + serializedBytes = + serializeChunkedDeltaFile(fileNameRef, data, fileRange, 32 * 1024, compressionFilter, cipherKeys) + .size(); + } + } + elapsed += timer_monotonic(); + elapsed /= WRITE_RUNS; + return { serializedBytes, elapsed }; +} + +FileSet rewriteChunkedFileSet(const FileSet& fileSet, + Optional keys, + Optional compressionFilter) { + Standalone fileNameRef = StringRef(); + FileSet newFiles; + newFiles.snapshotFile = fileSet.snapshotFile; + newFiles.deltaFiles = fileSet.deltaFiles; + newFiles.commonPrefix = fileSet.commonPrefix; + newFiles.range = fileSet.range; + + std::get<2>(newFiles.snapshotFile) = + serializeChunkedSnapshot(fileNameRef, std::get<3>(newFiles.snapshotFile), 64 * 1024, compressionFilter, keys); + for (auto& deltaFile : newFiles.deltaFiles) { + std::get<2>(deltaFile) = serializeChunkedDeltaFile( + fileNameRef, std::get<3>(deltaFile), fileSet.range, 32 * 1024, compressionFilter, keys); + } + + return newFiles; +} + +int READ_RUNS = 20; +std::pair doReadBench(const FileSet& fileSet, + bool chunked, + 
KeyRange readRange, + bool clearAllAtEnd, + Optional keys, + Optional compressionFilter) { + Version readVersion = std::get<1>(fileSet.deltaFiles.back()); + + Standalone chunk; + StringRef deltaPtrs[fileSet.deltaFiles.size()]; + + MutationRef clearAllAtEndMutation; + if (clearAllAtEnd) { + clearAllAtEndMutation = MutationRef(MutationRef::Type::ClearRange, readRange.begin, readRange.end); + } + if (chunked) { + size_t snapshotSize = std::get<3>(fileSet.snapshotFile).size(); + chunk.snapshotFile = + BlobFilePointerRef(chunk.arena(), std::get<0>(fileSet.snapshotFile), 0, snapshotSize, snapshotSize, keys); + + for (int i = 0; i < fileSet.deltaFiles.size(); i++) { + size_t deltaSize = std::get<3>(fileSet.deltaFiles[i]).size(); + chunk.deltaFiles.emplace_back_deep( + chunk.arena(), std::get<0>(fileSet.deltaFiles[i]), 0, deltaSize, deltaSize, keys); + deltaPtrs[i] = std::get<2>(fileSet.deltaFiles[i]); + } + + if (clearAllAtEnd) { + readVersion++; + MutationsAndVersionRef lastDelta; + lastDelta.version = readVersion; + lastDelta.mutations.push_back(chunk.arena(), clearAllAtEndMutation); + + chunk.newDeltas.push_back_deep(chunk.arena(), lastDelta); + } + + chunk.keyRange = fileSet.range; + chunk.includedVersion = readVersion; + chunk.snapshotVersion = std::get<1>(fileSet.snapshotFile); + } + + int64_t serializedBytes = 0; + double elapsed = -timer_monotonic(); + for (int runI = 0; runI < READ_RUNS; runI++) { + if (!chunked) { + std::map data; + for (auto& it : std::get<3>(fileSet.snapshotFile)) { + data.insert({ it.key, it.value }); + } + Version lastFileEndVersion = 0; + for (auto& deltaFile : fileSet.deltaFiles) { + applyDeltasByVersion(std::get<3>(deltaFile), readRange, 0, readVersion, lastFileEndVersion, data); + } + if (clearAllAtEnd) { + applyDelta(readRange, clearAllAtEndMutation, data); + } + RangeResult actualData; + for (auto& it : data) { + actualData.push_back_deep(actualData.arena(), KeyValueRef(it.first, it.second)); + } + serializedBytes += actualData.expectedSize(); + } else { + RangeResult actualData = + materializeBlobGranule(chunk, readRange, 0, readVersion, std::get<2>(fileSet.snapshotFile), deltaPtrs); + serializedBytes += actualData.expectedSize(); + } + } + elapsed += timer_monotonic(); + elapsed /= READ_RUNS; + serializedBytes /= READ_RUNS; + return { serializedBytes, elapsed }; +} + +void printMetrics(int64_t diskBytes, double elapsed, int64_t processesBytes, int64_t logicalSize) { + double storageAmp = (1.0 * diskBytes) / logicalSize; + + double MBperCPUsec = (elapsed == 0.0) ? 
0.0 : (processesBytes / 1024.0 / 1024.0) / elapsed; + fmt::print("{}", fmt::format(" {:.6} {:.6}", storageAmp, MBperCPUsec)); +} + +TEST_CASE("!/blobgranule/files/benchFromFiles") { + std::string basePath = "SET_ME"; + std::vector> fileSetNames = { { "SET_ME" } }; + Arena ar; + BlobGranuleCipherKeysCtx cipherKeys = getCipherKeysCtx(ar); + std::vector chunkModes = { false, true }; + std::vector encryptionModes = { false, true }; + std::vector> compressionModes; + compressionModes.push_back({}); + compressionModes.insert( + compressionModes.end(), CompressionUtils::supportedFilters.begin(), CompressionUtils::supportedFilters.end()); + + std::vector runNames = { "logical" }; + std::vector> snapshotMetrics; + std::vector> deltaMetrics; + + std::vector fileSets; + int64_t logicalSnapshotSize = 0; + int64_t logicalDeltaSize = 0; + for (auto& it : fileSetNames) { + FileSet fileSet = loadFileSet(basePath, it); + fileSets.push_back(fileSet); + logicalSnapshotSize += std::get<3>(fileSet.snapshotFile).expectedSize(); + for (auto& deltaFile : fileSet.deltaFiles) { + logicalDeltaSize += std::get<3>(deltaFile).expectedSize(); + } + } + snapshotMetrics.push_back({ logicalSnapshotSize, 0.0 }); + deltaMetrics.push_back({ logicalDeltaSize, 0.0 }); + + for (bool chunk : chunkModes) { + for (bool encrypt : encryptionModes) { + if (!chunk && encrypt) { + continue; + } + Optional keys = encrypt ? cipherKeys : Optional(); + for (auto& compressionFilter : compressionModes) { + if (!chunk && compressionFilter.present()) { + continue; + } + if (compressionFilter.present() && CompressionFilter::NONE == compressionFilter.get()) { + continue; + } + + std::string name; + if (!chunk) { + name = "old"; + } else { + if (encrypt) { + name += "ENC"; + } + if (compressionFilter.present()) { + name += "CMP"; + } + if (name.empty()) { + name = "chunked"; + } + } + runNames.push_back(name); + int64_t snapshotTotalBytes = 0; + double snapshotTotalElapsed = 0.0; + for (auto& fileSet : fileSets) { + auto res = doSnapshotWriteBench(std::get<3>(fileSet.snapshotFile), chunk, keys, compressionFilter); + snapshotTotalBytes += res.first; + snapshotTotalElapsed += res.second; + } + snapshotMetrics.push_back({ snapshotTotalBytes, snapshotTotalElapsed }); + + int64_t deltaTotalBytes = 0; + double deltaTotalElapsed = 0.0; + for (auto& fileSet : fileSets) { + for (auto& deltaFile : fileSet.deltaFiles) { + auto res = + doDeltaWriteBench(std::get<3>(deltaFile), fileSet.range, chunk, keys, compressionFilter); + deltaTotalBytes += res.first; + deltaTotalElapsed += res.second; + } + } + deltaMetrics.push_back({ deltaTotalBytes, deltaTotalElapsed }); + } + } + } + + fmt::print("\n\n\n\nWrite Results:\n"); + + ASSERT(runNames.size() == snapshotMetrics.size()); + ASSERT(runNames.size() == deltaMetrics.size()); + for (int i = 0; i < runNames.size(); i++) { + fmt::print("{0}", runNames[i]); + + printMetrics( + snapshotMetrics[i].first, snapshotMetrics[i].second, snapshotMetrics[i].first, snapshotMetrics[0].first); + printMetrics(deltaMetrics[i].first, deltaMetrics[i].second, deltaMetrics[i].first, deltaMetrics[0].first); + + int64_t logicalTotalBytes = snapshotMetrics[0].first + deltaMetrics[0].first; + int64_t totalBytes = deltaMetrics[i].first + snapshotMetrics[i].first; + double logicalTotalElapsed = (snapshotMetrics[i].second == 0.0 || deltaMetrics[i].second == 0.0) + ? 
0.0 + : snapshotMetrics[i].second + deltaMetrics[i].second; + printMetrics(totalBytes, logicalTotalElapsed, deltaMetrics[i].first, logicalTotalBytes); + + fmt::print("\n"); + } + + std::vector readRunNames = {}; + std::vector> readMetrics; + + bool doEdgeCaseReadTests = true; + std::vector clearAllReadMetrics; + std::vector readSingleKeyMetrics; + + for (bool chunk : chunkModes) { + for (bool encrypt : encryptionModes) { + if (!chunk && encrypt) { + continue; + } + + Optional keys = encrypt ? cipherKeys : Optional(); + for (auto& compressionFilter : compressionModes) { + if (!chunk && compressionFilter.present()) { + continue; + } + if (compressionFilter.present() && CompressionFilter::NONE == compressionFilter.get()) { + continue; + } + std::string name; + if (!chunk) { + name = "old"; + } else { + if (encrypt) { + name += "ENC"; + } + if (compressionFilter.present()) { + name += "CMP"; + } + if (name.empty()) { + name = "chunked"; + } + } + readRunNames.push_back(name); + + int64_t totalBytesRead = 0; + double totalElapsed = 0.0; + double totalElapsedClearAll = 0.0; + double totalElapsedSingleKey = 0.0; + for (auto& fileSet : fileSets) { + FileSet newFileSet; + if (!chunk) { + newFileSet = fileSet; + } else { + newFileSet = rewriteChunkedFileSet(fileSet, keys, compressionFilter); + } + + auto res = doReadBench(newFileSet, chunk, fileSet.range, false, keys, compressionFilter); + totalBytesRead += res.first; + totalElapsed += res.second; + + if (doEdgeCaseReadTests) { + totalElapsedClearAll += + doReadBench(newFileSet, chunk, fileSet.range, true, keys, compressionFilter).second; + Key k = std::get<3>(fileSet.snapshotFile).front().key; + KeyRange singleKeyRange(KeyRangeRef(k, keyAfter(k))); + totalElapsedSingleKey += + doReadBench(newFileSet, chunk, singleKeyRange, false, keys, compressionFilter).second; + } + } + readMetrics.push_back({ totalBytesRead, totalElapsed }); + if (doEdgeCaseReadTests) { + clearAllReadMetrics.push_back(totalElapsedClearAll); + readSingleKeyMetrics.push_back(totalElapsedSingleKey); + } + } + } + } + + fmt::print("\n\nRead Results:\n"); + + ASSERT(readRunNames.size() == readMetrics.size()); + for (int i = 0; i < readRunNames.size(); i++) { + fmt::print("{0}", readRunNames[i]); + + double MBperCPUsec = (readMetrics[i].first / 1024.0 / 1024.0) / readMetrics[i].second; + fmt::print(" {:.6}", MBperCPUsec); + + fmt::print("\n"); + } + + if (doEdgeCaseReadTests) { + ASSERT(readRunNames.size() == clearAllReadMetrics.size()); + ASSERT(readRunNames.size() == readSingleKeyMetrics.size()); + fmt::print("\n\nEdge Case Read Results:\n"); + + for (int i = 0; i < readRunNames.size(); i++) { + fmt::print("{0}", readRunNames[i]); + + // use MB from full read test but elapsed from these tests so the numbers make sense relatively + double MBperCPUsecClearAll = (readMetrics[i].first / 1024.0 / 1024.0) / clearAllReadMetrics[i]; + double MBperCPUsecSingleKey = (readMetrics[i].first / 1024.0 / 1024.0) / readSingleKeyMetrics[i]; + fmt::print(" {:.6} {:.6}", MBperCPUsecClearAll, MBperCPUsecSingleKey); + + fmt::print("\n"); + } + } + + fmt::print("\n\nCombined Results:\n"); + ASSERT(readRunNames.size() == runNames.size() - 1); + for (int i = 0; i < readRunNames.size(); i++) { + fmt::print("{0}", readRunNames[i]); + int64_t logicalBytes = deltaMetrics[i + 1].first; + double totalElapsed = snapshotMetrics[i + 1].second + deltaMetrics[i + 1].second + readMetrics[i].second; + double MBperCPUsec = (logicalBytes / 1024.0 / 1024.0) / totalElapsed; + fmt::print(" {:.6}", MBperCPUsec); + + 
fmt::print("\n"); + } + + fmt::print("\n\nBenchmark Complete!\n"); + + return Void(); +} diff --git a/fdbclient/BlobGranuleReader.actor.cpp b/fdbclient/BlobGranuleReader.actor.cpp index e0f627a9da..583da353f7 100644 --- a/fdbclient/BlobGranuleReader.actor.cpp +++ b/fdbclient/BlobGranuleReader.actor.cpp @@ -31,13 +31,6 @@ #include "fdbclient/FDBTypes.h" #include "flow/actorcompiler.h" // This must be the last #include. -// TODO more efficient data structure besides std::map? PTree is unnecessary since this isn't versioned, but some other -// sorted thing could work. And if it used arenas it'd probably be more efficient with allocations, since everything -// else is in 1 arena and discarded at the end. - -// TODO could refactor the file reading code from here and the delta file function into another actor, -// then this part would also be testable? but meh - ACTOR Future> readFile(Reference bstoreProvider, BlobFilePointerRef f) { try { state Arena arena; @@ -140,3 +133,66 @@ ACTOR Future readBlobGranules(BlobGranuleFileRequest request, return Void(); } + +// Return true if a given range is fully covered by blob chunks +bool isRangeFullyCovered(KeyRange range, Standalone> blobChunks) { + std::vector blobRanges; + for (const BlobGranuleChunkRef& chunk : blobChunks) { + blobRanges.push_back(chunk.keyRange); + } + + return range.isCovered(blobRanges); +} + +void testAddChunkRange(KeyRef begin, KeyRef end, Standalone>& chunks) { + BlobGranuleChunkRef chunk; + chunk.keyRange = KeyRangeRef(begin, end); + chunks.push_back(chunks.arena(), chunk); +} + +TEST_CASE("/fdbserver/blobgranule/isRangeCoveredByBlob") { + Standalone> chunks; + // chunk1 key_a1 - key_a9 + testAddChunkRange("key_a1"_sr, "key_a9"_sr, chunks); + // chunk2 key_b1 - key_b9 + testAddChunkRange("key_b1"_sr, "key_b9"_sr, chunks); + + // check empty range. not covered + { ASSERT(isRangeFullyCovered(KeyRangeRef(), chunks) == false); } + + // check empty chunks. not covered + { + Standalone> empyChunks; + ASSERT(isRangeFullyCovered(KeyRangeRef(), empyChunks) == false); + } + + // check '' to \xff + { ASSERT(isRangeFullyCovered(KeyRangeRef(""_sr, "\xff"_sr), chunks) == false); } + + // check {key_a1, key_a9} + { ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_a9"_sr), chunks)); } + + // check {key_a1, key_a3} + { ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_a3"_sr), chunks)); } + + // check {key_a0, key_a3} + { ASSERT(isRangeFullyCovered(KeyRangeRef("key_a0"_sr, "key_a3"_sr), chunks) == false); } + + // check {key_a5, key_b2} + { + auto range = KeyRangeRef("key_a5"_sr, "key_b5"_sr); + ASSERT(isRangeFullyCovered(range, chunks) == false); + ASSERT(range.begin == "key_a5"_sr); + ASSERT(range.end == "key_b5"_sr); + } + + // check continued chunks + { + Standalone> continuedChunks; + testAddChunkRange("key_a1"_sr, "key_a9"_sr, continuedChunks); + testAddChunkRange("key_a9"_sr, "key_b1"_sr, continuedChunks); + testAddChunkRange("key_b1"_sr, "key_b9"_sr, continuedChunks); + ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks) == false); + } + return Void(); +} diff --git a/fdbclient/BuildFlags.h.in b/fdbclient/BuildFlags.h.in index 6f94c540f8..b55c7e55dd 100644 --- a/fdbclient/BuildFlags.h.in +++ b/fdbclient/BuildFlags.h.in @@ -33,6 +33,9 @@ #define C_VERSION_MINOR 0 #endif +const char* kDate = __DATE__; +const char* kTime = __TIME__; + // FDB info. 
const std::string kGitHash = "@CURRENT_GIT_VERSION_WNL@"; const std::string kFdbVersion = "@FDB_VERSION@"; @@ -43,7 +46,7 @@ const std::string kArch = "@CMAKE_SYSTEM@"; const std::string kCompiler = "@CMAKE_CXX_COMPILER_ID@"; // Library versions. -const std::string kBoostVersion = "@Boost_LIB_VERSION@"; +const std::string kBoostVersion = BOOST_LIB_VERSION; // Build info and flags. const std::string kCMakeVersion = "@CMAKE_VERSION@"; @@ -61,6 +64,9 @@ std::string jsonBuildInformation() { json_spirit::mValue json; JSONDoc doc(json); + doc.create("build_date") = kDate; + doc.create("build_time") = kTime; + doc.create("git_hash") = kGitHash; doc.create("fdb_version") = kFdbVersion; diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index ab0e5b3931..2953a360e7 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -1,8 +1,5 @@ fdb_find_sources(FDBCLIENT_SRCS) -list(APPEND FDBCLIENT_SRCS - sha1/SHA1.cpp - libb64/cdecode.c - libb64/cencode.c) +list(APPEND FDBCLIENT_SRCS sha1/SHA1.cpp) message(STATUS "FDB version is ${FDB_VERSION}") message(STATUS "FDB package name is ${FDB_PACKAGE_NAME}") @@ -71,13 +68,6 @@ if(WITH_AWS_BACKUP) include(awssdk) endif() -find_package(ZLIB) -if(ZLIB_FOUND) - add_compile_definitions(ZLIB_LIB_SUPPORTED) -else() - message(STATUS "ZLIB package not found") -endif() - add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs}) target_include_directories(fdbclient PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include") configure_file(${CMAKE_CURRENT_SOURCE_DIR}/versions.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/include/fdbclient/versions.h) @@ -100,8 +90,8 @@ add_flow_target(LINK_TEST NAME fdbclientlinktest SRCS LinkTest.cpp) target_link_libraries(fdbclientlinktest PRIVATE fdbclient rapidxml) # re-link rapidxml due to private link interface if(BUILD_AZURE_BACKUP) - target_link_libraries(fdbclient PRIVATE curl uuid azure-storage-lite) - target_link_libraries(fdbclient_sampling PRIVATE curl uuid azure-storage-lite) + target_link_libraries(fdbclient PRIVATE curl azure-storage-lite) + target_link_libraries(fdbclient_sampling PRIVATE curl azure-storage-lite) endif() if(BUILD_AWS_BACKUP) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 20d48846cc..b15f7c9583 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -22,6 +22,7 @@ #include "fdbclient/FDBTypes.h" #include "fdbclient/SystemData.h" #include "fdbclient/Tenant.h" +#include "flow/IRandom.h" #include "flow/UnitTest.h" #define init(...) KNOB_FN(__VA_ARGS__, INIT_ATOMIC_KNOB, INIT_KNOB)(__VA_ARGS__) @@ -41,10 +42,6 @@ void ClientKnobs::initialize(Randomize randomize) { init( FAILURE_MAX_DELAY, 5.0 ); init( FAILURE_MIN_DELAY, 4.0 ); if( randomize && BUGGIFY ) FAILURE_MIN_DELAY = 1.0; - init( FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY ); - init( CLIENT_FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY ); - init( FAILURE_EMERGENCY_DELAY, 30.0 ); - init( FAILURE_MAX_GENERATIONS, 10 ); init( RECOVERY_DELAY_START_GENERATION, 70 ); init( RECOVERY_DELAY_SECONDS_PER_GENERATION, 60.0 ); init( MAX_GENERATIONS, 100 ); @@ -63,6 +60,7 @@ void ClientKnobs::initialize(Randomize randomize) { init( WRONG_SHARD_SERVER_DELAY, .01 ); if( randomize && BUGGIFY ) WRONG_SHARD_SERVER_DELAY = deterministicRandom()->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. 
dumping the database after a test) init( FUTURE_VERSION_RETRY_DELAY, .01 ); if( randomize && BUGGIFY ) FUTURE_VERSION_RETRY_DELAY = deterministicRandom()->random01();// FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; + init( GRV_ERROR_RETRY_DELAY, 5.0 ); if( randomize && BUGGIFY ) GRV_ERROR_RETRY_DELAY = 0.01 + 5 * deterministicRandom()->random01(); init( UNKNOWN_TENANT_RETRY_DELAY, 0.0 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = deterministicRandom()->random01(); init( REPLY_BYTE_LIMIT, 80000 ); init( DEFAULT_BACKOFF, .01 ); if( randomize && BUGGIFY ) DEFAULT_BACKOFF = deterministicRandom()->random01(); @@ -81,9 +79,9 @@ void ClientKnobs::initialize(Randomize randomize) { init( METADATA_VERSION_CACHE_SIZE, 1000 ); init( CHANGE_FEED_LOCATION_LIMIT, 10000 ); init( CHANGE_FEED_CACHE_SIZE, 100000 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_SIZE = 1; - init( CHANGE_FEED_POP_TIMEOUT, 5.0 ); + init( CHANGE_FEED_POP_TIMEOUT, 10.0 ); init( CHANGE_FEED_STREAM_MIN_BYTES, 1e4 ); if( randomize && BUGGIFY ) CHANGE_FEED_STREAM_MIN_BYTES = 1; - init( TENANT_PREFIX_SIZE_LIMIT, 28 ); ASSERT(TENANT_PREFIX_SIZE_LIMIT >= TenantMapEntry::ROOT_PREFIX_SIZE); // includes 8-byte ID and optional tenant subspace + init( CHANGE_FEED_START_INTERVAL, 10.0 ); init( MAX_BATCH_SIZE, 1000 ); if( randomize && BUGGIFY ) MAX_BATCH_SIZE = 1; init( GRV_BATCH_TIMEOUT, 0.005 ); if( randomize && BUGGIFY ) GRV_BATCH_TIMEOUT = 0.1; @@ -111,7 +109,6 @@ void ClientKnobs::initialize(Randomize randomize) { init( RANGESTREAM_BUFFERED_FRAGMENTS_LIMIT, 20 ); init( QUARANTINE_TSS_ON_MISMATCH, true ); if( randomize && BUGGIFY ) QUARANTINE_TSS_ON_MISMATCH = false; // if true, a tss mismatch will put the offending tss in quarantine. If false, it will just be killed init( CHANGE_FEED_EMPTY_BATCH_TIME, 0.005 ); - init( SHARD_ENCODE_LOCATION_METADATA, false ); if( randomize && BUGGIFY ) SHARD_ENCODE_LOCATION_METADATA = true; //KeyRangeMap init( KRM_GET_RANGE_LIMIT, 1e5 ); if( randomize && BUGGIFY ) KRM_GET_RANGE_LIMIT = 10; @@ -160,8 +157,6 @@ void ClientKnobs::initialize(Randomize randomize) { init( BACKUP_AGGREGATE_POLL_RATE_UPDATE_INTERVAL, 60); init( BACKUP_AGGREGATE_POLL_RATE, 2.0 ); // polls per second target for all agents on the cluster init( BACKUP_LOG_WRITE_BATCH_MAX_SIZE, 1e6 ); //Must be much smaller than TRANSACTION_SIZE_LIMIT - init( BACKUP_LOG_ATOMIC_OPS_SIZE, 1000 ); - init( BACKUP_OPERATION_COST_OVERHEAD, 50 ); init( BACKUP_MAX_LOG_RANGES, 21 ); if( randomize && BUGGIFY ) BACKUP_MAX_LOG_RANGES = 4; init( BACKUP_SIM_COPY_LOG_RANGES, 100 ); init( BACKUP_VERSION_DELAY, 5*CORE_VERSIONSPERSECOND ); @@ -204,14 +199,13 @@ void ClientKnobs::initialize(Randomize randomize) { init( DEFAULT_COMMIT_GRV_PROXIES_RATIO, 3 ); init( DEFAULT_MAX_GRV_PROXIES, 4 ); + init( GLOBAL_CONFIG_REFRESH_BACKOFF, 0.5 ); + init( GLOBAL_CONFIG_REFRESH_MAX_BACKOFF, 60.0 ); + init( GLOBAL_CONFIG_REFRESH_TIMEOUT, 10.0 ); + init( IS_ACCEPTABLE_DELAY, 1.5 ); - init( HTTP_READ_SIZE, 128*1024 ); - init( HTTP_SEND_SIZE, 32*1024 ); - init( HTTP_VERBOSE_LEVEL, 0 ); - init( HTTP_REQUEST_ID_HEADER, "" ); init( HTTP_REQUEST_AWS_V4_HEADER, true ); - init( HTTP_RESPONSE_SKIP_VERIFY_CHECKSUM_FOR_PARTIAL_CONTENT, false ); init( BLOBSTORE_ENCRYPTION_TYPE, "" ); init( BLOBSTORE_CONNECT_TRIES, 10 ); init( BLOBSTORE_CONNECT_TIMEOUT, 10 ); @@ -270,27 +264,36 @@ void ClientKnobs::initialize(Randomize randomize) { init( MAX_TAGS_PER_TRANSACTION, 5 ); init( MAX_TRANSACTION_TAG_LENGTH, 16 ); init( COMMIT_SAMPLE_COST, 100 ); if( randomize && BUGGIFY ) COMMIT_SAMPLE_COST = 10; - 
init( WRITE_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096; init( INCOMPLETE_SHARD_PLUS, 4096 ); init( READ_TAG_SAMPLE_RATE, 0.01 ); if( randomize && BUGGIFY ) READ_TAG_SAMPLE_RATE = 1.0; // Communicated to clients from cluster init( TAG_THROTTLE_SMOOTHING_WINDOW, 2.0 ); init( TAG_THROTTLE_RECHECK_INTERVAL, 5.0 ); if( randomize && BUGGIFY ) TAG_THROTTLE_RECHECK_INTERVAL = 0.0; init( TAG_THROTTLE_EXPIRATION_INTERVAL, 60.0 ); if( randomize && BUGGIFY ) TAG_THROTTLE_EXPIRATION_INTERVAL = 1.0; + init( WRITE_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096; + init( READ_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096; // busyness reporting init( BUSYNESS_SPIKE_START_THRESHOLD, 0.100 ); init( BUSYNESS_SPIKE_SATURATED_THRESHOLD, 0.500 ); - // multi-version client control - init( MVC_CLIENTLIB_CHUNK_SIZE, 8*1024 ); - init( MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION, 32 ); - // Blob granules init( BG_MAX_GRANULE_PARALLELISM, 10 ); + init( BG_TOO_MANY_GRANULES, 10000 ); init( CHANGE_QUORUM_BAD_STATE_RETRY_TIMES, 3 ); init( CHANGE_QUORUM_BAD_STATE_RETRY_DELAY, 2.0 ); + // Tenants and Metacluster + init( MAX_TENANTS_PER_CLUSTER, 1e6 ); + init( TENANT_TOMBSTONE_CLEANUP_INTERVAL, 60 ); if ( randomize && BUGGIFY ) TENANT_TOMBSTONE_CLEANUP_INTERVAL = deterministicRandom()->random01() * 30; + init( MAX_DATA_CLUSTERS, 1e5 ); + init( REMOVE_CLUSTER_TENANT_BATCH_SIZE, 1e4 ); if ( randomize && BUGGIFY ) REMOVE_CLUSTER_TENANT_BATCH_SIZE = 1; + init( METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK, 5 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK = 1; + init( METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY, 1.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY = deterministicRandom()->random01() * 60; + init( METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT, 10.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT = 1 + deterministicRandom()->random01() * 59; + init( TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); + + init( ENABLE_ENCRYPTION_CPU_TIME_LOGGING, false ); // clang-format on } diff --git a/fdbclient/ConfigKnobs.cpp b/fdbclient/ConfigKnobs.cpp index 03b4b09fbd..283e8208c2 100644 --- a/fdbclient/ConfigKnobs.cpp +++ b/fdbclient/ConfigKnobs.cpp @@ -144,10 +144,7 @@ std::string configDBTypeToString(ConfigDBType configDBType) { } TEST_CASE("/fdbclient/ConfigDB/ConfigKey/EncodeDecode") { - Tuple tuple; - tuple << "class-A"_sr - << "test_long"_sr; - auto packed = tuple.pack(); + auto packed = Tuple::makeTuple("class-A"_sr, "test_long"_sr).pack(); auto unpacked = ConfigKeyRef::decodeKey(packed); ASSERT(unpacked.configClass.get() == "class-A"_sr); ASSERT(unpacked.knobName == "test_long"_sr); @@ -169,18 +166,8 @@ void decodeFailureTest(KeyRef key) { } // namespace TEST_CASE("/fdbclient/ConfigDB/ConfigKey/DecodeFailure") { - { - Tuple tuple; - tuple << "s1"_sr - << "s2"_sr - << "s3"_sr; - decodeFailureTest(tuple.pack()); - } - { - Tuple tuple; - tuple << "s1"_sr << 5; - decodeFailureTest(tuple.pack()); - } + decodeFailureTest(Tuple::makeTuple("s1"_sr, "s2"_sr, "s3"_sr).pack()); + decodeFailureTest(Tuple::makeTuple("s1"_sr, 5).pack()); decodeFailureTest("non-tuple-key"_sr); return Void(); } diff --git a/fdbclient/DatabaseBackupAgent.actor.cpp b/fdbclient/DatabaseBackupAgent.actor.cpp index f549205e25..548f03c46c 100644 --- 
a/fdbclient/DatabaseBackupAgent.actor.cpp +++ b/fdbclient/DatabaseBackupAgent.actor.cpp @@ -37,11 +37,11 @@ #include "flow/actorcompiler.h" // has to be last include -const Key DatabaseBackupAgent::keyAddPrefix = LiteralStringRef("add_prefix"); -const Key DatabaseBackupAgent::keyRemovePrefix = LiteralStringRef("remove_prefix"); -const Key DatabaseBackupAgent::keyRangeVersions = LiteralStringRef("range_versions"); -const Key DatabaseBackupAgent::keyCopyStop = LiteralStringRef("copy_stop"); -const Key DatabaseBackupAgent::keyDatabasesInSync = LiteralStringRef("databases_in_sync"); +const Key DatabaseBackupAgent::keyAddPrefix = "add_prefix"_sr; +const Key DatabaseBackupAgent::keyRemovePrefix = "remove_prefix"_sr; +const Key DatabaseBackupAgent::keyRangeVersions = "range_versions"_sr; +const Key DatabaseBackupAgent::keyCopyStop = "copy_stop"_sr; +const Key DatabaseBackupAgent::keyDatabasesInSync = "databases_in_sync"_sr; const int DatabaseBackupAgent::LATEST_DR_VERSION = 1; DatabaseBackupAgent::DatabaseBackupAgent() @@ -75,14 +75,13 @@ DatabaseBackupAgent::DatabaseBackupAgent(Database src) class DRConfig { public: DRConfig(UID uid = UID()) - : uid(uid), - configSpace(uidPrefixKey(LiteralStringRef("uid->config/").withPrefix(databaseBackupPrefixRange.begin), uid)) {} + : uid(uid), configSpace(uidPrefixKey("uid->config/"_sr.withPrefix(databaseBackupPrefixRange.begin), uid)) {} DRConfig(Reference task) : DRConfig(BinaryReader::fromStringRef(task->params[BackupAgentBase::keyConfigLogUid], Unversioned())) {} - KeyBackedBinaryValue rangeBytesWritten() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue rangeBytesWritten() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedBinaryValue logBytesWritten() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue logBytesWritten() { return configSpace.pack(__FUNCTION__sr); } void clear(Reference tr) { tr->clear(configSpace.range()); } @@ -137,7 +136,7 @@ struct BackupRangeTaskFunc : TaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam bytesWritten() { return LiteralStringRef(__FUNCTION__); } + static TaskParam bytesWritten() { return __FUNCTION__sr; } } Params; static const Key keyAddBackupRangeTasks; @@ -203,7 +202,7 @@ struct BackupRangeTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } ACTOR static Future _execute(Database cx, @@ -363,7 +362,7 @@ struct BackupRangeTaskFunc : TaskFuncBase { if ((!prevAdjacent || !nextAdjacent) && rangeCount > ((prevAdjacent || nextAdjacent) ? CLIENT_KNOBS->BACKUP_MAP_KEY_UPPER_LIMIT : CLIENT_KNOBS->BACKUP_MAP_KEY_LOWER_LIMIT)) { - TEST(true); // range insert delayed because too versionMap is too large + CODE_PROBE(true, "range insert delayed because too versionMap is too large"); if (rangeCount > CLIENT_KNOBS->BACKUP_MAP_KEY_UPPER_LIMIT) TraceEvent(SevWarnAlways, "DBA_KeyRangeMapTooLarge").log(); @@ -405,10 +404,10 @@ struct BackupRangeTaskFunc : TaskFuncBase { break; if (backupVersions.get()[versionLoc + 1].key == - (removePrefix == StringRef() ? normalKeys.end : strinc(removePrefix))) { + (removePrefix == StringRef() ? allKeys.end : strinc(removePrefix))) { tr->clear(KeyRangeRef( backupVersions.get()[versionLoc].key.removePrefix(removePrefix).withPrefix(addPrefix), - addPrefix == StringRef() ? normalKeys.end : strinc(addPrefix))); + addPrefix == StringRef() ? 
allKeys.end : strinc(addPrefix))); } else { tr->clear(KeyRangeRef(backupVersions.get()[versionLoc].key, backupVersions.get()[versionLoc + 1].key) @@ -536,9 +535,9 @@ struct BackupRangeTaskFunc : TaskFuncBase { return Void(); } }; -StringRef BackupRangeTaskFunc::name = LiteralStringRef("dr_backup_range"); -const Key BackupRangeTaskFunc::keyAddBackupRangeTasks = LiteralStringRef("addBackupRangeTasks"); -const Key BackupRangeTaskFunc::keyBackupRangeBeginKey = LiteralStringRef("backupRangeBeginKey"); +StringRef BackupRangeTaskFunc::name = "dr_backup_range"_sr; +const Key BackupRangeTaskFunc::keyAddBackupRangeTasks = "addBackupRangeTasks"_sr; +const Key BackupRangeTaskFunc::keyBackupRangeBeginKey = "backupRangeBeginKey"_sr; REGISTER_TASKFUNC(BackupRangeTaskFunc); struct FinishFullBackupTaskFunc : TaskFuncBase { @@ -588,7 +587,7 @@ struct FinishFullBackupTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -606,7 +605,7 @@ struct FinishFullBackupTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef FinishFullBackupTaskFunc::name = LiteralStringRef("dr_finish_full_backup"); +StringRef FinishFullBackupTaskFunc::name = "dr_finish_full_backup"_sr; REGISTER_TASKFUNC(FinishFullBackupTaskFunc); struct EraseLogRangeTaskFunc : TaskFuncBase { @@ -683,7 +682,7 @@ struct EraseLogRangeTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } ACTOR static Future _finish(Reference tr, @@ -697,7 +696,7 @@ struct EraseLogRangeTaskFunc : TaskFuncBase { return Void(); } }; -StringRef EraseLogRangeTaskFunc::name = LiteralStringRef("dr_erase_log_range"); +StringRef EraseLogRangeTaskFunc::name = "dr_erase_log_range"_sr; REGISTER_TASKFUNC(EraseLogRangeTaskFunc); struct CopyLogRangeTaskFunc : TaskFuncBase { @@ -705,7 +704,7 @@ struct CopyLogRangeTaskFunc : TaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam bytesWritten() { return LiteralStringRef(__FUNCTION__); } + static TaskParam bytesWritten() { return __FUNCTION__sr; } } Params; static const Key keyNextBeginVersion; @@ -958,7 +957,7 @@ struct CopyLogRangeTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } ACTOR static Future _finish(Reference tr, @@ -989,8 +988,8 @@ struct CopyLogRangeTaskFunc : TaskFuncBase { return Void(); } }; -StringRef CopyLogRangeTaskFunc::name = LiteralStringRef("dr_copy_log_range"); -const Key CopyLogRangeTaskFunc::keyNextBeginVersion = LiteralStringRef("nextBeginVersion"); +StringRef CopyLogRangeTaskFunc::name = "dr_copy_log_range"_sr; +const Key CopyLogRangeTaskFunc::keyNextBeginVersion = "nextBeginVersion"_sr; REGISTER_TASKFUNC(CopyLogRangeTaskFunc); struct CopyLogsTaskFunc : TaskFuncBase { @@ -1125,7 +1124,7 @@ struct CopyLogsTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -1143,7 +1142,7 @@ struct CopyLogsTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; 
-StringRef CopyLogsTaskFunc::name = LiteralStringRef("dr_copy_logs"); +StringRef CopyLogsTaskFunc::name = "dr_copy_logs"_sr; REGISTER_TASKFUNC(CopyLogsTaskFunc); struct FinishedFullBackupTaskFunc : TaskFuncBase { @@ -1235,7 +1234,7 @@ struct FinishedFullBackupTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } ACTOR static Future _finish(Reference tr, @@ -1283,8 +1282,8 @@ struct FinishedFullBackupTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef FinishedFullBackupTaskFunc::name = LiteralStringRef("dr_finished_full_backup"); -const Key FinishedFullBackupTaskFunc::keyInsertTask = LiteralStringRef("insertTask"); +StringRef FinishedFullBackupTaskFunc::name = "dr_finished_full_backup"_sr; +const Key FinishedFullBackupTaskFunc::keyInsertTask = "insertTask"_sr; REGISTER_TASKFUNC(FinishedFullBackupTaskFunc); struct CopyDiffLogsTaskFunc : TaskFuncBase { @@ -1396,7 +1395,7 @@ struct CopyDiffLogsTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -1414,7 +1413,7 @@ struct CopyDiffLogsTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef CopyDiffLogsTaskFunc::name = LiteralStringRef("dr_copy_diff_logs"); +StringRef CopyDiffLogsTaskFunc::name = "dr_copy_diff_logs"_sr; REGISTER_TASKFUNC(CopyDiffLogsTaskFunc); // Skip unneeded EraseLogRangeTaskFunc in 5.1 @@ -1446,7 +1445,7 @@ struct SkipOldEraseLogRangeTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef SkipOldEraseLogRangeTaskFunc::name = LiteralStringRef("dr_skip_legacy_task"); +StringRef SkipOldEraseLogRangeTaskFunc::name = "dr_skip_legacy_task"_sr; REGISTER_TASKFUNC(SkipOldEraseLogRangeTaskFunc); REGISTER_TASKFUNC_ALIAS(SkipOldEraseLogRangeTaskFunc, db_erase_log_range); @@ -1456,7 +1455,7 @@ struct OldCopyLogRangeTaskFunc : TaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam bytesWritten() { return LiteralStringRef(__FUNCTION__); } + static TaskParam bytesWritten() { return __FUNCTION__sr; } } Params; static const Key keyNextBeginVersion; @@ -1652,7 +1651,7 @@ struct OldCopyLogRangeTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } ACTOR static Future _finish(Reference tr, @@ -1683,8 +1682,8 @@ struct OldCopyLogRangeTaskFunc : TaskFuncBase { return Void(); } }; -StringRef OldCopyLogRangeTaskFunc::name = LiteralStringRef("db_copy_log_range"); -const Key OldCopyLogRangeTaskFunc::keyNextBeginVersion = LiteralStringRef("nextBeginVersion"); +StringRef OldCopyLogRangeTaskFunc::name = "db_copy_log_range"_sr; +const Key OldCopyLogRangeTaskFunc::keyNextBeginVersion = "nextBeginVersion"_sr; REGISTER_TASKFUNC(OldCopyLogRangeTaskFunc); struct AbortOldBackupTaskFunc : TaskFuncBase { @@ -1753,7 +1752,7 @@ struct AbortOldBackupTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -1771,7 +1770,7 @@ struct AbortOldBackupTaskFunc : TaskFuncBase { 
return _finish(tr, tb, fb, task); }; }; -StringRef AbortOldBackupTaskFunc::name = LiteralStringRef("dr_abort_legacy_backup"); +StringRef AbortOldBackupTaskFunc::name = "dr_abort_legacy_backup"_sr; REGISTER_TASKFUNC(AbortOldBackupTaskFunc); REGISTER_TASKFUNC_ALIAS(AbortOldBackupTaskFunc, db_backup_range); REGISTER_TASKFUNC_ALIAS(AbortOldBackupTaskFunc, db_finish_full_backup); @@ -1834,13 +1833,16 @@ struct CopyDiffLogsUpgradeTaskFunc : TaskFuncBase { return Void(); } - if (backupRanges.size() == 1) { + if (backupRanges.size() == 1 || isDefaultBackup(backupRanges)) { RangeResult existingDestUidValues = wait(srcTr->getRange( KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY)); bool found = false; + KeyRangeRef targetRange = + (backupRanges.size() == 1) ? backupRanges[0] : getDefaultBackupSharedRange(); for (auto it : existingDestUidValues) { - if (BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), - IncludeVersion()) == backupRanges[0]) { + KeyRange uidRange = BinaryReader::fromStringRef( + it.key.removePrefix(destUidLookupPrefix), IncludeVersion()); + if (uidRange == targetRange) { if (destUidValue != it.value) { // existing backup/DR is running return Void(); @@ -1856,7 +1858,7 @@ struct CopyDiffLogsUpgradeTaskFunc : TaskFuncBase { } srcTr->set( - BinaryWriter::toValue(backupRanges[0], IncludeVersion(ProtocolVersion::withSharedMutations())) + BinaryWriter::toValue(targetRange, IncludeVersion(ProtocolVersion::withSharedMutations())) .withPrefix(destUidLookupPrefix), destUidValue); } @@ -1918,7 +1920,7 @@ struct CopyDiffLogsUpgradeTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef CopyDiffLogsUpgradeTaskFunc::name = LiteralStringRef("db_copy_diff_logs"); +StringRef CopyDiffLogsUpgradeTaskFunc::name = "db_copy_diff_logs"_sr; REGISTER_TASKFUNC(CopyDiffLogsUpgradeTaskFunc); struct BackupRestorableTaskFunc : TaskFuncBase { @@ -2031,7 +2033,7 @@ struct BackupRestorableTaskFunc : TaskFuncBase { task, parentTask->params[Task::reservedTaskParamValidKey], task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -2049,7 +2051,7 @@ struct BackupRestorableTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef BackupRestorableTaskFunc::name = LiteralStringRef("dr_backup_restorable"); +StringRef BackupRestorableTaskFunc::name = "dr_backup_restorable"_sr; REGISTER_TASKFUNC(BackupRestorableTaskFunc); struct StartFullBackupTaskFunc : TaskFuncBase { @@ -2078,24 +2080,29 @@ struct StartFullBackupTaskFunc : TaskFuncBase { srcTr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); // Initialize destUid - if (backupRanges.size() == 1) { + if (backupRanges.size() == 1 || isDefaultBackup(backupRanges)) { RangeResult existingDestUidValues = wait(srcTr->getRange( KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY)); + KeyRangeRef targetRange = + (backupRanges.size() == 1) ? 
backupRanges[0] : getDefaultBackupSharedRange(); bool found = false; for (auto it : existingDestUidValues) { - if (BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), - IncludeVersion()) == backupRanges[0]) { + KeyRange uidRange = BinaryReader::fromStringRef( + it.key.removePrefix(destUidLookupPrefix), IncludeVersion()); + if (uidRange == targetRange) { destUidValue = it.value; found = true; + CODE_PROBE(targetRange == getDefaultBackupSharedRange(), + "DR mutation sharing with default backup"); break; } } if (!found) { destUidValue = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned()); - srcTr->set(BinaryWriter::toValue(backupRanges[0], - IncludeVersion(ProtocolVersion::withSharedMutations())) - .withPrefix(destUidLookupPrefix), - destUidValue); + srcTr->set( + BinaryWriter::toValue(targetRange, IncludeVersion(ProtocolVersion::withSharedMutations())) + .withPrefix(destUidLookupPrefix), + destUidValue); } } @@ -2281,7 +2288,7 @@ struct StartFullBackupTaskFunc : TaskFuncBase { task->params[BackupAgentBase::keyConfigBackupRanges] = keyConfigBackupRanges; task->params[BackupAgentBase::keyTagName] = tagName; task->params[DatabaseBackupAgent::keyDatabasesInSync] = - backupAction == DatabaseBackupAgent::PreBackupAction::NONE ? LiteralStringRef("t") : LiteralStringRef("f"); + backupAction == DatabaseBackupAgent::PreBackupAction::NONE ? "t"_sr : "f"_sr; if (!waitFor) { return taskBucket->addTask(tr, @@ -2301,7 +2308,7 @@ struct StartFullBackupTaskFunc : TaskFuncBase { .get(logUid) .pack(BackupAgentBase::keyFolderId), task->params[BackupAgentBase::keyFolderId])); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -2319,7 +2326,7 @@ struct StartFullBackupTaskFunc : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef StartFullBackupTaskFunc::name = LiteralStringRef("dr_start_full_backup"); +StringRef StartFullBackupTaskFunc::name = "dr_start_full_backup"_sr; REGISTER_TASKFUNC(StartFullBackupTaskFunc); } // namespace dbBackup @@ -2625,7 +2632,7 @@ public: int64_t startCount = 0; state Key mapPrefix = logUidValue.withPrefix(applyMutationsKeyVersionMapRange.begin); - Key mapEnd = normalKeys.end.withPrefix(mapPrefix); + Key mapEnd = allKeys.end.withPrefix(mapPrefix); tr->set(logUidValue.withPrefix(applyMutationsAddPrefixRange.begin), addPrefix); tr->set(logUidValue.withPrefix(applyMutationsRemovePrefixRange.begin), removePrefix); tr->set(logUidValue.withPrefix(applyMutationsKeyVersionCountRange.begin), StringRef((uint8_t*)&startCount, 8)); @@ -2780,7 +2787,7 @@ public: Version destVersion = wait(tr3.getReadVersion()); TraceEvent("DBA_SwitchoverVersionUpgrade").detail("Src", commitVersion).detail("Dest", destVersion); if (destVersion <= commitVersion) { - TEST(true); // Forcing dest backup cluster to higher version + CODE_PROBE(true, "Forcing dest backup cluster to higher version"); tr3.set(minRequiredCommitVersionKey, BinaryWriter::toValue(commitVersion + 1, Unversioned())); wait(tr3.commit()); } else { @@ -2933,7 +2940,7 @@ public: Version applied = BinaryReader::fromStringRef(lastApplied.get(), Unversioned()); TraceEvent("DBA_AbortVersionUpgrade").detail("Src", applied).detail("Dest", current); if (current <= applied) { - TEST(true); // Upgrading version of local database. + CODE_PROBE(true, "Upgrading version of local database."); // The +1 is because we want to make sure that a versionstamped operation can't reuse // the same version as an already-applied transaction. 
tr->set(minRequiredCommitVersionKey, BinaryWriter::toValue(applied + 1, Unversioned())); @@ -3061,6 +3068,9 @@ public: loop { try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + wait(success(tr->getReadVersion())); // get the read version before getting a version from the source // database to prevent the time differential from going negative @@ -3072,9 +3082,6 @@ public: state UID logUid = wait(backupAgent->getLogUid(tr, tagName)); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - state Future> fPaused = tr->get(backupAgent->taskBucket->getPauseKey()); state Future fErrorValues = errorLimit > 0 diff --git a/fdbclient/DatabaseConfiguration.cpp b/fdbclient/DatabaseConfiguration.cpp index f19782346a..76fded095c 100644 --- a/fdbclient/DatabaseConfiguration.cpp +++ b/fdbclient/DatabaseConfiguration.cpp @@ -19,6 +19,7 @@ */ #include "fdbclient/DatabaseConfiguration.h" +#include "fdbclient/FDBTypes.h" #include "fdbclient/SystemData.h" #include "flow/ITrace.h" #include "flow/Trace.h" @@ -53,6 +54,7 @@ void DatabaseConfiguration::resetInternal() { storageMigrationType = StorageMigrationType::DEFAULT; blobGranulesEnabled = false; tenantMode = TenantMode::DISABLED; + encryptionAtRestMode = EncryptionAtRestMode::DISABLED; } int toInt(ValueRef const& v) { @@ -64,6 +66,16 @@ void parse(int* i, ValueRef const& v) { *i = atoi(v.toString().c_str()); } +void parse(int64_t* i, ValueRef const& v) { + // FIXME: Sanity checking + *i = atoll(v.toString().c_str()); +} + +void parse(double* i, ValueRef const& v) { + // FIXME: Sanity checking + *i = atof(v.toString().c_str()); +} + void parseReplicationPolicy(Reference* policy, ValueRef const& v) { BinaryReader reader(v, IncludeVersion()); serializeReplicationPolicy(reader, *policy); @@ -213,7 +225,8 @@ bool DatabaseConfiguration::isValid() const { (perpetualStorageWiggleSpeed == 0 || perpetualStorageWiggleSpeed == 1) && isValidPerpetualStorageWiggleLocality(perpetualStorageWiggleLocality) && storageMigrationType != StorageMigrationType::UNSET && tenantMode >= TenantMode::DISABLED && - tenantMode < TenantMode::END)) { + tenantMode < TenantMode::END && encryptionAtRestMode >= EncryptionAtRestMode::DISABLED && + encryptionAtRestMode < EncryptionAtRestMode::END)) { return false; } std::set dcIds; @@ -413,6 +426,7 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const { result["storage_migration_type"] = storageMigrationType.toString(); result["blob_granules_enabled"] = (int32_t)blobGranulesEnabled; result["tenant_mode"] = tenantMode.toString(); + result["encryption_at_rest_mode"] = encryptionAtRestMode.toString(); return result; } @@ -546,38 +560,38 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) { KeyRef ck = key.removePrefix(configKeysPrefix); int type; - if (ck == LiteralStringRef("initialized")) { + if (ck == "initialized"_sr) { initialized = true; - } else if (ck == LiteralStringRef("commit_proxies")) { + } else if (ck == "commit_proxies"_sr) { commitProxyCount = toInt(value); if (commitProxyCount == -1) overwriteProxiesCount(); - } else if (ck == LiteralStringRef("grv_proxies")) { + } else if (ck == "grv_proxies"_sr) { grvProxyCount = toInt(value); if (grvProxyCount == -1) overwriteProxiesCount(); - } else if (ck == LiteralStringRef("resolvers")) { + } else if (ck == "resolvers"_sr) { parse(&resolverCount, value); - } else if (ck == LiteralStringRef("logs")) { + } else if (ck == "logs"_sr) { 
parse(&desiredTLogCount, value); - } else if (ck == LiteralStringRef("log_replicas")) { + } else if (ck == "log_replicas"_sr) { parse(&tLogReplicationFactor, value); tLogWriteAntiQuorum = std::min(tLogWriteAntiQuorum, tLogReplicationFactor / 2); - } else if (ck == LiteralStringRef("log_anti_quorum")) { + } else if (ck == "log_anti_quorum"_sr) { parse(&tLogWriteAntiQuorum, value); if (tLogReplicationFactor > 0) { tLogWriteAntiQuorum = std::min(tLogWriteAntiQuorum, tLogReplicationFactor / 2); } - } else if (ck == LiteralStringRef("storage_replicas")) { + } else if (ck == "storage_replicas"_sr) { parse(&storageTeamSize, value); - } else if (ck == LiteralStringRef("tss_count")) { + } else if (ck == "tss_count"_sr) { parse(&desiredTSSCount, value); - } else if (ck == LiteralStringRef("log_version")) { + } else if (ck == "log_version"_sr) { parse((&type), value); type = std::max((int)TLogVersion::MIN_RECRUITABLE, type); type = std::min((int)TLogVersion::MAX_SUPPORTED, type); tLogVersion = (TLogVersion::Version)type; - } else if (ck == LiteralStringRef("log_engine")) { + } else if (ck == "log_engine"_sr) { parse((&type), value); tLogDataStoreType = (KeyValueStoreType::StoreType)type; // TODO: Remove this once Redwood works as a log engine @@ -588,61 +602,63 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) { if (tLogDataStoreType == KeyValueStoreType::MEMORY_RADIXTREE) { tLogDataStoreType = KeyValueStoreType::SSD_BTREE_V2; } - } else if (ck == LiteralStringRef("log_spill")) { + } else if (ck == "log_spill"_sr) { parse((&type), value); tLogSpillType = (TLogSpillType::SpillType)type; - } else if (ck == LiteralStringRef("storage_engine")) { + } else if (ck == "storage_engine"_sr) { parse((&type), value); storageServerStoreType = (KeyValueStoreType::StoreType)type; - } else if (ck == LiteralStringRef("tss_storage_engine")) { + } else if (ck == "tss_storage_engine"_sr) { parse((&type), value); testingStorageServerStoreType = (KeyValueStoreType::StoreType)type; - } else if (ck == LiteralStringRef("auto_commit_proxies")) { + } else if (ck == "auto_commit_proxies"_sr) { parse(&autoCommitProxyCount, value); - } else if (ck == LiteralStringRef("auto_grv_proxies")) { + } else if (ck == "auto_grv_proxies"_sr) { parse(&autoGrvProxyCount, value); - } else if (ck == LiteralStringRef("auto_resolvers")) { + } else if (ck == "auto_resolvers"_sr) { parse(&autoResolverCount, value); - } else if (ck == LiteralStringRef("auto_logs")) { + } else if (ck == "auto_logs"_sr) { parse(&autoDesiredTLogCount, value); - } else if (ck == LiteralStringRef("storage_replication_policy")) { + } else if (ck == "storage_replication_policy"_sr) { parseReplicationPolicy(&storagePolicy, value); - } else if (ck == LiteralStringRef("log_replication_policy")) { + } else if (ck == "log_replication_policy"_sr) { parseReplicationPolicy(&tLogPolicy, value); - } else if (ck == LiteralStringRef("log_routers")) { + } else if (ck == "log_routers"_sr) { parse(&desiredLogRouterCount, value); - } else if (ck == LiteralStringRef("remote_logs")) { + } else if (ck == "remote_logs"_sr) { parse(&remoteDesiredTLogCount, value); - } else if (ck == LiteralStringRef("remote_log_replicas")) { + } else if (ck == "remote_log_replicas"_sr) { parse(&remoteTLogReplicationFactor, value); - } else if (ck == LiteralStringRef("remote_log_policy")) { + } else if (ck == "remote_log_policy"_sr) { parseReplicationPolicy(&remoteTLogPolicy, value); - } else if (ck == LiteralStringRef("backup_worker_enabled")) { + } else if (ck == 
"backup_worker_enabled"_sr) { parse((&type), value); backupWorkerEnabled = (type != 0); - } else if (ck == LiteralStringRef("usable_regions")) { + } else if (ck == "usable_regions"_sr) { parse(&usableRegions, value); - } else if (ck == LiteralStringRef("repopulate_anti_quorum")) { + } else if (ck == "repopulate_anti_quorum"_sr) { parse(&repopulateRegionAntiQuorum, value); - } else if (ck == LiteralStringRef("regions")) { + } else if (ck == "regions"_sr) { parse(®ions, value); - } else if (ck == LiteralStringRef("perpetual_storage_wiggle")) { + } else if (ck == "perpetual_storage_wiggle"_sr) { parse(&perpetualStorageWiggleSpeed, value); - } else if (ck == LiteralStringRef("perpetual_storage_wiggle_locality")) { + } else if (ck == "perpetual_storage_wiggle_locality"_sr) { if (!isValidPerpetualStorageWiggleLocality(value.toString())) { return false; } perpetualStorageWiggleLocality = value.toString(); - } else if (ck == LiteralStringRef("storage_migration_type")) { + } else if (ck == "storage_migration_type"_sr) { parse((&type), value); storageMigrationType = (StorageMigrationType::MigrationType)type; - } else if (ck == LiteralStringRef("tenant_mode")) { + } else if (ck == "tenant_mode"_sr) { tenantMode = TenantMode::fromValue(value); - } else if (ck == LiteralStringRef("proxies")) { + } else if (ck == "proxies"_sr) { overwriteProxiesCount(); - } else if (ck == LiteralStringRef("blob_granules_enabled")) { + } else if (ck == "blob_granules_enabled"_sr) { parse((&type), value); blobGranulesEnabled = (type != 0); + } else if (ck == "encryption_at_rest_mode"_sr) { + encryptionAtRestMode = EncryptionAtRestMode::fromValue(value); } else { return false; } diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index 333cff9a81..bd34eedb43 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -21,14 +21,27 @@ #include "fmt/format.h" #include "fdbclient/BackupAgent.actor.h" #include "fdbclient/BackupContainer.h" +#include "fdbclient/BlobCipher.h" #include "fdbclient/DatabaseContext.h" +#include "fdbclient/GetEncryptCipherKeys.actor.h" +#include "fdbclient/JsonBuilder.h" +#include "fdbclient/KeyBackedTypes.h" #include "fdbclient/Knobs.h" #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/RestoreInterface.h" #include "fdbclient/Status.h" #include "fdbclient/SystemData.h" -#include "fdbclient/KeyBackedTypes.h" -#include "fdbclient/JsonBuilder.h" +#include "fdbclient/Tenant.h" +#include "fdbclient/TenantEntryCache.actor.h" + +#include "flow/Arena.h" +#include "flow/CodeProbe.h" +#include "flow/EncryptUtils.h" +#include "flow/network.h" +#include "flow/ObjectSerializer.h" +#include "flow/ProtocolVersion.h" +#include "flow/serialize.h" +#include "flow/Trace.h" #include #include @@ -36,10 +49,15 @@ #include "flow/IAsyncFile.h" #include "flow/genericactors.actor.h" #include "flow/Hash3.h" +#include "flow/xxhash.h" + +#include #include #include #include #include +#include +#include #include "flow/actorcompiler.h" // This must be the last #include. @@ -47,7 +65,7 @@ FDB_DEFINE_BOOLEAN_PARAM(IncrementalBackupOnly); FDB_DEFINE_BOOLEAN_PARAM(OnlyApplyMutationLogs); #define SevFRTestInfo SevVerbose -//#define SevFRTestInfo SevInfo +// #define SevFRTestInfo SevInfo static std::string boolToYesOrNo(bool val) { return val ? 
std::string("Yes") : std::string("No"); @@ -90,7 +108,7 @@ std::string secondsToTimeFormat(int64_t seconds) { return format("%lld second(s)", seconds); } -const Key FileBackupAgent::keyLastRestorable = LiteralStringRef("last_restorable"); +const Key FileBackupAgent::keyLastRestorable = "last_restorable"_sr; // For convenience typedef FileBackupAgent::ERestoreState ERestoreState; @@ -98,19 +116,19 @@ typedef FileBackupAgent::ERestoreState ERestoreState; StringRef FileBackupAgent::restoreStateText(ERestoreState id) { switch (id) { case ERestoreState::UNITIALIZED: - return LiteralStringRef("unitialized"); + return "unitialized"_sr; case ERestoreState::QUEUED: - return LiteralStringRef("queued"); + return "queued"_sr; case ERestoreState::STARTING: - return LiteralStringRef("starting"); + return "starting"_sr; case ERestoreState::RUNNING: - return LiteralStringRef("running"); + return "running"_sr; case ERestoreState::COMPLETED: - return LiteralStringRef("completed"); + return "completed"_sr; case ERestoreState::ABORTED: - return LiteralStringRef("aborted"); + return "aborted"_sr; default: - return LiteralStringRef("Unknown"); + return "Unknown"_sr; } } @@ -123,9 +141,9 @@ ACTOR Future> TagUidMap::getAll_impl(TagUidMap* tagsMa Reference tr, Snapshot snapshot) { state Key prefix = tagsMap->prefix; // Copying it here as tagsMap lifetime is not tied to this actor - TagMap::PairsType tagPairs = wait(tagsMap->getRange(tr, std::string(), {}, 1e6, snapshot)); + TagMap::RangeResultType tagPairs = wait(tagsMap->getRange(tr, std::string(), {}, 1e6, snapshot)); std::vector results; - for (auto& p : tagPairs) + for (auto& p : tagPairs.results) results.push_back(KeyBackedTag(p.first, prefix)); return results; } @@ -139,41 +157,37 @@ public: RestoreConfig(UID uid = UID()) : KeyBackedConfig(fileRestorePrefixRange.begin, uid) {} RestoreConfig(Reference task) : KeyBackedConfig(fileRestorePrefixRange.begin, task) {} - KeyBackedProperty stateEnum() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty stateEnum() { return configSpace.pack(__FUNCTION__sr); } Future stateText(Reference tr) { return map(stateEnum().getD(tr), [](ERestoreState s) -> StringRef { return FileBackupAgent::restoreStateText(s); }); } - KeyBackedProperty addPrefix() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty removePrefix() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty onlyApplyMutationLogs() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty inconsistentSnapshotOnly() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty addPrefix() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty removePrefix() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty onlyApplyMutationLogs() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty inconsistentSnapshotOnly() { return configSpace.pack(__FUNCTION__sr); } // XXX: Remove restoreRange() once it is safe to remove. 
It has been changed to restoreRanges - KeyBackedProperty restoreRange() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty> restoreRanges() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } - KeyBackedProperty batchFuture() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty beginVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty restoreVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty firstConsistentVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty restoreRange() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty> restoreRanges() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty batchFuture() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty beginVersion() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty restoreVersion() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty firstConsistentVersion() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty> sourceContainer() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty> sourceContainer() { return configSpace.pack(__FUNCTION__sr); } // Get the source container as a bare URL, without creating a container instance - KeyBackedProperty sourceContainerURL() { return configSpace.pack(LiteralStringRef("sourceContainer")); } + KeyBackedProperty sourceContainerURL() { return configSpace.pack("sourceContainer"_sr); } // Total bytes written by all log and range restore tasks. - KeyBackedBinaryValue bytesWritten() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue bytesWritten() { return configSpace.pack(__FUNCTION__sr); } // File blocks that have had tasks created for them by the Dispatch task - KeyBackedBinaryValue filesBlocksDispatched() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue filesBlocksDispatched() { return configSpace.pack(__FUNCTION__sr); } // File blocks whose tasks have finished - KeyBackedBinaryValue fileBlocksFinished() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue fileBlocksFinished() { return configSpace.pack(__FUNCTION__sr); } // Total number of files in the fileMap - KeyBackedBinaryValue fileCount() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue fileCount() { return configSpace.pack(__FUNCTION__sr); } // Total number of file blocks in the fileMap - KeyBackedBinaryValue fileBlockCount() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue fileBlockCount() { return configSpace.pack(__FUNCTION__sr); } Future> getRestoreRangesOrDefault(Reference tr) { return getRestoreRangesOrDefault_impl(this, tr); @@ -200,13 +214,7 @@ public: Version endVersion{ ::invalidVersion }; // not meaningful for range files Tuple pack() const { - return Tuple() - .append(version) - .append(StringRef(fileName)) - .append(isRange) - .append(fileSize) - .append(blockSize) - .append(endVersion); + return Tuple::makeTuple(version, fileName, (int)isRange, fileSize, blockSize, endVersion); } static RestoreFile unpack(Tuple const& t) { RestoreFile r; @@ -222,7 +230,7 @@ public: }; typedef KeyBackedSet FileSetT; - FileSetT fileSet() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + FileSetT fileSet() { return configSpace.pack(__FUNCTION__sr); } Future isRunnable(Reference 
tr) { return map(stateEnum().getD(tr), [](ERestoreState s) -> bool { @@ -464,8 +472,445 @@ Value makePadding(int size) { return pad.substr(0, size); } +struct IRangeFileWriter { +public: + virtual Future padEnd(bool final) = 0; + + virtual Future writeKV(Key k, Value v) = 0; + + virtual Future writeKey(Key k) = 0; + + virtual Future finish() = 0; + + virtual ~IRangeFileWriter() {} +}; + +struct SnapshotFileBackupEncryptionKeys { + Reference textCipherKey; + EncryptCipherDomainName textDomain; + Reference headerCipherKey; + StringRef ivRef; +}; + // File Format handlers. -// Both Range and Log formats are designed to be readable starting at any 1MB boundary +// Both Range and Log formats are designed to be readable starting at any BACKUP_RANGEFILE_BLOCK_SIZE boundary +// so they can be read in parallel. +// +// Writer instances must be kept alive while any member actors are in progress. +// +// EncryptedRangeFileWriter must be used as follows: +// 1 - writeKey(key) the queried key range begin +// 2 - writeKV(k, v) each kv pair to restore +// 3 - writeKey(key) the queried key range end +// 4 - finish() +// +// EncryptedRangeFileWriter will insert the required padding, header, and extra +// end/begin keys around the 1MB boundaries as needed. +// +// Example: +// The range a-z is queried and returns c-j which covers 3 blocks across 2 tenants. +// The client code writes keys in this sequence: +// t1a t1c t1d t1e t1f t1g t2h t2i t2j t2z +// +// H = header P = padding a...z = keys v = value | = block boundary +// +// Encoded file: H t1a t1cv t1dv t1ev P | H t1e t1ev t1fv t1gv t2 P | H t2 t2hv t2iv t2jv t2z +// Decoded in blocks yields: +// Block 1: range [t1a, t1e) with kv pairs t1cv, t1dv +// Block 2: range [t1e, t2) with kv pairs t1ev, t1fv, t1gv +// Block 3: range [t2, t2z) with kv pairs t2hv, t2iv, t2jv +// +// NOTE: All blocks except for the final block will have one last +// value which will not be used. This isn't actually a waste since +// if the next KV pair wouldn't fit within the block after the value +// then the space after the final key to the next 1MB boundary would +// just be padding anyway. +// +// NOTE: For the EncryptedRangeFileWriter blocks will be split either on the BACKUP_RANGEFILE_BLOCK_SIZE boundary or +// when a new tenant id is encountered.
If a block is split for crossing tenant boundaries then the last key will be +// truncated to just the tenant prefix and the value will be empty (to avoid having sensitive data of one tenant be +// encrypted with a key for a different tenant) +struct EncryptedRangeFileWriter : public IRangeFileWriter { + struct Options { + constexpr static FileIdentifier file_identifier = 3152016; + + // TODO: Compression is not currently supported so this should always be false + bool compressionEnabled = false; + + Options() {} + + template + void serialize(Ar& ar) { + serializer(ar, compressionEnabled); + } + }; + + EncryptedRangeFileWriter(Database cx, + Arena* arena, + Reference> tenantCache, + Reference file = Reference(), + int blockSize = 0, + Options options = Options()) + : cx(cx), arena(arena), tenantCache(tenantCache), file(file), blockSize(blockSize), blockEnd(0), + fileVersion(BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION), options(options) { + buffer = makeString(blockSize); + wPtr = mutateString(buffer); + } + + static void validateEncryptionHeader(Reference headerCipherKey, + Reference textCipherKey, + BlobCipherEncryptHeader& header) { + // Validate encryption header 'cipherHeader' details + if (!(header.cipherHeaderDetails.baseCipherId == headerCipherKey->getBaseCipherId() && + header.cipherHeaderDetails.encryptDomainId == headerCipherKey->getDomainId() && + header.cipherHeaderDetails.salt == headerCipherKey->getSalt())) { + TraceEvent(SevWarn, "EncryptionHeader_CipherHeaderMismatch") + .detail("HeaderDomainId", headerCipherKey->getDomainId()) + .detail("ExpectedHeaderDomainId", header.cipherHeaderDetails.encryptDomainId) + .detail("HeaderBaseCipherId", headerCipherKey->getBaseCipherId()) + .detail("ExpectedHeaderBaseCipherId", header.cipherHeaderDetails.baseCipherId) + .detail("HeaderSalt", headerCipherKey->getSalt()) + .detail("ExpectedHeaderSalt", header.cipherHeaderDetails.salt); + throw encrypt_header_metadata_mismatch(); + } + + // Validate encryption text 'cipherText' details sanity + if (!(header.cipherTextDetails.baseCipherId == textCipherKey->getBaseCipherId() && + header.cipherTextDetails.encryptDomainId == textCipherKey->getDomainId() && + header.cipherTextDetails.salt == textCipherKey->getSalt())) { + TraceEvent(SevWarn, "EncryptionHeader_CipherTextMismatch") + .detail("TextDomainId", textCipherKey->getDomainId()) + .detail("ExpectedTextDomainId", header.cipherTextDetails.encryptDomainId) + .detail("TextBaseCipherId", textCipherKey->getBaseCipherId()) + .detail("ExpectedTextBaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("TextSalt", textCipherKey->getSalt()) + .detail("ExpectedTextSalt", header.cipherTextDetails.salt); + throw encrypt_header_metadata_mismatch(); + } + } + + ACTOR static Future decryptImpl(Database cx, + StringRef headerS, + const uint8_t* dataP, + int64_t dataLen, + Arena* arena) { + Reference const> dbInfo = cx->clientInfo; + state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS); + TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, header, BlobCipherMetrics::BACKUP)); + ASSERT(cipherKeys.cipherHeaderKey.isValid() && cipherKeys.cipherTextKey.isValid()); + validateEncryptionHeader(cipherKeys.cipherHeaderKey, cipherKeys.cipherTextKey, header); + DecryptBlobCipherAes256Ctr decryptor( + cipherKeys.cipherTextKey, cipherKeys.cipherHeaderKey, header.iv, BlobCipherMetrics::BACKUP); + return decryptor.decrypt(dataP, dataLen, header, *arena)->toStringRef(); + } + + static Future decrypt(Database cx, + 
StringRef headerS, + const uint8_t* dataP, + int64_t dataLen, + Arena* arena) { + return decryptImpl(cx, headerS, dataP, dataLen, arena); + } + + ACTOR static Future> refreshKey(EncryptedRangeFileWriter* self, + EncryptCipherDomainId domainId, + EncryptCipherDomainName domainName) { + Reference const> dbInfo = self->cx->clientInfo; + TextAndHeaderCipherKeys cipherKeys = + wait(getLatestEncryptCipherKeysForDomain(dbInfo, domainId, domainName, BlobCipherMetrics::BACKUP)); + return cipherKeys.cipherTextKey; + } + + ACTOR static Future encrypt(EncryptedRangeFileWriter* self) { + ASSERT(self->cipherKeys.headerCipherKey.isValid() && self->cipherKeys.textCipherKey.isValid()); + // Ensure that the keys we got are still valid before flushing the block + if (self->cipherKeys.headerCipherKey->isExpired() || self->cipherKeys.headerCipherKey->needsRefresh()) { + Reference cipherKey = + wait(refreshKey(self, self->cipherKeys.headerCipherKey->getDomainId(), FDB_ENCRYPT_HEADER_DOMAIN_NAME)); + self->cipherKeys.headerCipherKey = cipherKey; + } + if (self->cipherKeys.textCipherKey->isExpired() || self->cipherKeys.textCipherKey->needsRefresh()) { + Reference cipherKey = + wait(refreshKey(self, self->cipherKeys.textCipherKey->getDomainId(), self->cipherKeys.textDomain)); + self->cipherKeys.textCipherKey = cipherKey; + } + EncryptBlobCipherAes265Ctr encryptor(self->cipherKeys.textCipherKey, + self->cipherKeys.headerCipherKey, + self->cipherKeys.ivRef.begin(), + AES_256_IV_LENGTH, + ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE, + BlobCipherMetrics::BACKUP); + Arena arena; + int64_t payloadSize = self->wPtr - self->dataPayloadStart; + auto encryptedData = encryptor.encrypt(self->dataPayloadStart, payloadSize, self->encryptHeader, arena); + + // re-write encrypted data to buffer + std::memcpy(self->dataPayloadStart, encryptedData->begin(), payloadSize); + return Void(); + } + + ACTOR static Future updateEncryptionKeysCtx(EncryptedRangeFileWriter* self, + KeyRef key, + Reference> cache) { + state std::pair curTenantInfo = wait(getEncryptionDomainDetails(key, cache)); + state Reference const> dbInfo = self->cx->clientInfo; + + // Get text and header cipher key + TextAndHeaderCipherKeys textAndHeaderCipherKeys = wait(getLatestEncryptCipherKeysForDomain( + dbInfo, curTenantInfo.first, curTenantInfo.second, BlobCipherMetrics::BACKUP)); + self->cipherKeys.textCipherKey = textAndHeaderCipherKeys.cipherTextKey; + self->cipherKeys.textDomain = curTenantInfo.second; + self->cipherKeys.headerCipherKey = textAndHeaderCipherKeys.cipherHeaderKey; + + // Set ivRef + self->cipherKeys.ivRef = makeString(AES_256_IV_LENGTH, *self->arena); + deterministicRandom()->randomBytes(mutateString(self->cipherKeys.ivRef), AES_256_IV_LENGTH); + return Void(); + } + + // Returns the number of bytes that have been written to the buffer + static int64_t currentBufferSize(EncryptedRangeFileWriter* self) { return self->wPtr - self->buffer.begin(); } + + static int64_t expectedFileSize(EncryptedRangeFileWriter* self) { + // Return what has already been written to file plus the size of the current buffer + // which indicates how many bytes the file will contain once the buffer is written + return self->file->size() + currentBufferSize(self); + } + + static void copyToBuffer(EncryptedRangeFileWriter* self, const void* src, size_t size) { + if (size > 0) { + std::memcpy(self->wPtr, src, size); + self->wPtr += size; + ASSERT(currentBufferSize(self) <= self->blockSize); + } + } + + static void appendStringRefWithLenToBuffer(EncryptedRangeFileWriter* self, 
StringRef* s) { + // Append the string length followed by the string to the buffer + uint32_t lenBuf = bigEndian32((uint32_t)s->size()); + copyToBuffer(self, &lenBuf, sizeof(lenBuf)); + copyToBuffer(self, s->begin(), s->size()); + } + + static bool isSystemKey(KeyRef key) { return key.size() && key[0] == systemKeys.begin[0]; } + + ACTOR static Future> getEncryptionDomainDetailsImpl( + KeyRef key, + Reference> tenantCache) { + if (isSystemKey(key)) { + return std::make_pair(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME); + } + if (key.size() < TENANT_PREFIX_SIZE) { + return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME); + } + KeyRef tenantPrefix = KeyRef(key.begin(), TENANT_PREFIX_SIZE); + state int64_t tenantId = TenantMapEntry::prefixToId(tenantPrefix); + Optional> payload = wait(tenantCache->getById(tenantId)); + if (payload.present()) { + return std::make_pair(tenantId, payload.get().name); + } + return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME); + } + + static Future> getEncryptionDomainDetails( + KeyRef key, + Reference> tenantCache) { + return getEncryptionDomainDetailsImpl(key, tenantCache); + } + + // Handles the first block and internal blocks. Ends current block if needed. + // The final flag is used in simulation to pad the file's final block to a whole block size + ACTOR static Future newBlock(EncryptedRangeFileWriter* self, + int bytesNeeded, + KeyRef lastKey, + bool writeValue, + bool final = false) { + // Write padding to finish current block if needed + int bytesLeft = self->blockEnd - expectedFileSize(self); + ASSERT(bytesLeft >= 0); + if (bytesLeft > 0) { + state Value paddingFFs = makePadding(bytesLeft); + copyToBuffer(self, paddingFFs.begin(), bytesLeft); + } + + if (expectedFileSize(self) > 0) { + // write buffer to file since block is finished + ASSERT(currentBufferSize(self) == self->blockSize); + wait(encrypt(self)); + wait(self->file->append(self->buffer.begin(), self->blockSize)); + + // reset write pointer to beginning of StringRef + self->wPtr = mutateString(self->buffer); + } + + if (final) { + ASSERT(g_network->isSimulated()); + return Void(); + } + + // Set new blockEnd + self->blockEnd += self->blockSize; + + // write Header + copyToBuffer(self, (uint8_t*)&self->fileVersion, sizeof(self->fileVersion)); + + // write options struct + Value serialized = + ObjectWriter::toValue(self->options, IncludeVersion(ProtocolVersion::withEncryptedSnapshotBackupFile())); + appendStringRefWithLenToBuffer(self, &serialized); + + // leave space for encryption header + self->encryptHeader = (BlobCipherEncryptHeader*)self->wPtr; + self->wPtr += BlobCipherEncryptHeader::headerSize; + self->dataPayloadStart = self->wPtr; + + // If this is NOT the first block then write duplicate stuff needed from last block + if (self->blockEnd > self->blockSize) { + appendStringRefWithLenToBuffer(self, &lastKey); + appendStringRefWithLenToBuffer(self, &self->lastKey); + if (writeValue) { + appendStringRefWithLenToBuffer(self, &self->lastValue); + } + } + + // There must now be room in the current block for bytesNeeded or the block size is too small + if (expectedFileSize(self) + bytesNeeded > self->blockEnd) { + throw backup_bad_block_size(); + } + + return Void(); + } + + Future padEnd(bool final) { + if (expectedFileSize(this) > 0) { + return newBlock(this, 0, StringRef(), true, final); + } + return Void(); + } + + // Ends the current block if necessary based on bytesNeeded. 
+ ACTOR static Future newBlockIfNeeded(EncryptedRangeFileWriter* self, int bytesNeeded) { + if (expectedFileSize(self) + bytesNeeded > self->blockEnd) { + wait(newBlock(self, bytesNeeded, self->lastKey, true)); + } + return Void(); + } + + ACTOR static Future handleTenantBondary(EncryptedRangeFileWriter* self, + Key k, + Value v, + bool writeValue, + std::pair curKeyTenantInfo) { + state KeyRef endKey = k; + // If we are crossing a boundary with a key that has a tenant prefix then truncate it + if (curKeyTenantInfo.first != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID && + curKeyTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) { + endKey = StringRef(k.begin(), TENANT_PREFIX_SIZE); + } + state ValueRef newValue = StringRef(); + self->lastKey = k; + self->lastValue = v; + appendStringRefWithLenToBuffer(self, &endKey); + appendStringRefWithLenToBuffer(self, &newValue); + wait(newBlock(self, 0, endKey, writeValue)); + wait(updateEncryptionKeysCtx(self, self->lastKey, self->tenantCache)); + return Void(); + } + + ACTOR static Future finishCurTenantBlockStartNewIfNeeded(EncryptedRangeFileWriter* self, + Key k, + Value v, + bool writeValue) { + // Don't want to start a new block if the current key or previous key is empty + if (self->lastKey.size() == 0 || k.size() == 0) { + return false; + } + state std::pair curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self->tenantCache)); + state std::pair prevKeyTenantInfo = + wait(getEncryptionDomainDetails(self->lastKey, self->tenantCache)); + // crossing tenant boundaries so finish the current block using only the tenant prefix of the new key + if (curKeyTenantInfo.first != prevKeyTenantInfo.first) { + CODE_PROBE(true, "crossed tenant boundaries"); + wait(handleTenantBondary(self, k, v, writeValue, curKeyTenantInfo)); + return true; + } + return false; + } + + // Start a new block if needed, then write the key and value + ACTOR static Future writeKV_impl(EncryptedRangeFileWriter* self, Key k, Value v) { + if (!self->cipherKeys.headerCipherKey.isValid() || !self->cipherKeys.textCipherKey.isValid()) { + wait(updateEncryptionKeysCtx(self, k, self->tenantCache)); + } + state int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size(); + wait(newBlockIfNeeded(self, toWrite)); + bool createdNewBlock = wait(finishCurTenantBlockStartNewIfNeeded(self, k, v, true)); + if (createdNewBlock) { + return Void(); + } + appendStringRefWithLenToBuffer(self, &k); + appendStringRefWithLenToBuffer(self, &v); + self->lastKey = k; + self->lastValue = v; + return Void(); + } + + Future writeKV(Key k, Value v) { return writeKV_impl(this, k, v); } + + // Write begin key or end key. + ACTOR static Future writeKey_impl(EncryptedRangeFileWriter* self, Key k) { + // TODO (Nim): Is it possible to write empty begin and end keys? 
+ if (k.size() > 0 && + (!self->cipherKeys.headerCipherKey.isValid() || !self->cipherKeys.textCipherKey.isValid())) { + wait(updateEncryptionKeysCtx(self, k, self->tenantCache)); + } + + // Need to account for extra "empty" value being written in the case of crossing tenant boundaries + int toWrite = sizeof(uint32_t) + k.size() + sizeof(uint32_t); + wait(newBlockIfNeeded(self, toWrite)); + bool createdNewBlock = wait(finishCurTenantBlockStartNewIfNeeded(self, k, StringRef(), false)); + if (createdNewBlock) { + return Void(); + } + appendStringRefWithLenToBuffer(self, &k); + self->lastKey = k; + return Void(); + } + + Future writeKey(Key k) { return writeKey_impl(this, k); } + + ACTOR static Future finish_impl(EncryptedRangeFileWriter* self) { + // Write any outstanding bytes to the file + if (currentBufferSize(self) > 0) { + wait(encrypt(self)); + wait(self->file->append(self->buffer.begin(), currentBufferSize(self))); + } + return Void(); + } + + Future finish() { return finish_impl(this); } + + Database cx; + Arena* arena; + Reference> tenantCache; + Reference file; + int blockSize; + +private: + Standalone buffer; + uint8_t* wPtr; + BlobCipherEncryptHeader* encryptHeader; + uint8_t* dataPayloadStart; + int64_t blockEnd; + uint32_t fileVersion; + Options options; + Key lastKey; + Key lastValue; + SnapshotFileBackupEncryptionKeys cipherKeys; +}; + +// File Format handlers. +// Both Range and Log formats are designed to be readable starting at any BACKUP_RANGEFILE_BLOCK_SIZE boundary // so they can be read in parallel. // // Writer instances must be kept alive while any member actors are in progress. @@ -474,6 +919,7 @@ Value makePadding(int size) { // 1 - writeKey(key) the queried key range begin // 2 - writeKV(k, v) each kv pair to restore // 3 - writeKey(key) the queried key range end +// 4 - finish() // // RangeFileWriter will insert the required padding, header, and extra // end/begin keys around the 1MB boundaries as needed. @@ -496,7 +942,7 @@ Value makePadding(int size) { // if the next KV pair wouldn't fit within the block after the value // then the space after the final key to the next 1MB boundary would // just be padding anyway. -struct RangeFileWriter { +struct RangeFileWriter : public IRangeFileWriter { RangeFileWriter(Reference file = Reference(), int blockSize = 0) : file(file), blockSize(blockSize), blockEnd(0), fileVersion(BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {} @@ -536,10 +982,10 @@ struct RangeFileWriter { } // Used in simulation only to create backup file sizes which are an integer multiple of the block size - Future padEnd() { + Future padEnd(bool final) { ASSERT(g_network->isSimulated()); if (file->size() > 0) { - return newBlock(this, 0, true); + return newBlock(this, 0, final); } return Void(); } @@ -574,6 +1020,8 @@ struct RangeFileWriter { Future writeKey(Key k) { return writeKey_impl(this, k); } + Future finish() { return Void(); } + Reference file; int blockSize; @@ -584,9 +1032,49 @@ private: Key lastValue; }; +ACTOR static Future decodeKVPairs(StringRefReader* reader, + Standalone>* results, + bool encryptedBlock, + Optional cx, + Reference> tenantCache) { + // Read begin key, if this fails then block was invalid. + state uint32_t kLen = reader->consumeNetworkUInt32(); + state const uint8_t* k = reader->consume(kLen); + results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); + + // Read kv pairs and end key + while (1) { + // Read a key. 
+ kLen = reader->consumeNetworkUInt32(); + k = reader->consume(kLen); + + // If eof reached or first value len byte is 0xFF then a valid block end was reached. + if (reader->eof() || *reader->rptr == 0xFF) { + results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); + break; + } + + // Read a value, which must exist or the block is invalid + uint32_t vLen = reader->consumeNetworkUInt32(); + const uint8_t* v = reader->consume(vLen); + results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen))); + + // If eof reached or first byte of next key len is 0xFF then a valid block end was reached. + if (reader->eof() || *reader->rptr == 0xFF) + break; + } + + // Make sure any remaining bytes in the block are 0xFF + for (auto b : reader->remainder()) + if (b != 0xFF) + throw restore_corrupted_data_padding(); + return Void(); +} + ACTOR Future>> decodeRangeFileBlock(Reference file, int64_t offset, - int len) { + int len, + Optional cx) { state Standalone buf = makeString(len); int rLen = wait(file->read(mutateString(buf), len, offset)); if (rLen != len) @@ -594,48 +1082,44 @@ ACTOR Future>> decodeRangeFileBlock(Reference< simulateBlobFailure(); - Standalone> results({}, buf.arena()); + state Standalone> results({}, buf.arena()); state StringRefReader reader(buf, restore_corrupted_data()); + state Arena arena; try { - // Read header, currently only decoding BACKUP_AGENT_SNAPSHOT_FILE_VERSION - if (reader.consume() != BACKUP_AGENT_SNAPSHOT_FILE_VERSION) + // Read header, currently only decoding BACKUP_AGENT_SNAPSHOT_FILE_VERSION or + // BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION + int32_t file_version = reader.consume(); + if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) { + wait(decodeKVPairs(&reader, &results, false, cx, Reference>())); + } else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) { + CODE_PROBE(true, "decoding encrypted block"); + ASSERT(cx.present()); + // decode options struct + uint32_t optionsLen = reader.consumeNetworkUInt32(); + const uint8_t* o = reader.consume(optionsLen); + StringRef optionsStringRef = StringRef(o, optionsLen); + EncryptedRangeFileWriter::Options options = + ObjectReader::fromStringRef(optionsStringRef, IncludeVersion()); + ASSERT(!options.compressionEnabled); + + // read encryption header + const uint8_t* headerStart = reader.consume(BlobCipherEncryptHeader::headerSize); + StringRef header = StringRef(headerStart, BlobCipherEncryptHeader::headerSize); + const uint8_t* dataPayloadStart = headerStart + BlobCipherEncryptHeader::headerSize; + // calculate the total bytes read up to (and including) the header + int64_t bytesRead = sizeof(int32_t) + sizeof(uint32_t) + optionsLen + BlobCipherEncryptHeader::headerSize; + // get the size of the encrypted payload and decrypt it + int64_t dataLen = len - bytesRead; + StringRef decryptedData = + wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena())); + reader = StringRefReader(decryptedData, restore_corrupted_data()); + Reference> tenantCache = makeReference>(cx.get()); + wait(decodeKVPairs(&reader, &results, true, cx, tenantCache)); + } else { throw restore_unsupported_file_version(); - - // Read begin key, if this fails then block was invalid. - uint32_t kLen = reader.consumeNetworkUInt32(); - const uint8_t* k = reader.consume(kLen); - results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); - - // Read kv pairs and end key - while (1) { - // Read a key. 
- kLen = reader.consumeNetworkUInt32(); - k = reader.consume(kLen); - - // If eof reached or first value len byte is 0xFF then a valid block end was reached. - if (reader.eof() || *reader.rptr == 0xFF) { - results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); - break; - } - - // Read a value, which must exist or the block is invalid - uint32_t vLen = reader.consumeNetworkUInt32(); - const uint8_t* v = reader.consume(vLen); - results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen))); - - // If eof reached or first byte of next key len is 0xFF then a valid block end was reached. - if (reader.eof() || *reader.rptr == 0xFF) - break; } - - // Make sure any remaining bytes in the block are 0xFF - for (auto b : reader.remainder()) - if (b != 0xFF) - throw restore_corrupted_data_padding(); - return results; - } catch (Error& e) { TraceEvent(SevWarn, "FileRestoreDecodeRangeFileBlockFailed") .error(e) @@ -781,8 +1265,7 @@ ACTOR static Future abortFiveZeroBackup(FileBackupAgent* backupAgent, state Subspace statusSpace = backupAgent->subspace.get(BackupAgentBase::keyStates).get(uid.toString()); state Subspace globalConfig = backupAgent->subspace.get(BackupAgentBase::keyConfig).get(uid.toString()); - state Subspace newConfigSpace = - uidPrefixKey(LiteralStringRef("uid->config/").withPrefix(fileBackupPrefixRange.begin), uid); + state Subspace newConfigSpace = uidPrefixKey("uid->config/"_sr.withPrefix(fileBackupPrefixRange.begin), uid); Optional statusStr = wait(tr->get(statusSpace.pack(FileBackupAgent::keyStateStatus))); state EBackupState status = @@ -822,8 +1305,6 @@ struct AbortFiveZeroBackupTask : TaskFuncBase { state FileBackupAgent backupAgent; state std::string tagName = task->params[BackupAgentBase::keyConfigBackupTag].toString(); - TEST(true); // Canceling old backup task - TraceEvent(SevInfo, "FileBackupCancelOldTask") .detail("Task", task->params[Task::reservedTaskParamKeyType]) .detail("TagName", tagName); @@ -853,7 +1334,7 @@ struct AbortFiveZeroBackupTask : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef AbortFiveZeroBackupTask::name = LiteralStringRef("abort_legacy_backup"); +StringRef AbortFiveZeroBackupTask::name = "abort_legacy_backup"_sr; REGISTER_TASKFUNC(AbortFiveZeroBackupTask); REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_diff_logs); REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_log_range); @@ -908,8 +1389,6 @@ struct AbortFiveOneBackupTask : TaskFuncBase { state BackupConfig config(task); state std::string tagName = wait(config.tag().getOrThrow(tr)); - TEST(true); // Canceling 5.1 backup task - TraceEvent(SevInfo, "FileBackupCancelFiveOneTask") .detail("Task", task->params[Task::reservedTaskParamKeyType]) .detail("TagName", tagName); @@ -939,7 +1418,7 @@ struct AbortFiveOneBackupTask : TaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef AbortFiveOneBackupTask::name = LiteralStringRef("abort_legacy_backup_5.2"); +StringRef AbortFiveOneBackupTask::name = "abort_legacy_backup_5.2"_sr; REGISTER_TASKFUNC(AbortFiveOneBackupTask); REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_range); REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_dispatch_ranges); @@ -978,7 +1457,7 @@ ACTOR static Future addBackupTask(StringRef name, } wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } // Clears the backup ID from "backupStartedKey" to pause backup workers. 
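For reference, the payload that decodeKVPairs() above walks is simply a begin key, a run of key/value pairs, and an end key, each written as a big-endian 32-bit length followed by the bytes, with any space left before the block boundary filled with 0xFF. The following standalone sketch decodes that layout the same way; it is illustrative only, uses no FoundationDB types, and every name in it is a placeholder.

// Illustrative decoder for the plain (unencrypted) block payload format described above.
// Placeholder code only; not part of this patch and not a FoundationDB API.
#include <cstdint>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

static uint32_t readBigEndian32(const uint8_t* p) {
    return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) | (uint32_t(p[2]) << 8) | uint32_t(p[3]);
}

// Returns the begin key, the kv pairs, and the end key recovered from one block payload.
// Keys that delimit the block's range are paired with an empty value, as in decodeKVPairs().
std::vector<std::pair<std::string, std::string>> decodeBlockPayload(const uint8_t* data, size_t len) {
    std::vector<std::pair<std::string, std::string>> out;
    size_t pos = 0;
    auto readString = [&]() -> std::string {
        if (pos + 4 > len)
            throw std::runtime_error("corrupted block");
        uint32_t n = readBigEndian32(data + pos);
        pos += 4;
        if (pos + n > len)
            throw std::runtime_error("corrupted block");
        std::string s(reinterpret_cast<const char*>(data + pos), n);
        pos += n;
        return s;
    };
    out.emplace_back(readString(), std::string()); // begin key of the block's range
    for (;;) {
        std::string k = readString();
        // End of block: eof, or a 0xFF padding byte where the value length would start.
        if (pos == len || data[pos] == 0xFF) {
            out.emplace_back(std::move(k), std::string()); // end key, no value
            break;
        }
        std::string v = readString();
        out.emplace_back(std::move(k), std::move(v));
        if (pos == len || data[pos] == 0xFF)
            break;
    }
    for (; pos < len; ++pos) // everything after the end key must be 0xFF padding
        if (data[pos] != 0xFF)
            throw std::runtime_error("corrupted padding");
    return out;
}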
@@ -1050,9 +1529,9 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam beginKey() { return LiteralStringRef(__FUNCTION__); } - static TaskParam endKey() { return LiteralStringRef(__FUNCTION__); } - static TaskParam addBackupRangeTasks() { return LiteralStringRef(__FUNCTION__); } + static TaskParam beginKey() { return __FUNCTION__sr; } + static TaskParam endKey() { return __FUNCTION__sr; } + static TaskParam addBackupRangeTasks() { return __FUNCTION__sr; } } Params; std::string toString(Reference task) const override { @@ -1077,8 +1556,8 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { return _finish(tr, tb, fb, task); }; - // Finish (which flushes/syncs) the file, and then in a single transaction, make some range backup progress durable. - // This means: + // Finish (which flushes/syncs) the file, and then in a single transaction, make some range backup progress + // durable. This means: // - increment the backup config's range bytes written // - update the range file map // - update the task begin key @@ -1186,8 +1665,8 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { .detail("EndKey", Params.endKey().get(task).printable()) .detail("TaskKey", task->key.printable()); - // When a key range task saves the last chunk of progress and then the executor dies, when the task continues - // its beginKey and endKey will be equal but there is no work to be done. + // When a key range task saves the last chunk of progress and then the executor dies, when the task + // continues its beginKey and endKey will be equal but there is no work to be done. if (beginKey == endKey) return Void(); @@ -1200,8 +1679,8 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { } // Read everything from beginKey to endKey, write it to an output file, run the output file processor, and - // then set on_done. If we are still writing after X seconds, end the output file and insert a new backup_range - // task for the remainder. + // then set on_done. If we are still writing after X seconds, end the output file and insert a new + // backup_range task for the remainder. state Reference outFile; state Version outVersion = invalidVersion; state Key lastKey; @@ -1216,11 +1695,13 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { Terminator::True, AccessSystemKeys::True, LockAware::True); - state RangeFileWriter rangeFile; + state std::unique_ptr rangeFile; state BackupConfig backup(task); + state Arena arena; + state Reference> tenantCache = makeReference>(cx); - // Don't need to check keepRunning(task) here because we will do that while finishing each output file, but if - // bc is false then clearly the backup is no longer in progress + // Don't need to check keepRunning(task) here because we will do that while finishing each output file, but + // if bc is false then clearly the backup is no longer in progress state Reference bc = wait(backup.backupContainer().getD(cx.getReference())); if (!bc) { return Void(); @@ -1228,6 +1709,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { state bool done = false; state int64_t nrKeys = 0; + state bool encryptionEnabled = false; loop { state RangeResultWithVersion values; @@ -1242,17 +1724,20 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { throw; } - // If we've seen a new read version OR hit the end of the stream, then if we were writing a file finish it. + // If we've seen a new read version OR hit the end of the stream, then if we were writing a file finish + // it. 
if (values.second != outVersion || done) { if (outFile) { - TEST(outVersion != invalidVersion); // Backup range task wrote multiple versions + CODE_PROBE(outVersion != invalidVersion, "Backup range task wrote multiple versions"); state Key nextKey = done ? endKey : keyAfter(lastKey); - wait(rangeFile.writeKey(nextKey)); + wait(rangeFile->writeKey(nextKey)); if (BUGGIFY) { - wait(rangeFile.padEnd()); + wait(rangeFile->padEnd(true)); } + wait(rangeFile->finish()); + bool usedFile = wait( finishRangeFile(outFile, cx, task, taskBucket, KeyRangeRef(beginKey, nextKey), outVersion)); TraceEvent("FileBackupWroteRangeFile") @@ -1275,8 +1760,8 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { // Start writing a new file after verifying this task should keep running as of a new read version // (which must be >= outVersion) outVersion = values.second; - // block size must be at least large enough for 3 max size keys and 2 max size values + overhead so 250k - // conservatively. + // block size must be at least large enough for 3 max size keys and 2 max size values + overhead so + // 250k conservatively. state int blockSize = BUGGIFY ? deterministicRandom()->randomInt(250e3, 4e6) : CLIENT_KNOBS->BACKUP_RANGEFILE_BLOCK_SIZE; state Version snapshotBeginVersion; @@ -1290,6 +1775,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { wait(taskBucket->keepRunning(tr, task) && storeOrThrow(snapshotBeginVersion, backup.snapshotBeginVersion().get(tr)) && + storeOrThrow(encryptionEnabled, backup.enableSnapshotBackupEncryption().get(tr)) && store(snapshotRangeFileCount, backup.snapshotRangeFileCount().getD(tr))); break; @@ -1302,16 +1788,22 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { wait(bc->writeRangeFile(snapshotBeginVersion, snapshotRangeFileCount, outVersion, blockSize)); outFile = f; + encryptionEnabled = encryptionEnabled && cx->clientInfo->get().isEncryptionEnabled; // Initialize range file writer and write begin key - rangeFile = RangeFileWriter(outFile, blockSize); - wait(rangeFile.writeKey(beginKey)); + if (encryptionEnabled) { + CODE_PROBE(true, "using encrypted snapshot file writer"); + rangeFile = std::make_unique(cx, &arena, tenantCache, outFile, blockSize); + } else { + rangeFile = std::make_unique(outFile, blockSize); + } + wait(rangeFile->writeKey(beginKey)); } // write kvData to file, update lastKey and key count if (values.first.size() != 0) { state size_t i = 0; for (; i < values.first.size(); ++i) { - wait(rangeFile.writeKV(values.first[i].key, values.first[i].value)); + wait(rangeFile->writeKV(values.first[i].key, values.first[i].value)); } lastKey = values.first.back().key; nrKeys += values.first.size(); @@ -1376,7 +1868,6 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { Reference futureBucket, Reference task) { state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); - if (Params.addBackupRangeTasks().get(task)) { wait(startBackupRangeInternal(tr, taskBucket, futureBucket, task, taskFuture)); } else { @@ -1395,7 +1886,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase { return Void(); } }; -StringRef BackupRangeTaskFunc::name = LiteralStringRef("file_backup_write_range_5.2"); +StringRef BackupRangeTaskFunc::name = "file_backup_write_range_5.2"_sr; REGISTER_TASKFUNC(BackupRangeTaskFunc); struct BackupSnapshotDispatchTask : BackupTaskFuncBase { @@ -1404,11 +1895,11 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { static struct { // Set by Execute, used by Finish - static TaskParam shardsBehind() { return 
LiteralStringRef(__FUNCTION__); } + static TaskParam shardsBehind() { return __FUNCTION__sr; } // Set by Execute, used by Finish - static TaskParam snapshotFinished() { return LiteralStringRef(__FUNCTION__); } + static TaskParam snapshotFinished() { return __FUNCTION__sr; } // Set by Execute, used by Finish - static TaskParam nextDispatchVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam nextDispatchVersion() { return __FUNCTION__sr; } } Params; StringRef getName() const override { return name; }; @@ -1461,12 +1952,12 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { state double startTime = timer(); state Reference tr(new ReadYourWritesTransaction(cx)); - // The shard map will use 3 values classes. Exactly SKIP, exactly DONE, then any number >= NOT_DONE_MIN which - // will mean not done. This is to enable an efficient coalesce() call to squash adjacent ranges which are not - // yet finished to enable efficiently finding random database shards which are not done. + // The shard map will use 3 values classes. Exactly SKIP, exactly DONE, then any number >= NOT_DONE_MIN + // which will mean not done. This is to enable an efficient coalesce() call to squash adjacent ranges which + // are not yet finished to enable efficiently finding random database shards which are not done. state int notDoneSequence = NOT_DONE_MIN; - state KeyRangeMap shardMap(notDoneSequence++, normalKeys.end); - state Key beginKey = normalKeys.begin; + state KeyRangeMap shardMap(notDoneSequence++); + state Key beginKey = allKeys.begin; // Read all shard boundaries and add them to the map loop { @@ -1475,7 +1966,7 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { tr->setOption(FDBTransactionOptions::LOCK_AWARE); state Future>> shardBoundaries = - getBlockOfShards(tr, beginKey, normalKeys.end, CLIENT_KNOBS->TOO_MANY); + getBlockOfShards(tr, beginKey, allKeys.end, CLIENT_KNOBS->TOO_MANY); wait(success(shardBoundaries) && taskBucket->keepRunning(tr, task)); if (shardBoundaries.get().size() == 0) @@ -1520,7 +2011,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { store(latestSnapshotEndVersion, config.latestSnapshotEndVersion().get(tr)) && store(recentReadVersion, tr->getReadVersion()) && taskBucket->keepRunning(tr, task)); - // If the snapshot batch future key does not exist, this is the first execution of this dispatch task so + // If the snapshot batch future key does not exist, this is the first execution of this dispatch + // task so // - create and set the snapshot batch future key // - initialize the batch size to 0 // - initialize the target snapshot end version if it is not yet set @@ -1532,7 +2024,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { config.snapshotBatchSize().set(tr, snapshotBatchSize.get()); // The dispatch of this batch can take multiple separate executions if the executor fails - // so store a completion key for the dispatch finish() to set when dispatching the batch is done. + // so store a completion key for the dispatch finish() to set when dispatching the batch is + // done. state TaskCompletionKey dispatchCompletionKey = TaskCompletionKey::joinWith(snapshotBatchFuture); // this is a bad hack - but flow doesn't work well with lambda functions and caputring // state variables... 
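The backup range task above drives either writer through the shared IRangeFileWriter sequence: writeKey() for the queried range begin, writeKV() for every pair read at one version, writeKey() for the range end (or keyAfter of the last key written), and finally finish(). Below is a rough, standalone model of that call pattern and of the runtime choice between the plain and encrypted writers, assuming nothing beyond the standard library; none of the names are FoundationDB APIs.

// Illustrative only: models the IRangeFileWriter call sequence and the
// encrypted-vs-plain writer selection shown in the range task above.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct KV {
    std::string key, value;
};

struct RangeWriter { // stand-in for IRangeFileWriter
    virtual void writeKey(const std::string& k) = 0;
    virtual void writeKV(const std::string& k, const std::string& v) = 0;
    virtual void finish() = 0;
    virtual ~RangeWriter() = default;
};

struct PlainRangeWriter : RangeWriter { // stand-in for RangeFileWriter
    void writeKey(const std::string& k) override { std::cout << "key " << k << "\n"; }
    void writeKV(const std::string& k, const std::string& v) override { std::cout << k << "=" << v << "\n"; }
    void finish() override {} // nothing left buffered in this toy version
};

struct EncryptedRangeWriterModel : RangeWriter { // stand-in for EncryptedRangeFileWriter
    void writeKey(const std::string& k) override { std::cout << "enc key " << k << "\n"; }
    void writeKV(const std::string& k, const std::string& v) override { std::cout << "enc " << k << "=" << v << "\n"; }
    void finish() override { std::cout << "flush final encrypted block\n"; } // encrypt and append the tail block
};

void writeRangeFile(bool encryptionEnabled,
                    const std::string& begin,
                    const std::string& end,
                    const std::vector<KV>& kvs) {
    std::unique_ptr<RangeWriter> w;
    if (encryptionEnabled)
        w = std::make_unique<EncryptedRangeWriterModel>();
    else
        w = std::make_unique<PlainRangeWriter>();
    w->writeKey(begin); // queried range begin
    for (const auto& kv : kvs)
        w->writeKV(kv.key, kv.value); // each pair read from the snapshot
    w->writeKey(end); // queried range end
    w->finish(); // flush whatever is still buffered
}

int main() {
    writeRangeFile(true, "a", "z", { { "c", "1" }, { "d", "2" } });
}

The real writers also track block boundaries, headers, and padding, which this toy version deliberately omits.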
@@ -1558,24 +2051,28 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { // Read all dispatched ranges state std::vector> dispatchBoundaries; tr->reset(); - beginKey = normalKeys.begin; + beginKey = allKeys.begin; loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - state Future>> bounds = config.snapshotRangeDispatchMap().getRange( - tr, beginKey, keyAfter(normalKeys.end), CLIENT_KNOBS->TOO_MANY); + state Future bounds = + config.snapshotRangeDispatchMap().getRange( + tr, beginKey, keyAfter(allKeys.end), CLIENT_KNOBS->TOO_MANY); wait(success(bounds) && taskBucket->keepRunning(tr, task) && store(recentReadVersion, tr->getReadVersion())); - if (bounds.get().empty()) + if (!bounds.get().results.empty()) { + dispatchBoundaries.reserve(dispatchBoundaries.size() + bounds.get().results.size()); + dispatchBoundaries.insert( + dispatchBoundaries.end(), bounds.get().results.begin(), bounds.get().results.end()); + } + + if (!bounds.get().more) break; - dispatchBoundaries.reserve(dispatchBoundaries.size() + bounds.get().size()); - dispatchBoundaries.insert(dispatchBoundaries.end(), bounds.get().begin(), bounds.get().end()); - - beginKey = keyAfter(bounds.get().back().first); + beginKey = keyAfter(bounds.get().results.back().first); tr->reset(); } catch (Error& e) { wait(tr->onError(e)); @@ -1603,8 +2100,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { // If this was the end of a dispatched range if (!boundary.second) { - // Ensure that the dispatched boundaries exist AND set all shard ranges in the dispatched range to - // DONE. + // Ensure that the dispatched boundaries exist AND set all shard ranges in the dispatched range + // to DONE. RangeMap::Ranges shardRanges = shardMap.modify(KeyRangeRef(lastKey, boundary.first)); iShard = shardRanges.begin(); @@ -1625,7 +2122,7 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { // Set anything outside the backup ranges to SKIP. We can use insert() here instead of modify() // because it's OK to delete shard boundaries in the skipped ranges. if (backupRanges.size() > 0) { - shardMap.insert(KeyRangeRef(normalKeys.begin, backupRanges.front().begin), SKIP); + shardMap.insert(KeyRangeRef(allKeys.begin, backupRanges.front().begin), SKIP); wait(yield()); for (i = 0; i < backupRanges.size() - 1; ++i) { @@ -1633,7 +2130,7 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { wait(yield()); } - shardMap.insert(KeyRangeRef(backupRanges.back().end, normalKeys.end), SKIP); + shardMap.insert(KeyRangeRef(backupRanges.back().end, allKeys.end), SKIP); wait(yield()); } @@ -1654,7 +2151,7 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { } // Coalesce the shard map to make random selection below more efficient. - shardMap.coalesce(normalKeys); + shardMap.coalesce(allKeys); wait(yield()); // In this context "all" refers to all of the shards relevant for this particular backup @@ -1686,10 +2183,10 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { nextDispatchVersion = recentReadVersion + CLIENT_KNOBS->CORE_VERSIONSPERSECOND * CLIENT_KNOBS->BACKUP_SNAPSHOT_DISPATCH_INTERVAL_SEC; - // If nextDispatchVersion is greater than snapshotTargetEndVersion (which could be in the past) then just use - // the greater of recentReadVersion or snapshotTargetEndVersion. Any range tasks created in this dispatch will - // be scheduled at a random time between recentReadVersion and nextDispatchVersion, - // so nextDispatchVersion shouldn't be less than recentReadVersion. 
+ // If nextDispatchVersion is greater than snapshotTargetEndVersion (which could be in the past) then just + // use the greater of recentReadVersion or snapshotTargetEndVersion. Any range tasks created in this + // dispatch will be scheduled at a random time between recentReadVersion and nextDispatchVersion, so + // nextDispatchVersion shouldn't be less than recentReadVersion. if (nextDispatchVersion > snapshotTargetEndVersion) nextDispatchVersion = std::max(recentReadVersion, snapshotTargetEndVersion); @@ -1709,12 +2206,12 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { state int countShardsToDispatch = std::max(0, countExpectedShardsDone - countShardsDone); // Calculate the number of shards that would have been dispatched by a normal (on-schedule) - // BackupSnapshotDispatchTask given the dispatch window and the start and expected-end versions of the current - // snapshot. + // BackupSnapshotDispatchTask given the dispatch window and the start and expected-end versions of the + // current snapshot. int64_t dispatchWindow = nextDispatchVersion - recentReadVersion; - // If the scheduled snapshot interval is 0 (such as for initial, as-fast-as-possible snapshot) then all shards - // are considered late + // If the scheduled snapshot interval is 0 (such as for initial, as-fast-as-possible snapshot) then all + // shards are considered late int countShardsExpectedPerNormalWindow; if (snapshotScheduledVersionInterval == 0) { countShardsExpectedPerNormalWindow = 0; @@ -1725,8 +2222,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { (double(dispatchWindow) / snapshotScheduledVersionInterval) * countAllShards; } - // The number of shards 'behind' the snapshot is the count of how may additional shards beyond normal are being - // dispatched, if any. + // The number of shards 'behind' the snapshot is the count of how may additional shards beyond normal are + // being dispatched, if any. int countShardsBehind = std::max(0, countShardsToDispatch + snapshotBatchSize.get() - countShardsExpectedPerNormalWindow); Params.shardsBehind().set(task, countShardsBehind); @@ -1804,8 +2301,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { wait(store(snapshotBatchSize.get(), config.snapshotBatchSize().getOrThrow(tr)) && waitForAll(beginReads) && waitForAll(endReads) && taskBucket->keepRunning(tr, task)); - // Snapshot batch size should be either oldBatchSize or newBatchSize. If new, this transaction is - // already done. + // Snapshot batch size should be either oldBatchSize or newBatchSize. If new, this transaction + // is already done. if (snapshotBatchSize.get() == newBatchSize) { break; } else { @@ -1843,8 +2340,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { } Version scheduledVersion = invalidVersion; - // If the next dispatch version is in the future, choose a random version at which to start - // the new task. + // If the next dispatch version is in the future, choose a random version at which to + // start the new task. if (nextDispatchVersion > recentReadVersion) scheduledVersion = recentReadVersion + deterministicRandom()->random01() * (nextDispatchVersion - recentReadVersion); @@ -1870,8 +2367,8 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { .detail("BeginKey", range.begin.printable()) .detail("EndKey", range.end.printable()); } else { - // This shouldn't happen because if the transaction was already done or if another execution - // of this task is making progress it should have been detected above. 
+ // This shouldn't happen because if the transaction was already done or if another + // execution of this task is making progress it should have been detected above. ASSERT(false); } } @@ -1903,9 +2400,9 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { } // This function is just a wrapper for BackupSnapshotManifest::addTask() which is defined below. - // The BackupSnapshotDispatchTask and BackupSnapshotManifest tasks reference each other so in order to keep their - // execute and finish phases defined together inside their class definitions this wrapper is declared here but - // defined after BackupSnapshotManifest is defined. + // The BackupSnapshotDispatchTask and BackupSnapshotManifest tasks reference each other so in order to keep + // their execute and finish phases defined together inside their class definitions this wrapper is declared here + // but defined after BackupSnapshotManifest is defined. static Future addSnapshotManifestTask(Reference tr, Reference taskBucket, Reference parentTask, @@ -1938,9 +2435,9 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { state Reference snapshotFinishedFuture = task->getDoneFuture(futureBucket); - // If the snapshot is finished, the next task is to write a snapshot manifest, otherwise it's another snapshot - // dispatch task. In either case, the task should wait for snapshotBatchFuture. The snapshot done key, passed to - // the current task, is also passed on. + // If the snapshot is finished, the next task is to write a snapshot manifest, otherwise it's another + // snapshot dispatch task. In either case, the task should wait for snapshotBatchFuture. The snapshot done + // key, passed to the current task, is also passed on. if (Params.snapshotFinished().getOrDefault(task, false)) { wait(success(addSnapshotManifestTask( tr, taskBucket, task, TaskCompletionKey::signal(snapshotFinishedFuture), snapshotBatchFuture))); @@ -1962,7 +2459,7 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { return Void(); } }; -StringRef BackupSnapshotDispatchTask::name = LiteralStringRef("file_backup_dispatch_ranges_5.2"); +StringRef BackupSnapshotDispatchTask::name = "file_backup_dispatch_ranges_5.2"_sr; REGISTER_TASKFUNC(BackupSnapshotDispatchTask); struct BackupLogRangeTaskFunc : BackupTaskFuncBase { @@ -1970,10 +2467,10 @@ struct BackupLogRangeTaskFunc : BackupTaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam addBackupLogRangeTasks() { return LiteralStringRef(__FUNCTION__); } - static TaskParam fileSize() { return LiteralStringRef(__FUNCTION__); } - static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } - static TaskParam endVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam addBackupLogRangeTasks() { return __FUNCTION__sr; } + static TaskParam fileSize() { return __FUNCTION__sr; } + static TaskParam beginVersion() { return __FUNCTION__sr; } + static TaskParam endVersion() { return __FUNCTION__sr; } } Params; StringRef getName() const override { return name; }; @@ -2034,10 +2531,10 @@ struct BackupLogRangeTaskFunc : BackupTaskFuncBase { Key destUidValue = wait(config.destUidValue().getOrThrow(tr)); // Get the set of key ranges that hold mutations for (beginVersion, endVersion). 
They will be queried in - // parallel below and there is a limit on how many we want to process in a single BackupLogRangeTask so if that - // limit is exceeded then set the addBackupLogRangeTasks boolean in Params and stop, signalling the finish() - // step to break up the (beginVersion, endVersion) range into smaller intervals which are then processed by - // individual BackupLogRangeTasks. + // parallel below and there is a limit on how many we want to process in a single BackupLogRangeTask so if + // that limit is exceeded then set the addBackupLogRangeTasks boolean in Params and stop, signalling the + // finish() step to break up the (beginVersion, endVersion) range into smaller intervals which are then + // processed by individual BackupLogRangeTasks. state Standalone> ranges = getLogRanges(beginVersion, endVersion, destUidValue); if (ranges.size() > CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES) { Params.addBackupLogRangeTasks().set(task, true); @@ -2051,9 +2548,9 @@ struct BackupLogRangeTaskFunc : BackupTaskFuncBase { state Reference outFile = wait(bc->writeLogFile(beginVersion, endVersion, blockSize)); state LogFileWriter logFile(outFile, blockSize); - // Query all key ranges covering (beginVersion, endVersion) in parallel, writing their results to the results - // promise stream as they are received. Note that this means the records read from the results stream are not - // likely to be in increasing Version order. + // Query all key ranges covering (beginVersion, endVersion) in parallel, writing their results to the + // results promise stream as they are received. Note that this means the records read from the results + // stream are not likely to be in increasing Version order. state PromiseStream results; state std::vector> rc; @@ -2062,7 +2559,7 @@ struct BackupLogRangeTaskFunc : BackupTaskFuncBase { readCommitted(cx, results, lock, range, Terminator::False, AccessSystemKeys::True, LockAware::True)); } - state Future sendEOS = map(errorOr(waitForAll(rc)), [=](ErrorOr const& result) { + state Future sendEOS = map(errorOr(waitForAll(rc)), [=](ErrorOr const& result) mutable { if (result.isError()) results.sendError(result.getError()); else @@ -2201,7 +2698,7 @@ struct BackupLogRangeTaskFunc : BackupTaskFuncBase { } }; -StringRef BackupLogRangeTaskFunc::name = LiteralStringRef("file_backup_write_logs_5.2"); +StringRef BackupLogRangeTaskFunc::name = "file_backup_write_logs_5.2"_sr; REGISTER_TASKFUNC(BackupLogRangeTaskFunc); // This task stopped being used in 6.2, however the code remains here to handle upgrades. 
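An easy-to-miss change in the hunk above is the `mutable` added to the sendEOS lambda. In C++, objects captured by value are const inside a non-mutable lambda body, so calling a non-const member on the captured copy does not compile; assuming send()/sendError() are non-const on the by-value-captured PromiseStream in this build, `mutable` is what makes that call legal. A minimal standard-C++ illustration with a toy Stream type (not the flow type):

    #include <iostream>

    struct Stream {
        int sent = 0;
        void send(int v) { sent += v; } // non-const member, like a hypothetical non-const send()
    };

    int main() {
        Stream results;
        // auto bad  = [=] { results.send(1); };        // error: the captured copy is const here
        auto good = [=]() mutable { results.send(1); }; // mutable allows modifying the captured copy
        good();
        std::cout << results.sent << "\n"; // prints 0: the lambda modified its own copy, not the original
    }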
@@ -2211,9 +2708,9 @@ struct EraseLogRangeTaskFunc : BackupTaskFuncBase { StringRef getName() const override { return name; }; static struct { - static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } - static TaskParam endVersion() { return LiteralStringRef(__FUNCTION__); } - static TaskParam destUidValue() { return LiteralStringRef(__FUNCTION__); } + static TaskParam beginVersion() { return __FUNCTION__sr; } + static TaskParam endVersion() { return __FUNCTION__sr; } + static TaskParam destUidValue() { return __FUNCTION__sr; } } Params; ACTOR static Future addTask(Reference tr, @@ -2232,7 +2729,8 @@ struct EraseLogRangeTaskFunc : BackupTaskFuncBase { BackupConfig(logUid), waitFor, [=](Reference task) { - Params.beginVersion().set(task, 1); // FIXME: remove in 6.X, only needed for 5.2 backward compatibility + Params.beginVersion().set(task, + 1); // FIXME: remove in 6.X, only needed for 5.2 backward compatibility Params.endVersion().set(task, endVersion); Params.destUidValue().set(task, destUidValue); }, @@ -2276,7 +2774,7 @@ struct EraseLogRangeTaskFunc : BackupTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef EraseLogRangeTaskFunc::name = LiteralStringRef("file_backup_erase_logs_5.2"); +StringRef EraseLogRangeTaskFunc::name = "file_backup_erase_logs_5.2"_sr; REGISTER_TASKFUNC(EraseLogRangeTaskFunc); struct BackupLogsDispatchTask : BackupTaskFuncBase { @@ -2284,8 +2782,8 @@ struct BackupLogsDispatchTask : BackupTaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam prevBeginVersion() { return LiteralStringRef(__FUNCTION__); } - static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam prevBeginVersion() { return __FUNCTION__sr; } + static TaskParam beginVersion() { return __FUNCTION__sr; } } Params; ACTOR static Future _finish(Reference tr, @@ -2354,8 +2852,8 @@ struct BackupLogsDispatchTask : BackupTaskFuncBase { state int priority = latestSnapshotEndVersion.present() ? 
1 : 0; if (!partitionedLog.present() || !partitionedLog.get()) { - // Add the initial log range task to read/copy the mutations and the next logs dispatch task which will run - // after this batch is done + // Add the initial log range task to read/copy the mutations and the next logs dispatch task which will + // run after this batch is done wait(success(BackupLogRangeTaskFunc::addTask(tr, taskBucket, task, @@ -2446,7 +2944,7 @@ struct BackupLogsDispatchTask : BackupTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef BackupLogsDispatchTask::name = LiteralStringRef("file_backup_dispatch_logs_5.2"); +StringRef BackupLogsDispatchTask::name = "file_backup_dispatch_logs_5.2"_sr; REGISTER_TASKFUNC(BackupLogsDispatchTask); struct FileBackupFinishedTask : BackupTaskFuncBase { @@ -2506,14 +3004,14 @@ struct FileBackupFinishedTask : BackupTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef FileBackupFinishedTask::name = LiteralStringRef("file_backup_finished_5.2"); +StringRef FileBackupFinishedTask::name = "file_backup_finished_5.2"_sr; REGISTER_TASKFUNC(FileBackupFinishedTask); struct BackupSnapshotManifest : BackupTaskFuncBase { static StringRef name; static constexpr uint32_t version = 1; static struct { - static TaskParam endVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam endVersion() { return __FUNCTION__sr; } } Params; ACTOR static Future _execute(Database cx, @@ -2525,8 +3023,8 @@ struct BackupSnapshotManifest : BackupTaskFuncBase { state Reference tr(new ReadYourWritesTransaction(cx)); - // Read the entire range file map into memory, then walk it backwards from its last entry to produce a list of - // non overlapping key range files + // Read the entire range file map into memory, then walk it backwards from its last entry to produce a list + // of non overlapping key range files state std::map localmap; state Key startKey; state int batchSize = BUGGIFY ? 1 : 1000000; @@ -2543,17 +3041,17 @@ struct BackupSnapshotManifest : BackupTaskFuncBase { wait(store(bc, config.backupContainer().getOrThrow(tr))); } - BackupConfig::RangeFileMapT::PairsType rangeresults = + BackupConfig::RangeFileMapT::RangeResultType rangeresults = wait(config.snapshotRangeFileMap().getRange(tr, startKey, {}, batchSize)); - for (auto& p : rangeresults) { + for (auto& p : rangeresults.results) { localmap.insert(p); } - if (rangeresults.size() < batchSize) + if (!rangeresults.more) break; - startKey = keyAfter(rangeresults.back().first); + startKey = keyAfter(rangeresults.results.back().first); tr->reset(); } catch (Error& e) { wait(tr->onError(e)); @@ -2590,10 +3088,11 @@ struct BackupSnapshotManifest : BackupTaskFuncBase { totalBytes += r.fileSize; // Jump to file that either ends where this file begins or has the greatest end that is less than - // the begin of this file. In other words find the map key that is <= begin of this file. To do this - // find the first end strictly greater than begin and then back up one. + // the begin of this file. In other words find the map key that is <= begin of this file. To do + // this find the first end strictly greater than begin and then back up one. 
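The comment just above describes the classic std::map idiom for finding the greatest key less than or equal to a probe: take upper_bound (the first key strictly greater) and step back one. A self-contained sketch of exactly that step; the keys and values are arbitrary examples, not the manifest's real contents:

    #include <cassert>
    #include <map>
    #include <string>

    int main() {
        // Map of file end keys to some payload, as in the snapshot manifest walk.
        std::map<std::string, int> byEndKey{ { "b", 1 }, { "d", 2 }, { "f", 3 } };
        std::string begin = "e";
        auto it = byEndKey.upper_bound(begin); // first key strictly greater than "e" -> "f"
        assert(it != byEndKey.begin());        // the real loop breaks out when this would underflow
        --it;                                  // back up one: greatest key <= "e" -> "d"
        assert(it->first == "d");
        return 0;
    }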
i = localmap.upper_bound(i->second.begin); - // If we get begin then we're done, there are no more ranges that end at or before the last file's begin + // If we get begin then we're done, there are no more ranges that end at or before the last file's + // begin if (i == localmap.begin()) break; --i; @@ -2695,7 +3194,7 @@ struct BackupSnapshotManifest : BackupTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef BackupSnapshotManifest::name = LiteralStringRef("file_backup_write_snapshot_manifest_5.2"); +StringRef BackupSnapshotManifest::name = "file_backup_write_snapshot_manifest_5.2"_sr; REGISTER_TASKFUNC(BackupSnapshotManifest); Future BackupSnapshotDispatchTask::addSnapshotManifestTask(Reference tr, @@ -2711,7 +3210,7 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam beginVersion() { return __FUNCTION__sr; } } Params; ACTOR static Future _execute(Database cx, @@ -2841,8 +3340,8 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase { wait(success(BackupLogsDispatchTask::addTask( tr, taskBucket, task, 1, 0, beginVersion, TaskCompletionKey::joinWith(backupFinished)))); - // If a clean stop is requested, the log and snapshot tasks will quit after the backup is restorable, then the - // following task will clean up and set the completed state. + // If a clean stop is requested, the log and snapshot tasks will quit after the backup is restorable, then + // the following task will clean up and set the completed state. wait(success( FileBackupFinishedTask::addTask(tr, taskBucket, task, TaskCompletionKey::noSignal(), backupFinished))); @@ -2881,7 +3380,7 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef StartFullBackupTaskFunc::name = LiteralStringRef("file_backup_start_5.2"); +StringRef StartFullBackupTaskFunc::name = "file_backup_start_5.2"_sr; REGISTER_TASKFUNC(StartFullBackupTaskFunc); struct RestoreCompleteTaskFunc : RestoreTaskFuncBase { @@ -2897,13 +3396,14 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase { // Clear the file map now since it could be huge. restore.fileSet().clear(tr); - // TODO: Validate that the range version map has exactly the restored ranges in it. This means that for any - // restore operation the ranges to restore must be within the backed up ranges, otherwise from the restore - // perspective it will appear that some key ranges were missing and so the backup set is incomplete and the - // restore has failed. This validation cannot be done currently because Restore only supports a single restore - // range but backups can have many ranges. + // TODO: Validate that the range version map has exactly the restored ranges in it. This means that for + // any restore operation the ranges to restore must be within the backed up ranges, otherwise from the + // restore perspective it will appear that some key ranges were missing and so the backup set is incomplete + // and the restore has failed. This validation cannot be done currently because Restore only supports a + // single restore range but backups can have many ranges. - // Clear the applyMutations stuff, including any unapplied mutations from versions beyond the restored version. + // Clear the applyMutations stuff, including any unapplied mutations from versions beyond the restored + // version. 
restore.clearApplyMutationsKeys(tr); wait(taskBucket->finish(tr, task)); @@ -2928,7 +3428,7 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase { } wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } static StringRef name; @@ -2948,14 +3448,14 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef RestoreCompleteTaskFunc::name = LiteralStringRef("restore_complete"); +StringRef RestoreCompleteTaskFunc::name = "restore_complete"_sr; REGISTER_TASKFUNC(RestoreCompleteTaskFunc); struct RestoreFileTaskFuncBase : RestoreTaskFuncBase { struct InputParams { - static TaskParam inputFile() { return LiteralStringRef(__FUNCTION__); } - static TaskParam readOffset() { return LiteralStringRef(__FUNCTION__); } - static TaskParam readLen() { return LiteralStringRef(__FUNCTION__); } + static TaskParam inputFile() { return __FUNCTION__sr; } + static TaskParam readOffset() { return __FUNCTION__sr; } + static TaskParam readLen() { return __FUNCTION__sr; } } Params; std::string toString(Reference task) const override { @@ -2970,8 +3470,8 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { static struct : InputParams { // The range of data that the (possibly empty) data represented, which is set if it intersects the target // restore range - static TaskParam originalFileRange() { return LiteralStringRef(__FUNCTION__); } - static TaskParam> originalFileRanges() { return LiteralStringRef(__FUNCTION__); } + static TaskParam originalFileRange() { return __FUNCTION__sr; } + static TaskParam> originalFileRanges() { return __FUNCTION__sr; } static std::vector getOriginalFileRanges(Reference task) { if (originalFileRanges().exists(task)) { @@ -3040,7 +3540,8 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { } state Reference inFile = wait(bc.get()->readFile(rangeFile.fileName)); - state Standalone> blockData = wait(decodeRangeFileBlock(inFile, readOffset, readLen)); + state Standalone> blockData = + wait(decodeRangeFileBlock(inFile, readOffset, readLen, cx)); // First and last key are the range for this file state KeyRange fileRange = KeyRangeRef(blockData.front().key, blockData.back().key); @@ -3067,16 +3568,16 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { state VectorRef data = blockData.slice(rangeStart, rangeEnd); // Shrink file range to be entirely within restoreRange and translate it to the new prefix - // First, use the untranslated file range to create the shrunk original file range which must be used in the - // kv range version map for applying mutations + // First, use the untranslated file range to create the shrunk original file range which must be used in + // the kv range version map for applying mutations state KeyRange originalFileRange = KeyRangeRef(std::max(fileRange.begin, restoreRange.begin), std::min(fileRange.end, restoreRange.end)); originalFileRanges.push_back(originalFileRange); // Now shrink and translate fileRange Key fileEnd = std::min(fileRange.end, restoreRange.end); - if (fileEnd == (removePrefix.get() == StringRef() ? normalKeys.end : strinc(removePrefix.get()))) { - fileEnd = addPrefix.get() == StringRef() ? normalKeys.end : strinc(addPrefix.get()); + if (fileEnd == (removePrefix.get() == StringRef() ? allKeys.end : strinc(removePrefix.get()))) { + fileEnd = addPrefix.get() == StringRef() ? 
allKeys.end : strinc(addPrefix.get()); } else { fileEnd = fileEnd.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()); } @@ -3114,7 +3615,6 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { : data[start].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()), (iend == end) ? fileRange.end : data[iend].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get())); - tr->clear(trRange); for (; i < iend; ++i) { @@ -3220,7 +3720,7 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { } wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } static StringRef name; @@ -3240,7 +3740,7 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef RestoreRangeTaskFunc::name = LiteralStringRef("restore_range_data"); +StringRef RestoreRangeTaskFunc::name = "restore_range_data"_sr; REGISTER_TASKFUNC(RestoreRangeTaskFunc); // Decodes a mutation log key, which contains (hash, commitVersion, chunkNumber) and @@ -3335,6 +3835,14 @@ bool AccumulatedMutations::matchesAnyRange(const std::vector& ranges) std::vector mutations = decodeMutationLogValue(serializedMutations); for (auto& m : mutations) { for (auto& r : ranges) { + if (m.type == MutationRef::Encrypted) { + // TODO: In order to filter out encrypted mutations that are not relevant to the + // target range, they would have to be decrypted here in order to check relevance + // below, however the staged mutations would still need to remain encrypted for + // staging into the destination database. Without decrypting, we must assume that + // some data could match the range and return true here. + return true; + } if (m.type == MutationRef::ClearRange) { if (r.intersects(KeyRangeRef(m.param1, m.param2))) { return true; @@ -3430,8 +3938,8 @@ struct RestoreLogDataTaskFunc : RestoreFileTaskFuncBase { state Standalone> dataOriginal = wait(decodeMutationLogFileBlock(inFile, readOffset, readLen)); - // Filter the KV pairs extracted from the log file block to remove any records known to not be needed for this - // restore based on the restore range set. + // Filter the KV pairs extracted from the log file block to remove any records known to not be needed for + // this restore based on the restore range set. state std::vector dataFiltered = filterLogMutationKVPairs(dataOriginal, ranges); state int start = 0; @@ -3505,8 +4013,8 @@ struct RestoreLogDataTaskFunc : RestoreFileTaskFuncBase { state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); - // TODO: Check to see if there is a leak in the FutureBucket since an invalid task (validation key fails) will - // never set its taskFuture. + // TODO: Check to see if there is a leak in the FutureBucket since an invalid task (validation key fails) + // will never set its taskFuture. 
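The new MutationRef::Encrypted branch added above takes the conservative path: an encrypted mutation cannot be inspected here without decrypting it, so it is assumed to be relevant to the restore ranges rather than risk dropping data. A toy model of that rule in plain C++; ToyMutation and ToyRange are illustrative stand-ins, not FDB types:

    #include <cassert>
    #include <string>
    #include <vector>

    struct ToyRange { std::string begin, end; };
    struct ToyMutation { bool encrypted; std::string begin, end; };

    bool intersects(const ToyRange& r, const ToyMutation& m) {
        return m.begin < r.end && r.begin < m.end;
    }

    // Mirrors the conservative rule: an opaque (encrypted) mutation is treated as
    // matching, because proving it irrelevant would require decrypting it here.
    bool matchesAnyRange(const std::vector<ToyMutation>& ms, const std::vector<ToyRange>& ranges) {
        for (const auto& m : ms) {
            if (m.encrypted)
                return true;
            for (const auto& r : ranges)
                if (intersects(r, m))
                    return true;
        }
        return false;
    }

    int main() {
        std::vector<ToyRange> ranges{ { "a", "b" } };
        assert(!matchesAnyRange({ { false, "c", "d" } }, ranges)); // visible and disjoint: filtered out
        assert(matchesAnyRange({ { true, "", "" } }, ranges));     // encrypted: kept conservatively
    }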
wait(taskFuture->set(tr, taskBucket) && taskBucket->finish(tr, task)); return Void(); @@ -3534,7 +4042,7 @@ struct RestoreLogDataTaskFunc : RestoreFileTaskFuncBase { } wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } Future execute(Database cx, @@ -3550,7 +4058,7 @@ struct RestoreLogDataTaskFunc : RestoreFileTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef RestoreLogDataTaskFunc::name = LiteralStringRef("restore_log_data"); +StringRef RestoreLogDataTaskFunc::name = "restore_log_data"_sr; REGISTER_TASKFUNC(RestoreLogDataTaskFunc); struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { @@ -3559,11 +4067,11 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { StringRef getName() const override { return name; }; static struct { - static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } - static TaskParam beginFile() { return LiteralStringRef(__FUNCTION__); } - static TaskParam beginBlock() { return LiteralStringRef(__FUNCTION__); } - static TaskParam batchSize() { return LiteralStringRef(__FUNCTION__); } - static TaskParam remainingInBatch() { return LiteralStringRef(__FUNCTION__); } + static TaskParam beginVersion() { return __FUNCTION__sr; } + static TaskParam beginFile() { return __FUNCTION__sr; } + static TaskParam beginBlock() { return __FUNCTION__sr; } + static TaskParam batchSize() { return __FUNCTION__sr; } + static TaskParam remainingInBatch() { return __FUNCTION__sr; } } Params; ACTOR static Future _finish(Reference tr, @@ -3614,10 +4122,10 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { } state std::string beginFile = Params.beginFile().getOrDefault(task); - // Get a batch of files. We're targeting batchSize blocks being dispatched so query for batchSize files (each - // of which is 0 or more blocks). + // Get a batch of files. We're targeting batchSize blocks being dispatched so query for batchSize files + // (each of which is 0 or more blocks). state int taskBatchSize = BUGGIFY ? 1 : CLIENT_KNOBS->RESTORE_DISPATCH_ADDTASK_SIZE; - state RestoreConfig::FileSetT::Values files = wait(restore.fileSet().getRange( + state RestoreConfig::FileSetT::RangeResultType files = wait(restore.fileSet().getRange( tr, Optional({ beginVersion, beginFile }), {}, taskBatchSize)); // allPartsDone will be set once all block tasks in the current batch are finished. @@ -3636,9 +4144,9 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { } // If there were no files to load then this batch is done and restore is almost done. - if (files.size() == 0) { - // If adding to existing batch then blocks could be in progress so create a new Dispatch task that waits for - // them to finish + if (files.results.size() == 0) { + // If adding to existing batch then blocks could be in progress so create a new Dispatch task that waits + // for them to finish if (addingToExistingBatch) { // Setting next begin to restoreVersion + 1 so that any files in the file map at the restore version // won't be dispatched again. @@ -3688,8 +4196,8 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { .detail("Decision", "restore_complete") .detail("TaskInstance", THIS_ADDR); } else { - // Applying of mutations is not yet finished so wait a small amount of time and then re-add this same - // task. + // Applying of mutations is not yet finished so wait a small amount of time and then re-add this + // same task. 
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); wait(success(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, beginVersion, "", 0, batchSize))); @@ -3714,17 +4222,17 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { // blocks per Dispatch task and target batchSize total per batch but a batch must end on a complete version // boundary so exceed the limit if necessary to reach the end of a version of files. state std::vector> addTaskFutures; - state Version endVersion = files[0].version; + state Version endVersion = files.results[0].version; state int blocksDispatched = 0; state int64_t beginBlock = Params.beginBlock().getOrDefault(task); state int i = 0; - for (; i < files.size(); ++i) { - RestoreConfig::RestoreFile& f = files[i]; + for (; i < files.results.size(); ++i) { + RestoreConfig::RestoreFile& f = files.results[i]; - // Here we are "between versions" (prior to adding the first block of the first file of a new version) so - // this is an opportunity to end the current dispatch batch (which must end on a version boundary) if the - // batch size has been reached or exceeded + // Here we are "between versions" (prior to adding the first block of the first file of a new version) + // so this is an opportunity to end the current dispatch batch (which must end on a version boundary) if + // the batch size has been reached or exceeded if (f.version != endVersion && remainingInBatch <= 0) { // Next start will be at the first version after endVersion at the first file first block ++endVersion; @@ -3786,13 +4294,13 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { .detail("TaskInstance", THIS_ADDR); } - // If no blocks were dispatched then the next dispatch task should run now and be joined with the allPartsDone - // future + // If no blocks were dispatched then the next dispatch task should run now and be joined with the + // allPartsDone future if (blocksDispatched == 0) { std::string decision; - // If no files were dispatched either then the batch size wasn't large enough to catch all of the files at - // the next lowest non-dispatched version, so increase the batch size. + // If no files were dispatched either then the batch size wasn't large enough to catch all of the files + // at the next lowest non-dispatched version, so increase the batch size. if (i == 0) { batchSize *= 2; decision = "increased_batch_size"; @@ -3835,13 +4343,13 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { restore.filesBlocksDispatched().atomicOp(tr, blocksDispatched, MutationRef::Type::AddValue); // If beginFile is not empty then we had to stop in the middle of a version (possibly within a file) so we - // cannot end the batch here because we do not know if we got all of the files and blocks from the last version - // queued, so make sure remainingInBatch is at least 1. + // cannot end the batch here because we do not know if we got all of the files and blocks from the last + // version queued, so make sure remainingInBatch is at least 1. if (!beginFile.empty()) remainingInBatch = std::max(1, remainingInBatch); - // If more blocks need to be dispatched in this batch then add a follow-on task that is part of the allPartsDone - // group which will won't wait to run and will add more block tasks. + // If more blocks need to be dispatched in this batch then add a follow-on task that is part of the + // allPartsDone group which will won't wait to run and will add more block tasks. 
if (remainingInBatch > 0) addTaskFutures.push_back(RestoreDispatchTaskFunc::addTask(tr, taskBucket, @@ -3918,7 +4426,7 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { } wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } Future execute(Database cx, @@ -3934,7 +4442,7 @@ struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef RestoreDispatchTaskFunc::name = LiteralStringRef("restore_dispatch"); +StringRef RestoreDispatchTaskFunc::name = "restore_dispatch"_sr; REGISTER_TASKFUNC(RestoreDispatchTaskFunc); ACTOR Future restoreStatus(Reference tr, Key tagName) { @@ -4032,7 +4540,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { static constexpr uint32_t version = 1; static struct { - static TaskParam firstVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam firstVersion() { return __FUNCTION__sr; } } Params; // Find all files needed for the restore and save them in the RestoreConfig for the task. @@ -4098,7 +4606,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { .detail("RestoreVersion", restoreVersion) .detail("Dest", destVersion); if (destVersion <= restoreVersion) { - TEST(true); // Forcing restored cluster to higher version + CODE_PROBE(true, "Forcing restored cluster to higher version"); tr->set(minRequiredCommitVersionKey, BinaryWriter::toValue(restoreVersion + 1, Unversioned())); wait(tr->commit()); } else { @@ -4118,7 +4626,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { keyRangesFilter.push_back_deep(keyRangesFilter.arena(), KeyRangeRef(r)); } state Optional restorable = - wait(bc->getRestoreSet(restoreVersion, keyRangesFilter, logsOnly, beginVersion)); + wait(bc->getRestoreSet(restoreVersion, cx, keyRangesFilter, logsOnly, beginVersion)); if (!restorable.present()) throw restore_missing_data(); @@ -4130,8 +4638,8 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { if (!inconsistentSnapshotOnly) { for (const RangeFile& f : restorable.get().ranges) { files.push_back({ f.version, f.fileName, true, f.blockSize, f.fileSize }); - // In a restore with both snapshots and logs, the firstConsistentVersion is the highest version of - // any range file. + // In a restore with both snapshots and logs, the firstConsistentVersion is the highest version + // of any range file. 
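Per the comment above, when a restore uses both snapshot and log files the first consistent version is the highest version of any range file: range files below that version may come from different points in time, and the keyspace only becomes consistent once the logs bring everything up to it. A tiny sketch of that reduction; RangeFileInfo and the -1 invalid-version stand-in are illustrative:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    using Version = int64_t;
    struct RangeFileInfo { Version version; };

    // Highest range-file version in the restore set; the restore is only
    // consistent from this version onward.
    Version firstConsistentVersion(const std::vector<RangeFileInfo>& rangeFiles) {
        Version v = -1; // invalidVersion stand-in
        for (const auto& f : rangeFiles)
            v = std::max(v, f.version);
        return v;
    }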
firstConsistentVersion = std::max(firstConsistentVersion, f.version); } } else { @@ -4250,7 +4758,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { // If this is an incremental restore, we need to set the applyMutationsMapPrefix // to the earliest log version so no mutations are missed Value versionEncoded = BinaryWriter::toValue(Params.firstVersion().get(task), Unversioned()); - wait(krmSetRange(tr, restore.applyMutationsMapPrefix(), normalKeys, versionEncoded)); + wait(krmSetRange(tr, restore.applyMutationsMapPrefix(), allKeys, versionEncoded)); } return Void(); } @@ -4276,7 +4784,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { } wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); + return "OnSetAddTask"_sr; } StringRef getName() const override { return name; }; @@ -4294,7 +4802,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { return _finish(tr, tb, fb, task); }; }; -StringRef StartFullRestoreTaskFunc::name = LiteralStringRef("restore_start"); +StringRef StartFullRestoreTaskFunc::name = "restore_start"_sr; REGISTER_TASKFUNC(StartFullRestoreTaskFunc); } // namespace fileBackup @@ -4380,7 +4888,7 @@ public: .detail("OverrideTargetVersion", targetVersion); } - Optional restoreSet = wait(bc->getRestoreSet(targetVersion)); + Optional restoreSet = wait(bc->getRestoreSet(targetVersion, cx)); if (!restoreSet.present()) { TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible") @@ -4517,6 +5025,7 @@ public: int snapshotIntervalSeconds, std::string tagName, Standalone> backupRanges, + bool encryptionEnabled, StopWhenDone stopWhenDone, UsePartitionedLog partitionedLog, IncrementalBackupOnly incrementalBackupOnly, @@ -4595,24 +5104,28 @@ public: config.clear(tr); state Key destUidValue(BinaryWriter::toValue(uid, Unversioned())); - if (normalizedRanges.size() == 1) { + if (normalizedRanges.size() == 1 || isDefaultBackup(normalizedRanges)) { RangeResult existingDestUidValues = wait( tr->getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY)); bool found = false; + KeyRangeRef targetRange = + normalizedRanges.size() == 1 ? 
normalizedRanges[0] : getDefaultBackupSharedRange(); for (auto it : existingDestUidValues) { - if (BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), IncludeVersion()) == - normalizedRanges[0]) { + KeyRange uidRange = + BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), IncludeVersion()); + if (uidRange == targetRange) { destUidValue = it.value; found = true; + CODE_PROBE(targetRange == getDefaultBackupSharedRange(), + "Backup mutation sharing with default backup"); break; } } if (!found) { destUidValue = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned()); - tr->set( - BinaryWriter::toValue(normalizedRanges[0], IncludeVersion(ProtocolVersion::withSharedMutations())) - .withPrefix(destUidLookupPrefix), - destUidValue); + tr->set(BinaryWriter::toValue(targetRange, IncludeVersion(ProtocolVersion::withSharedMutations())) + .withPrefix(destUidLookupPrefix), + destUidValue); } } @@ -4635,6 +5148,7 @@ public: config.snapshotIntervalSeconds().set(tr, snapshotIntervalSeconds); config.partitionedLogEnabled().set(tr, partitionedLog); config.incrementalBackupOnly().set(tr, incrementalBackupOnly); + config.enableSnapshotBackupEncryption().set(tr, encryptionEnabled); Key taskKey = wait(fileBackup::StartFullBackupTaskFunc::addTask( tr, backupAgent->taskBucket, uid, TaskCompletionKey::noSignal())); @@ -4887,7 +5401,7 @@ public: tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); try { - tr->set(backupPausedKey, pause ? LiteralStringRef("1") : LiteralStringRef("0")); + tr->set(backupPausedKey, pause ? "1"_sr : "0"_sr); wait(tr->commit()); break; } catch (Error& e) { @@ -5057,11 +5571,11 @@ public: doc.setKey("CurrentSnapshot", snapshot); } - KeyBackedMap>::PairsType errors = + KeyBackedMap>::RangeResultType errors = wait(config.lastErrorPerType().getRange( tr, 0, std::numeric_limits::max(), CLIENT_KNOBS->TOO_MANY)); JsonBuilderArray errorList; - for (auto& e : errors) { + for (auto& e : errors.results) { std::string msg = e.second.first; Version ver = e.second.second; @@ -5209,13 +5723,13 @@ public: // Append the errors, if requested if (showErrors) { - KeyBackedMap>::PairsType errors = + KeyBackedMap>::RangeResultType errors = wait(config.lastErrorPerType().getRange( tr, 0, std::numeric_limits::max(), CLIENT_KNOBS->TOO_MANY)); std::string recentErrors; std::string pastErrors; - for (auto& e : errors) { + for (auto& e : errors.results) { Version v = e.second.second; std::string msg = format( "%s ago : %s\n", @@ -5342,7 +5856,7 @@ public: } Optional restoreSet = - wait(bc->getRestoreSet(targetVersion, ranges, onlyApplyMutationLogs, beginVersion)); + wait(bc->getRestoreSet(targetVersion, cx, ranges, onlyApplyMutationLogs, beginVersion)); if (!restoreSet.present()) { TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible") @@ -5624,6 +6138,60 @@ Future FileBackupAgent::restore(Database cx, deterministicRandom()->randomUniqueID()); } +Future FileBackupAgent::restore(Database cx, + Optional cxOrig, + Key tagName, + Key url, + Optional proxy, + WaitForComplete waitForComplete, + Version targetVersion, + Verbose verbose, + KeyRange range, + Key addPrefix, + Key removePrefix, + LockDB lockDB, + OnlyApplyMutationLogs onlyApplyMutationLogs, + InconsistentSnapshotOnly inconsistentSnapshotOnly, + Version beginVersion, + Optional const& encryptionKeyFileName) { + Standalone> rangeRef; + if (range.begin.empty() && range.end.empty()) { + addDefaultBackupRanges(rangeRef); + } else { + rangeRef.push_back_deep(rangeRef.arena(), range); + } + 
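The destUid hunk above broadens mutation-log sharing: a backup whose ranges are the default backup range set is keyed on a single shared range (getDefaultBackupSharedRange), so such backups reuse one destUid, while a single explicit range is matched as before. A simplified lookup-or-create sketch; the std::map, ToyUid, and freshUid parameter are stand-ins for the destUidLookup keyspace and random UID generation:

    #include <cstdint>
    #include <map>
    #include <string>

    using ToyUid = uint64_t;

    // Stand-in for the destUidLookup keyspace: serialized target range -> destUid.
    // In the patch, a backup over the default ranges uses the shared range as targetRange.
    ToyUid lookupOrCreateDestUid(std::map<std::string, ToyUid>& destUidLookup,
                                 const std::string& targetRange,
                                 ToyUid freshUid) {
        // Reuse an existing entry if the same range was already backed up;
        // otherwise record the freshly generated UID for future backups to share.
        auto it = destUidLookup.try_emplace(targetRange, freshUid).first;
        return it->second;
    }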
return restore(cx, + cxOrig, + tagName, + url, + proxy, + rangeRef, + waitForComplete, + targetVersion, + verbose, + addPrefix, + removePrefix, + lockDB, + onlyApplyMutationLogs, + inconsistentSnapshotOnly, + beginVersion, + encryptionKeyFileName); +} + +Future FileBackupAgent::atomicRestore(Database cx, + Key tagName, + KeyRange range, + Key addPrefix, + Key removePrefix) { + Standalone> rangeRef; + if (range.begin.empty() && range.end.empty()) { + addDefaultBackupRanges(rangeRef); + } else { + rangeRef.push_back_deep(rangeRef.arena(), range); + } + return atomicRestore(cx, tagName, rangeRef, addPrefix, removePrefix); +} + Future FileBackupAgent::atomicRestore(Database cx, Key tagName, Standalone> ranges, @@ -5656,6 +6224,7 @@ Future FileBackupAgent::submitBackup(Reference int snapshotIntervalSeconds, std::string const& tagName, Standalone> backupRanges, + bool encryptionEnabled, StopWhenDone stopWhenDone, UsePartitionedLog partitionedLog, IncrementalBackupOnly incrementalBackupOnly, @@ -5668,6 +6237,7 @@ Future FileBackupAgent::submitBackup(Reference snapshotIntervalSeconds, tagName, backupRanges, + encryptionEnabled, stopWhenDone, partitionedLog, incrementalBackupOnly, @@ -5768,7 +6338,7 @@ ACTOR static Future writeKVs(Database cx, Standalonefirst)) { @@ -153,95 +153,29 @@ void GlobalConfig::erase(KeyRangeRef range) { } } -// Older FDB versions used different keys for client profiling data. This -// function performs a one-time migration of data in these keys to the new -// global configuration key space. -ACTOR Future GlobalConfig::migrate(GlobalConfig* self) { - state Key migratedKey("\xff\x02/fdbClientInfo/migrated/"_sr); - state Reference tr; - try { - state Backoff backoff; - loop { - tr = makeReference(Database(Reference::addRef(self->cx))); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - - try { - state Optional migrated = wait(tr->get(migratedKey)); - if (migrated.present()) { - // Already performed migration. - return Void(); - } - - state Optional sampleRate = - wait(tr->get(Key("\xff\x02/fdbClientInfo/client_txn_sample_rate/"_sr))); - state Optional sizeLimit = - wait(tr->get(Key("\xff\x02/fdbClientInfo/client_txn_size_limit/"_sr))); - - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); - // The value doesn't matter too much, as long as the key is set. - tr->set(migratedKey.contents(), "1"_sr); - if (sampleRate.present()) { - const double sampleRateDbl = - BinaryReader::fromStringRef(sampleRate.get().contents(), Unversioned()); - Tuple rate = Tuple().appendDouble(sampleRateDbl); - tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSampleRate), rate.pack()); - } - if (sizeLimit.present()) { - const int64_t sizeLimitInt = - BinaryReader::fromStringRef(sizeLimit.get().contents(), Unversioned()); - Tuple size = Tuple().append(sizeLimitInt); - tr->set(GlobalConfig::prefixedKey(fdbClientInfoTxnSizeLimit), size.pack()); - } - - wait(tr->commit()); - break; - } catch (Error& e) { - // If multiple fdbserver processes are started at once, they will all - // attempt this migration at the same time, sometimes resulting in - // aborts due to conflicts. Purposefully avoid retrying, making this - // migration best-effort. - TraceEvent(SevInfo, "GlobalConfig_RetryableMigrationError").errorUnsuppressed(e).suppressFor(1.0); - wait(tr->onError(e)); - tr.clear(); - // tr is cleared, so it won't backoff properly. Use custom backoff logic here. - wait(backoff.onError()); - } - } - } catch (Error& e) { - // Catch non-retryable errors (and do nothing). 
- TraceEvent(SevWarnAlways, "GlobalConfig_MigrationError").error(e); - } - return Void(); -} - // Updates local copy of global configuration by reading the entire key-range -// from storage. -ACTOR Future GlobalConfig::refresh(GlobalConfig* self) { - // TraceEvent trace(SevInfo, "GlobalConfig_Refresh"); +// from storage (proxied through the GrvProxies). +ACTOR Future GlobalConfig::refresh(GlobalConfig* self, Version lastKnown) { + // TraceEvent trace(SevInfo, "GlobalConfigRefresh"); self->erase(KeyRangeRef(""_sr, "\xff"_sr)); - state Backoff backoff; - - state Reference tr; + state Backoff backoff(CLIENT_KNOBS->GLOBAL_CONFIG_REFRESH_BACKOFF, CLIENT_KNOBS->GLOBAL_CONFIG_REFRESH_MAX_BACKOFF); loop { try { - tr = makeReference(Database(Reference::addRef(self->cx))); - tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); - RangeResult result = wait(tr->getRange(globalConfigDataKeys, CLIENT_KNOBS->TOO_MANY)); - for (const auto& kv : result) { + GlobalConfigRefreshReply reply = + wait(timeoutError(basicLoadBalance(self->cx->getGrvProxies(UseProvisionalProxies::False), + &GrvProxyInterface::refreshGlobalConfig, + GlobalConfigRefreshRequest{ lastKnown }), + CLIENT_KNOBS->GLOBAL_CONFIG_REFRESH_TIMEOUT)); + for (const auto& kv : reply.result) { KeyRef systemKey = kv.key.removePrefix(globalConfigKeysPrefix); self->insert(systemKey, kv.value); } - break; + return Void(); } catch (Error& e) { - TraceEvent("GlobalConfigRefreshError").errorUnsuppressed(e).suppressFor(1.0); - wait(tr->onError(e)); - tr.clear(); - // tr is cleared, so it won't backoff properly. Use custom backoff logic here. wait(backoff.onError()); } } - return Void(); } // Applies updates to the local copy of the global configuration when this @@ -251,9 +185,8 @@ ACTOR Future GlobalConfig::updater(GlobalConfig* self, const ClientDBInfo* try { if (self->initialized.canBeSet()) { wait(self->cx->onConnected()); - wait(self->migrate(self)); - wait(self->refresh(self)); + wait(self->refresh(self, -1)); self->initialized.send(Void()); } @@ -270,7 +203,7 @@ ACTOR Future GlobalConfig::updater(GlobalConfig* self, const ClientDBInfo* // This process missed too many global configuration // history updates or the protocol version changed, so it // must re-read the entire configuration range. 
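The rewritten refresh path above replaces the per-client transaction read with one proxied request bounded by a timeout and retried under a knob-configured Backoff. A schematic, synchronous stand-in for that retry shape in standard C++; doRefresh and the backoff parameters are illustrative, and the real code is an ACTOR built on flow futures rather than threads:

    #include <algorithm>
    #include <chrono>
    #include <exception>
    #include <functional>
    #include <thread>

    // Retries an operation that may throw (e.g. on timeout), sleeping with
    // exponential backoff between attempts, capped at maxBackoffSeconds.
    void retryWithBackoff(const std::function<void()>& doRefresh,
                          double initialBackoffSeconds,
                          double maxBackoffSeconds) {
        double backoff = initialBackoffSeconds;
        for (;;) {
            try {
                doRefresh(); // e.g. the proxied refresh bounded by a timeout
                return;
            } catch (const std::exception&) {
                std::this_thread::sleep_for(std::chrono::duration<double>(backoff));
                backoff = std::min(backoff * 2, maxBackoffSeconds);
            }
        }
    }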
- wait(self->refresh(self)); + wait(self->refresh(self, history.back().version)); if (dbInfo->history.size() > 0) { self->lastUpdate = dbInfo->history.back().version; } diff --git a/fdbclient/KeyRangeMap.actor.cpp b/fdbclient/KeyRangeMap.actor.cpp index c736c714bf..a678c28e4a 100644 --- a/fdbclient/KeyRangeMap.actor.cpp +++ b/fdbclient/KeyRangeMap.actor.cpp @@ -23,6 +23,7 @@ #include "fdbclient/CommitTransaction.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/ReadYourWrites.h" +#include "flow/UnitTest.h" #include "flow/actorcompiler.h" // has to be last include void KeyRangeActorMap::getRangesAffectedByInsertion(const KeyRangeRef& keys, std::vector& affectedRanges) { @@ -35,32 +36,54 @@ void KeyRangeActorMap::getRangesAffectedByInsertion(const KeyRangeRef& keys, std affectedRanges.push_back(KeyRangeRef(keys.end, e.end())); } -RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv) { +RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv, bool align) { ASSERT(!kv.more || kv.size() > 1); KeyRange withPrefix = KeyRangeRef(mapPrefix.toString() + keys.begin.toString(), mapPrefix.toString() + keys.end.toString()); - ValueRef beginValue, endValue; - if (kv.size() && kv[0].key.startsWith(mapPrefix)) - beginValue = kv[0].value; - if (kv.size() && kv.end()[-1].key.startsWith(mapPrefix)) - endValue = kv.end()[-1].value; - RangeResult result; result.arena().dependsOn(kv.arena()); result.arena().dependsOn(keys.arena()); - result.push_back(result.arena(), KeyValueRef(keys.begin, beginValue)); + // Always push a kv pair <= keys.begin. + KeyRef beginKey = keys.begin; + if (!align && !kv.empty() && kv.front().key.startsWith(mapPrefix) && kv.front().key < withPrefix.begin) { + beginKey = kv[0].key.removePrefix(mapPrefix); + } + ValueRef beginValue; + if (!kv.empty() && kv.front().key.startsWith(mapPrefix) && kv.front().key <= withPrefix.begin) { + beginValue = kv.front().value; + } + result.push_back(result.arena(), KeyValueRef(beginKey, beginValue)); + for (int i = 0; i < kv.size(); i++) { if (kv[i].key > withPrefix.begin && kv[i].key < withPrefix.end) { KeyRef k = kv[i].key.removePrefix(mapPrefix); result.push_back(result.arena(), KeyValueRef(k, kv[i].value)); - } else if (kv[i].key >= withPrefix.end) + } else if (kv[i].key >= withPrefix.end) { kv.more = false; + // There should be at most 1 value past mapPrefix + keys.end. + ASSERT(i == kv.size() - 1); + break; + } } - if (!kv.more) - result.push_back(result.arena(), KeyValueRef(keys.end, endValue)); + if (!kv.more) { + KeyRef endKey = keys.end; + if (!align && !kv.empty() && kv.back().key.startsWith(mapPrefix) && kv.back().key >= withPrefix.end) { + endKey = kv.back().key.removePrefix(mapPrefix); + } + ValueRef endValue; + if (!kv.empty()) { + // In the aligned case, carry the last value to be the end value. 
+ if (align && kv.back().key.startsWith(mapPrefix) && kv.back().key > withPrefix.end) { + endValue = result.back().value; + } else { + endValue = kv.back().value; + } + } + result.push_back(result.arena(), KeyValueRef(endKey, endValue)); + } result.more = kv.more; return result; @@ -93,6 +116,43 @@ ACTOR Future krmGetRanges(Reference tr, return krmDecodeRanges(mapPrefix, keys, kv); } +// Returns keys.begin, all transitional points in keys, and keys.end, and their values +ACTOR Future krmGetRangesUnaligned(Transaction* tr, + Key mapPrefix, + KeyRange keys, + int limit, + int limitBytes) { + KeyRange withPrefix = + KeyRangeRef(mapPrefix.toString() + keys.begin.toString(), mapPrefix.toString() + keys.end.toString()); + + state GetRangeLimits limits(limit, limitBytes); + limits.minRows = 2; + // wait to include the next highest row >= keys.end in the result, so since end is exclusive, we need +2 and + // !orEqual + RangeResult kv = + wait(tr->getRange(lastLessOrEqual(withPrefix.begin), KeySelectorRef(withPrefix.end, false, +2), limits)); + + return krmDecodeRanges(mapPrefix, keys, kv, false); +} + +ACTOR Future krmGetRangesUnaligned(Reference tr, + Key mapPrefix, + KeyRange keys, + int limit, + int limitBytes) { + KeyRange withPrefix = + KeyRangeRef(mapPrefix.toString() + keys.begin.toString(), mapPrefix.toString() + keys.end.toString()); + + state GetRangeLimits limits(limit, limitBytes); + limits.minRows = 2; + // wait to include the next highest row >= keys.end in the result, so since end is exclusive, we need +2 and + // !orEqual + RangeResult kv = + wait(tr->getRange(lastLessOrEqual(withPrefix.begin), KeySelectorRef(withPrefix.end, false, +2), limits)); + + return krmDecodeRanges(mapPrefix, keys, kv, false); +} + void krmSetPreviouslyEmptyRange(Transaction* tr, const KeyRef& mapPrefix, const KeyRangeRef& keys, @@ -186,7 +246,7 @@ static Future krmSetRangeCoalescing_(Transaction* tr, // Determine how far to extend this range at the beginning auto beginRange = keys[0].get(); bool hasBegin = beginRange.size() > 0 && beginRange[0].key.startsWith(mapPrefix); - Value beginValue = hasBegin ? beginRange[0].value : LiteralStringRef(""); + Value beginValue = hasBegin ? beginRange[0].value : ""_sr; state Key beginKey = withPrefix.begin; if (beginValue == value) { @@ -199,7 +259,7 @@ static Future krmSetRangeCoalescing_(Transaction* tr, bool hasEnd = endRange.size() >= 1 && endRange[0].key.startsWith(mapPrefix) && endRange[0].key <= withPrefix.end; bool hasNext = (endRange.size() == 2 && endRange[1].key.startsWith(mapPrefix)) || (endRange.size() == 1 && withPrefix.end < endRange[0].key && endRange[0].key.startsWith(mapPrefix)); - Value existingValue = hasEnd ? endRange[0].value : LiteralStringRef(""); + Value existingValue = hasEnd ? endRange[0].value : ""_sr; bool valueMatches = value == existingValue; KeyRange conflictRange = KeyRangeRef(hasBegin ? 
beginRange[0].key : mapPrefix, withPrefix.begin); @@ -254,3 +314,107 @@ Future krmSetRangeCoalescing(Reference const& t Value const& value) { return holdWhile(tr, krmSetRangeCoalescing_(tr.getPtr(), mapPrefix, range, maxRange, value)); } + +TEST_CASE("/keyrangemap/decoderange/aligned") { + Arena arena; + Key prefix = "/prefix/"_sr; + StringRef fullKeyA = StringRef(arena, "/prefix/a"_sr); + StringRef fullKeyB = StringRef(arena, "/prefix/b"_sr); + StringRef fullKeyC = StringRef(arena, "/prefix/c"_sr); + StringRef fullKeyD = StringRef(arena, "/prefix/d"_sr); + + StringRef keyA = StringRef(arena, "a"_sr); + StringRef keyB = StringRef(arena, "b"_sr); + StringRef keyC = StringRef(arena, "c"_sr); + StringRef keyD = StringRef(arena, "d"_sr); + StringRef keyE = StringRef(arena, "e"_sr); + StringRef keyAB = StringRef(arena, "ab"_sr); + StringRef keyAC = StringRef(arena, "ac"_sr); + StringRef keyCD = StringRef(arena, "cd"_sr); + + // Fake getRange() call. + RangeResult kv; + kv.push_back(arena, KeyValueRef(fullKeyA, keyA)); + kv.push_back(arena, KeyValueRef(fullKeyB, keyB)); + + // [A, AB(start), AC(start), B] + RangeResult decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(keyAB, keyAC), kv); + ASSERT(decodedRanges.size() == 2); + ASSERT(decodedRanges.front().key == keyAB); + ASSERT(decodedRanges.front().value == keyA); + ASSERT(decodedRanges.back().key == keyAC); + ASSERT(decodedRanges.back().value == keyA); + + kv.push_back(arena, KeyValueRef(fullKeyC, keyC)); + kv.push_back(arena, KeyValueRef(fullKeyD, keyD)); + + // [A, AB(start), B, C, CD(end), D] + decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(keyAB, keyCD), kv); + ASSERT(decodedRanges.size() == 4); + ASSERT(decodedRanges.front().key == keyAB); + ASSERT(decodedRanges.front().value == keyA); + ASSERT(decodedRanges.back().key == keyCD); + ASSERT(decodedRanges.back().value == keyC); + + // [""(start), A, B, C, D, E(end)] + decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(StringRef(), keyE), kv); + ASSERT(decodedRanges.size() == 6); + ASSERT(decodedRanges.front().key == StringRef()); + ASSERT(decodedRanges.front().value == StringRef()); + ASSERT(decodedRanges.back().key == keyE); + ASSERT(decodedRanges.back().value == keyD); + + return Void(); +} + +TEST_CASE("/keyrangemap/decoderange/unaligned") { + Arena arena; + Key prefix = "/prefix/"_sr; + StringRef fullKeyA = StringRef(arena, "/prefix/a"_sr); + StringRef fullKeyB = StringRef(arena, "/prefix/b"_sr); + StringRef fullKeyC = StringRef(arena, "/prefix/c"_sr); + StringRef fullKeyD = StringRef(arena, "/prefix/d"_sr); + + StringRef keyA = StringRef(arena, "a"_sr); + StringRef keyB = StringRef(arena, "b"_sr); + StringRef keyC = StringRef(arena, "c"_sr); + StringRef keyD = StringRef(arena, "d"_sr); + StringRef keyE = StringRef(arena, "e"_sr); + StringRef keyAB = StringRef(arena, "ab"_sr); + StringRef keyAC = StringRef(arena, "ac"_sr); + StringRef keyCD = StringRef(arena, "cd"_sr); + + // Fake getRange() call. 
+ RangeResult kv; + kv.push_back(arena, KeyValueRef(fullKeyA, keyA)); + kv.push_back(arena, KeyValueRef(fullKeyB, keyB)); + + // [A, AB(start), AC(start), B] + RangeResult decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(keyAB, keyAC), kv, false); + ASSERT(decodedRanges.size() == 2); + ASSERT(decodedRanges.front().key == keyA); + ASSERT(decodedRanges.front().value == keyA); + ASSERT(decodedRanges.back().key == keyB); + ASSERT(decodedRanges.back().value == keyB); + + kv.push_back(arena, KeyValueRef(fullKeyC, keyC)); + kv.push_back(arena, KeyValueRef(fullKeyD, keyD)); + + // [A, AB(start), B, C, CD(end), D] + decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(keyAB, keyCD), kv, false); + ASSERT(decodedRanges.size() == 4); + ASSERT(decodedRanges.front().key == keyA); + ASSERT(decodedRanges.front().value == keyA); + ASSERT(decodedRanges.back().key == keyD); + ASSERT(decodedRanges.back().value == keyD); + + // [""(start), A, B, C, D, E(end)] + decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(StringRef(), keyE), kv, false); + ASSERT(decodedRanges.size() == 6); + ASSERT(decodedRanges.front().key == StringRef()); + ASSERT(decodedRanges.front().value == StringRef()); + ASSERT(decodedRanges.back().key == keyE); + ASSERT(decodedRanges.back().value == keyD); + + return Void(); +} \ No newline at end of file diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 0ff62e276f..4119940d44 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -22,6 +22,7 @@ #include #include +#include "fdbclient/GenericManagementAPI.actor.h" #include "fmt/format.h" #include "fdbclient/Knobs.h" #include "flow/Arena.h" @@ -200,6 +201,20 @@ std::map configForToken(std::string const& mode) { } out[p + key] = format("%d", tenantMode); } + + if (key == "encryption_at_rest_mode") { + EncryptionAtRestMode mode; + if (value == "disabled") { + mode = EncryptionAtRestMode::DISABLED; + } else if (value == "aes_256_ctr") { + mode = EncryptionAtRestMode::AES_256_CTR; + } else { + printf("Error: Only disabled|aes_256_ctr are valid for encryption_at_rest_mode.\n"); + return out; + } + out[p + key] = format("%d", mode); + } + return out; } @@ -803,6 +818,8 @@ ACTOR Future> getConnectionString(Database cx) } } +static std::vector connectionStrings; + namespace { ACTOR Future> getClusterConnectionStringFromStorageServer(Transaction* tr) { @@ -820,6 +837,19 @@ ACTOR Future> getClusterConnectionStringFromSt Version readVersion = wait(tr->getReadVersion()); state Optional currentKey = wait(tr->get(coordinatorsKey)); + if (g_network->isSimulated() && currentKey.present()) { + // If the change coordinators request succeeded, the coordinators + // should have changed to the connection string of the most + // recently issued request. If instead the connection string is + // equal to one of the previously issued requests, there is a bug + // and we are breaking the promises we make with + // commit_unknown_result (the transaction must no longer be in + // progress when receiving commit_unknown_result). + int n = connectionStrings.size() > 0 ? connectionStrings.size() - 1 : 0; // avoid underflow + for (int i = 0; i < n; ++i) { + ASSERT(currentKey.get() != connectionStrings.at(i)); + } + } if (!currentKey.present()) { // Someone deleted this key entirely? 
@@ -842,12 +872,59 @@ ACTOR Future> getClusterConnectionStringFromSt } } +ACTOR Future verifyConfigurationDatabaseAlive(Database cx) { + state Backoff backoff; + state Reference configTr; + loop { + try { + // Attempt to read a random value from the configuration + // database to make sure it is online. + configTr = ISingleThreadTransaction::create(ISingleThreadTransaction::Type::PAXOS_CONFIG, cx); + Tuple tuple; + tuple.appendNull(); // config class + tuple << "test"_sr; + Optional serializedValue = wait(configTr->get(tuple.pack())); + TraceEvent("ChangeQuorumCheckerNewCoordinatorsOnline").log(); + return Void(); + } catch (Error& e) { + TraceEvent("ChangeQuorumCheckerNewCoordinatorsError").error(e); + if (e.code() == error_code_coordinators_changed) { + wait(backoff.onError()); + configTr->reset(); + } else { + wait(configTr->onError(e)); + } + } + } +} + +ACTOR Future resetPreviousCoordinatorsKey(Database cx) { + loop { + // When the change coordinators transaction succeeds, it uses the + // special key space error message to return a message to the client. + // This causes the underlying transaction to not be committed. In order + // to make sure we clear the previous coordinators key, we have to use + // a new transaction here. + state Reference clearTr = + ISingleThreadTransaction::create(ISingleThreadTransaction::Type::RYW, cx); + try { + clearTr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + clearTr->clear(previousCoordinatorsKey); + wait(clearTr->commit()); + return Void(); + } catch (Error& e2) { + wait(clearTr->onError(e2)); + } + } +} + } // namespace ACTOR Future> changeQuorumChecker(Transaction* tr, ClusterConnectionString* conn, - std::string newName) { - + std::string newName, + bool disableConfigDB) { + TraceEvent("ChangeQuorumCheckerStart").detail("NewConnectionString", conn->toString()); state Optional clusterConnectionStringOptional = wait(getClusterConnectionStringFromStorageServer(tr)); @@ -862,7 +939,7 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, conn->hostnames = old.hostnames; conn->coords = old.coords; } - std::vector desiredCoordinators = wait(conn->tryResolveHostnames()); + state std::vector desiredCoordinators = wait(conn->tryResolveHostnames()); if (desiredCoordinators.size() != conn->hostnames.size() + conn->coords.size()) { TraceEvent("ChangeQuorumCheckerEarlyTermination") .detail("Reason", "One or more hostnames are unresolvable") @@ -878,16 +955,25 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, std::sort(old.hostnames.begin(), old.hostnames.end()); std::sort(old.coords.begin(), old.coords.end()); if (conn->hostnames == old.hostnames && conn->coords == old.coords && old.clusterKeyName() == newName) { + connectionStrings.clear(); + if (g_network->isSimulated() && g_simulator->configDBType == ConfigDBType::DISABLED) { + disableConfigDB = true; + } + if (!disableConfigDB) { + wait(verifyConfigurationDatabaseAlive(tr->getDatabase())); + } + wait(resetPreviousCoordinatorsKey(tr->getDatabase())); return CoordinatorsResult::SAME_NETWORK_ADDRESSES; } conn->parseKey(newName + ':' + deterministicRandom()->randomAlphaNumeric(32)); + connectionStrings.push_back(conn->toString()); if (g_network->isSimulated()) { int i = 0; int protectedCount = 0; while ((protectedCount < ((desiredCoordinators.size() / 2) + 1)) && (i < desiredCoordinators.size())) { - auto process = g_simulator.getProcessByAddress(desiredCoordinators[i]); + auto process = g_simulator->getProcessByAddress(desiredCoordinators[i]); auto addresses = process->addresses; if 
(!process->isReliable()) { @@ -895,9 +981,9 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, continue; } - g_simulator.protectedAddresses.insert(process->addresses.address); + g_simulator->protectedAddresses.insert(process->addresses.address); if (addresses.secondaryAddress.present()) { - g_simulator.protectedAddresses.insert(process->addresses.secondaryAddress.get()); + g_simulator->protectedAddresses.insert(process->addresses.secondaryAddress.get()); } TraceEvent("ProtectCoordinator").detail("Address", desiredCoordinators[i]).backtrace(); protectedCount++; @@ -924,8 +1010,13 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, choose { when(wait(waitForAll(leaderServers))) {} - when(wait(delay(5.0))) { return CoordinatorsResult::COORDINATOR_UNREACHABLE; } + when(wait(delay(5.0))) { + return CoordinatorsResult::COORDINATOR_UNREACHABLE; + } } + TraceEvent("ChangeQuorumCheckerSetCoordinatorsKey") + .detail("CurrentCoordinators", old.toString()) + .detail("NewCoordinators", conn->toString()); tr->set(coordinatorsKey, conn->toString()); return Optional(); } @@ -988,12 +1079,12 @@ ACTOR Future changeQuorum(Database cx, ReferenceisSimulated()) { for (int i = 0; i < (desiredCoordinators.size() / 2) + 1; i++) { - auto process = g_simulator.getProcessByAddress(desiredCoordinators[i]); + auto process = g_simulator->getProcessByAddress(desiredCoordinators[i]); ASSERT(process->isReliable() || process->rebooting); - g_simulator.protectedAddresses.insert(process->addresses.address); + g_simulator->protectedAddresses.insert(process->addresses.address); if (process->addresses.secondaryAddress.present()) { - g_simulator.protectedAddresses.insert(process->addresses.secondaryAddress.get()); + g_simulator->protectedAddresses.insert(process->addresses.secondaryAddress.get()); } TraceEvent("ProtectCoordinator").detail("Address", desiredCoordinators[i]).backtrace(); } @@ -1002,8 +1093,8 @@ ACTOR Future changeQuorum(Database cx, Reference>> leaderServers; state ClientCoordinators coord(Reference( @@ -1023,7 +1114,9 @@ ACTOR Future changeQuorum(Database cx, Reference getRedundancy(AutoQuorumChange* self, Transaction* tr) { - state Future> fStorageReplicas = - tr->get(LiteralStringRef("storage_replicas").withPrefix(configKeysPrefix)); - state Future> fLogReplicas = - tr->get(LiteralStringRef("log_replicas").withPrefix(configKeysPrefix)); + state Future> fStorageReplicas = tr->get("storage_replicas"_sr.withPrefix(configKeysPrefix)); + state Future> fLogReplicas = tr->get("log_replicas"_sr.withPrefix(configKeysPrefix)); wait(success(fStorageReplicas) && success(fLogReplicas)); int redundancy = std::min(atoi(fStorageReplicas.get().get().toString().c_str()), atoi(fLogReplicas.get().get().toString().c_str())); @@ -1232,10 +1323,7 @@ struct AutoQuorumChange final : IQuorumChange { std::map> currentCounts; std::map hardLimits; - std::vector fields({ LiteralStringRef("dcid"), - LiteralStringRef("data_hall"), - LiteralStringRef("zoneid"), - LiteralStringRef("machineid") }); + std::vector fields({ "dcid"_sr, "data_hall"_sr, "zoneid"_sr, "machineid"_sr }); for (auto field = fields.begin(); field != fields.end(); field++) { if (field->toString() == "zoneid") { @@ -1252,7 +1340,7 @@ struct AutoQuorumChange final : IQuorumChange { continue; } // Exclude faulty node due to machine assassination - if (g_network->isSimulated() && !g_simulator.getProcessByAddress(worker->address)->isReliable()) { + if (g_network->isSimulated() && !g_simulator->getProcessByAddress(worker->address)->isReliable()) { 
TraceEvent("AutoSelectCoordinators").detail("SkipUnreliableWorker", worker->address.toString()); continue; } @@ -1261,7 +1349,7 @@ struct AutoQuorumChange final : IQuorumChange { if (maxCounts[*field] == 0) { maxCounts[*field] = 1; } - auto value = worker->locality.get(*field).orDefault(LiteralStringRef("")); + auto value = worker->locality.get(*field).orDefault(""_sr); auto currentCount = currentCounts[*field][value]; if (currentCount >= maxCounts[*field]) { valid = false; @@ -1270,7 +1358,7 @@ struct AutoQuorumChange final : IQuorumChange { } if (valid) { for (auto field = fields.begin(); field != fields.end(); field++) { - auto value = worker->locality.get(*field).orDefault(LiteralStringRef("")); + auto value = worker->locality.get(*field).orDefault(""_sr); currentCounts[*field][value] += 1; } chosen.push_back(worker->address); @@ -1323,6 +1411,7 @@ ACTOR Future excludeServers(Database cx, std::vector ser state ReadYourWritesTransaction ryw(cx); loop { try { + ryw.setOption(FDBTransactionOptions::RAW_ACCESS); ryw.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); ryw.set( SpecialKeySpace::getManagementApiCommandOptionSpecialKey(failed ? "failed" : "excluded", "force"), @@ -1385,6 +1474,7 @@ ACTOR Future excludeLocalities(Database cx, std::unordered_set includeServers(Database cx, std::vector ser state ReadYourWritesTransaction ryw(cx); loop { try { + ryw.setOption(FDBTransactionOptions::RAW_ACCESS); ryw.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); for (auto& s : servers) { if (!s.isValid()) { @@ -1449,8 +1540,7 @@ ACTOR Future includeServers(Database cx, std::vector ser // This is why we now make two clears: first only of the ip // address, the second will delete all ports. if (s.isWholeMachine()) - ryw.clear(KeyRangeRef(addr.withSuffix(LiteralStringRef(":")), - addr.withSuffix(LiteralStringRef(";")))); + ryw.clear(KeyRangeRef(addr.withSuffix(":"_sr), addr.withSuffix(";"_sr))); } } TraceEvent("IncludeServersCommit").detail("Servers", describe(servers)).detail("Failed", failed); @@ -1530,6 +1620,7 @@ ACTOR Future includeLocalities(Database cx, std::vector local state ReadYourWritesTransaction ryw(cx); loop { try { + ryw.setOption(FDBTransactionOptions::RAW_ACCESS); ryw.setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); if (includeAll) { if (failed) { @@ -2029,9 +2120,7 @@ ACTOR Future lockDatabase(Transaction* tr, UID id) { } tr->atomicOp(databaseLockedKey, - BinaryWriter::toValue(id, Unversioned()) - .withPrefix(LiteralStringRef("0123456789")) - .withSuffix(LiteralStringRef("\x00\x00\x00\x00")), + BinaryWriter::toValue(id, Unversioned()).withPrefix("0123456789"_sr).withSuffix("\x00\x00\x00\x00"_sr), MutationRef::SetVersionstampedValue); tr->addWriteConflictRange(normalKeys); return Void(); @@ -2052,9 +2141,7 @@ ACTOR Future lockDatabase(Reference tr, UID id) } tr->atomicOp(databaseLockedKey, - BinaryWriter::toValue(id, Unversioned()) - .withPrefix(LiteralStringRef("0123456789")) - .withSuffix(LiteralStringRef("\x00\x00\x00\x00")), + BinaryWriter::toValue(id, Unversioned()).withPrefix("0123456789"_sr).withSuffix("\x00\x00\x00\x00"_sr), MutationRef::SetVersionstampedValue); tr->addWriteConflictRange(normalKeys); return Void(); @@ -2173,7 +2260,7 @@ ACTOR Future updateChangeFeed(Transaction* tr, Key rangeID, ChangeFeedStat } else if (status == ChangeFeedStatus::CHANGE_FEED_DESTROY) { if (val.present()) { if (g_network->isSimulated()) { - g_simulator.validationData.allDestroyedChangeFeedIDs.insert(rangeID.toString()); + 
g_simulator->validationData.allDestroyedChangeFeedIDs.insert(rangeID.toString()); } tr->set(rangeIDKey, changeFeedValue(std::get<0>(decodeChangeFeedValue(val.get())), @@ -2211,7 +2298,7 @@ ACTOR Future updateChangeFeed(Reference tr, } else if (status == ChangeFeedStatus::CHANGE_FEED_DESTROY) { if (val.present()) { if (g_network->isSimulated()) { - g_simulator.validationData.allDestroyedChangeFeedIDs.insert(rangeID.toString()); + g_simulator->validationData.allDestroyedChangeFeedIDs.insert(rangeID.toString()); } tr->set(rangeIDKey, changeFeedValue(std::get<0>(decodeChangeFeedValue(val.get())), @@ -2461,6 +2548,21 @@ bool schemaMatch(json_spirit::mValue const& schemaValue, } } +void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota) { + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + auto key = storageQuotaKey(tenantName); + tr.set(key, BinaryWriter::toValue(quota, Unversioned())); +} + +ACTOR Future> getStorageQuota(Transaction* tr, StringRef tenantName) { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state Optional v = wait(tr->get(storageQuotaKey(tenantName))); + if (!v.present()) { + return Optional(); + } + return BinaryReader::fromStringRef(v.get(), Unversioned()); +} + std::string ManagementAPI::generateErrorMessage(const CoordinatorsResult& res) { // Note: the error message here should not be changed if possible // If you do change the message here, @@ -2509,24 +2611,24 @@ TEST_CASE("/ManagementAPI/AutoQuorumChange/checkLocality") { auto dataHall = dataCenter + std::to_string(i / 2 % 2); auto rack = dataHall + std::to_string(i % 2); auto machineId = rack + std::to_string(i); - data.locality.set(LiteralStringRef("dcid"), StringRef(dataCenter)); - data.locality.set(LiteralStringRef("data_hall"), StringRef(dataHall)); - data.locality.set(LiteralStringRef("rack"), StringRef(rack)); - data.locality.set(LiteralStringRef("zoneid"), StringRef(rack)); - data.locality.set(LiteralStringRef("machineid"), StringRef(machineId)); + data.locality.set("dcid"_sr, StringRef(dataCenter)); + data.locality.set("data_hall"_sr, StringRef(dataHall)); + data.locality.set("rack"_sr, StringRef(rack)); + data.locality.set("zoneid"_sr, StringRef(rack)); + data.locality.set("machineid"_sr, StringRef(machineId)); data.address.ip = IPAddress(i); if (g_network->isSimulated()) { - g_simulator.newProcess("TestCoordinator", - data.address.ip, - data.address.port, - false, - 1, - data.locality, - ProcessClass(ProcessClass::CoordinatorClass, ProcessClass::CommandLineSource), - "", - "", - currentProtocolVersion); + g_simulator->newProcess("TestCoordinator", + data.address.ip, + data.address.port, + false, + 1, + data.locality, + ProcessClass(ProcessClass::CoordinatorClass, ProcessClass::CommandLineSource), + "", + "", + currentProtocolVersion()); } workers.push_back(data); @@ -2539,10 +2641,7 @@ TEST_CASE("/ManagementAPI/AutoQuorumChange/checkLocality") { std::map> chosenValues; ASSERT(chosen.size() == 5); - std::vector fields({ LiteralStringRef("dcid"), - LiteralStringRef("data_hall"), - LiteralStringRef("zoneid"), - LiteralStringRef("machineid") }); + std::vector fields({ "dcid"_sr, "data_hall"_sr, "zoneid"_sr, "machineid"_sr }); for (auto worker = chosen.begin(); worker != chosen.end(); worker++) { ASSERT(worker->ip.toV4() < workers.size()); LocalityData data = workers[worker->ip.toV4()].locality; @@ -2551,10 +2650,10 @@ TEST_CASE("/ManagementAPI/AutoQuorumChange/checkLocality") { } } - ASSERT(chosenValues[LiteralStringRef("dcid")].size() == 2); - 
ASSERT(chosenValues[LiteralStringRef("data_hall")].size() == 4); - ASSERT(chosenValues[LiteralStringRef("zoneid")].size() == 5); - ASSERT(chosenValues[LiteralStringRef("machineid")].size() == 5); + ASSERT(chosenValues["dcid"_sr].size() == 2); + ASSERT(chosenValues["data_hall"_sr].size() == 4); + ASSERT(chosenValues["zoneid"_sr].size() == 5); + ASSERT(chosenValues["machineid"_sr].size() == 5); ASSERT(std::find(chosen.begin(), chosen.end(), workers[noAssignIndex].address) != chosen.end()); return Void(); diff --git a/fdbclient/Metacluster.cpp b/fdbclient/Metacluster.cpp new file mode 100644 index 0000000000..6463033db8 --- /dev/null +++ b/fdbclient/Metacluster.cpp @@ -0,0 +1,71 @@ +/* + * Metacluster.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/Metacluster.h" +#include "fdbclient/MetaclusterManagement.actor.h" + +FDB_DEFINE_BOOLEAN_PARAM(AddNewTenants); +FDB_DEFINE_BOOLEAN_PARAM(RemoveMissingTenants); + +std::string DataClusterEntry::clusterStateToString(DataClusterState clusterState) { + switch (clusterState) { + case DataClusterState::READY: + return "ready"; + case DataClusterState::REMOVING: + return "removing"; + case DataClusterState::RESTORING: + return "restoring"; + default: + UNREACHABLE(); + } +} + +DataClusterState DataClusterEntry::stringToClusterState(std::string stateStr) { + if (stateStr == "ready") { + return DataClusterState::READY; + } else if (stateStr == "removing") { + return DataClusterState::REMOVING; + } else if (stateStr == "restoring") { + return DataClusterState::RESTORING; + } + + UNREACHABLE(); +} + +json_spirit::mObject DataClusterEntry::toJson() const { + json_spirit::mObject obj; + obj["capacity"] = capacity.toJson(); + obj["allocated"] = allocated.toJson(); + obj["cluster_state"] = DataClusterEntry::clusterStateToString(clusterState); + return obj; +} + +json_spirit::mObject ClusterUsage::toJson() const { + json_spirit::mObject obj; + obj["num_tenant_groups"] = numTenantGroups; + return obj; +} + +KeyBackedObjectProperty& +MetaclusterMetadata::metaclusterRegistration() { + static KeyBackedObjectProperty instance( + "\xff/metacluster/clusterRegistration"_sr, IncludeVersion()); + return instance; +} \ No newline at end of file diff --git a/fdbclient/MetaclusterManagement.actor.cpp b/fdbclient/MetaclusterManagement.actor.cpp new file mode 100644 index 0000000000..33403300bd --- /dev/null +++ b/fdbclient/MetaclusterManagement.actor.cpp @@ -0,0 +1,67 @@ +/* + * MetaclusterManagement.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/ClusterConnectionMemoryRecord.h" +#include "fdbclient/DatabaseContext.h" +#include "fdbclient/FDBTypes.h" +#include "fdbclient/MetaclusterManagement.actor.h" +#include "fdbclient/ThreadSafeTransaction.h" +#include "flow/actorcompiler.h" // has to be last include + +namespace MetaclusterAPI { + +ACTOR Future> openDatabase(ClusterConnectionString connectionString) { + if (g_network->isSimulated()) { + Reference clusterFile = + makeReference(connectionString); + Database nativeDb = Database::createDatabase(clusterFile, -1); + Reference threadSafeDb = + wait(unsafeThreadFutureToFuture(ThreadSafeDatabase::createFromExistingDatabase(nativeDb))); + return MultiVersionDatabase::debugCreateFromExistingDatabase(threadSafeDb); + } else { + return MultiVersionApi::api->createDatabaseFromConnectionString(connectionString.toString().c_str()); + } +} + +KeyBackedObjectMap& +ManagementClusterMetadata::dataClusters() { + static KeyBackedObjectMap instance( + "metacluster/dataCluster/metadata/"_sr, IncludeVersion()); + return instance; +} + +KeyBackedMap, + ManagementClusterMetadata::ConnectionStringCodec> + ManagementClusterMetadata::dataClusterConnectionRecords("metacluster/dataCluster/connectionString/"_sr); + +KeyBackedSet ManagementClusterMetadata::clusterCapacityIndex("metacluster/clusterCapacityIndex/"_sr); +KeyBackedMap, BinaryCodec> + ManagementClusterMetadata::clusterTenantCount("metacluster/clusterTenantCount/"_sr); +KeyBackedSet ManagementClusterMetadata::clusterTenantIndex("metacluster/dataCluster/tenantMap/"_sr); +KeyBackedSet ManagementClusterMetadata::clusterTenantGroupIndex("metacluster/dataCluster/tenantGroupMap/"_sr); + +TenantMetadataSpecification& ManagementClusterMetadata::tenantMetadata() { + static TenantMetadataSpecification instance(""_sr); + return instance; +} + +}; // namespace MetaclusterAPI \ No newline at end of file diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index f0215ea36d..6c34369e6a 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -248,7 +248,7 @@ TEST_CASE("/fdbclient/MonitorLeader/ConnectionString/hostname") { hostnames.push_back(Hostname::parse(hn1 + ":" + port1)); hostnames.push_back(Hostname::parse(hn2 + ":" + port2)); - ClusterConnectionString cs(hostnames, LiteralStringRef("TestCluster:0")); + ClusterConnectionString cs(hostnames, "TestCluster:0"_sr); ASSERT(cs.hostnames.size() == 2); ASSERT(cs.coords.size() == 0); ASSERT(cs.toString() == connectionString); @@ -259,7 +259,7 @@ TEST_CASE("/fdbclient/MonitorLeader/ConnectionString/hostname") { hostnames.push_back(Hostname::parse(hn1 + ":" + port1)); hostnames.push_back(Hostname::parse(hn1 + ":" + port1)); try { - ClusterConnectionString cs(hostnames, LiteralStringRef("TestCluster:0")); + ClusterConnectionString cs(hostnames, "TestCluster:0"_sr); } catch (Error& e) { ASSERT(e.code() == error_code_connection_string_invalid); } @@ -367,7 +367,7 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/fuzz") { auto c = connectionString.begin(); while (c != connectionString.end()) { if 
(deterministicRandom()->random01() < 0.1) // Add whitespace character - output += deterministicRandom()->randomChoice(LiteralStringRef(" \t\n\r")); + output += deterministicRandom()->randomChoice(" \t\n\r"_sr); if (deterministicRandom()->random01() < 0.5) { // Add one of the input characters output += *c; ++c; @@ -376,9 +376,9 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/fuzz") { output += "#"; int charCount = deterministicRandom()->randomInt(0, 20); for (int i = 0; i < charCount; i++) { - output += deterministicRandom()->randomChoice(LiteralStringRef("asdfzxcv123345:!@#$#$&()<\"\' \t")); + output += deterministicRandom()->randomChoice("asdfzxcv123345:!@#$#$&()<\"\' \t"_sr); } - output += deterministicRandom()->randomChoice(LiteralStringRef("\n\r")); + output += deterministicRandom()->randomChoice("\n\r"_sr); } } @@ -501,6 +501,7 @@ ACTOR Future monitorNominee(Key key, Optional* info) { loop { state Optional li; + wait(Future(Void())); // Make sure we weren't cancelled if (coord.hostname.present()) { wait(store(li, retryGetReplyFromHostname(GetLeaderRequest(key, info->present() ? info->get().changeID : UID()), @@ -663,69 +664,43 @@ ACTOR Future asyncDeserializeClusterInterface(Reference> s } } -struct ClientStatusStats { - int count; - std::vector> examples; +namespace { - ClientStatusStats() : count(0) { examples.reserve(CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT); } -}; +void tryInsertIntoSamples(OpenDatabaseRequest::Samples& samples, + const NetworkAddress& networkAddress, + const Key& traceLogGroup) { + ++samples.count; + if (samples.samples.size() < static_cast(CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT)) { + samples.samples.insert({ networkAddress, traceLogGroup }); + } +} + +} // namespace OpenDatabaseRequest ClientData::getRequest() { OpenDatabaseRequest req; - std::map issueMap; - std::map versionMap; - std::map maxProtocolMap; - int clientCount = 0; - - // SOMEDAY: add a yield in this loop for (auto& ci : clientStatusInfoMap) { - for (auto& it : ci.second.issues) { - auto& entry = issueMap[it]; - entry.count++; - if (entry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) { - entry.examples.emplace_back(ci.first, ci.second.traceLogGroup); - } - } - if (ci.second.versions.size()) { - clientCount++; - StringRef maxProtocol; - for (auto& it : ci.second.versions) { - maxProtocol = std::max(maxProtocol, it.protocolVersion); - auto& entry = versionMap[it]; - entry.count++; - if (entry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) { - entry.examples.emplace_back(ci.first, ci.second.traceLogGroup); - } - } - auto& maxEntry = maxProtocolMap[maxProtocol]; - maxEntry.count++; - if (maxEntry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) { - maxEntry.examples.emplace_back(ci.first, ci.second.traceLogGroup); - } - } else { - auto& entry = versionMap[ClientVersionRef()]; - entry.count++; - if (entry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) { - entry.examples.emplace_back(ci.first, ci.second.traceLogGroup); - } - } - } + const auto& networkAddress = ci.first; + const auto& traceLogGroup = ci.second.traceLogGroup; - req.issues.reserve(issueMap.size()); - for (auto& it : issueMap) { - req.issues.push_back(ItemWithExamples(it.first, it.second.count, it.second.examples)); + for (auto& issue : ci.second.issues) { + tryInsertIntoSamples(req.issues[issue], networkAddress, traceLogGroup); + } + + if (!ci.second.versions.size()) { + tryInsertIntoSamples(req.supportedVersions[ClientVersionRef()], networkAddress, traceLogGroup); + continue; + } + + ++req.clientCount; 
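// The tryInsertIntoSamples helper above follows a simple pattern: count every occurrence,
// but retain at most CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT example clients per bucket.
// A minimal standalone sketch of that pattern, with a hypothetical ExampleSamples struct
// and a fixed cap standing in for the FDB types and knob (illustrative only, not part of
// this patch):
#include <set>
#include <string>
#include <utility>

struct ExampleSamples {
    int count = 0;                                         // total occurrences observed
    std::set<std::pair<std::string, std::string>> samples; // bounded set of example clients
};

void tryInsertExample(ExampleSamples& s,
                      const std::string& networkAddress,
                      const std::string& traceLogGroup,
                      size_t cap = 20) {
    ++s.count;                    // the count stays exact even when no example is stored
    if (s.samples.size() < cap) { // examples are capped, mirroring CLIENT_EXAMPLE_AMOUNT
        s.samples.insert({ networkAddress, traceLogGroup });
    }
}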
+ StringRef maxProtocol; + for (auto& it : ci.second.versions) { + maxProtocol = std::max(maxProtocol, it.protocolVersion); + tryInsertIntoSamples(req.supportedVersions[it], networkAddress, traceLogGroup); + } + tryInsertIntoSamples(req.maxProtocolSupported[maxProtocol], networkAddress, traceLogGroup); } - req.supportedVersions.reserve(versionMap.size()); - for (auto& it : versionMap) { - req.supportedVersions.push_back( - ItemWithExamples>(it.first, it.second.count, it.second.examples)); - } - req.maxProtocolSupported.reserve(maxProtocolMap.size()); - for (auto& it : maxProtocolMap) { - req.maxProtocolSupported.push_back(ItemWithExamples(it.first, it.second.count, it.second.examples)); - } - req.clientCount = clientCount; return req; } @@ -887,6 +862,7 @@ ACTOR Future monitorProxiesOneGeneration( for (const auto& c : cs.coords) { clientLeaderServers.push_back(ClientLeaderRegInterface(c)); } + ASSERT(clientLeaderServers.size() > 0); deterministicRandom()->randomShuffle(clientLeaderServers); @@ -906,7 +882,7 @@ ACTOR Future monitorProxiesOneGeneration( bool upToDate = wait(connRecord->upToDate(storedConnectionString)); if (upToDate) { incorrectTime = Optional(); - } else if (allConnectionsFailed) { + } else if (allConnectionsFailed && storedConnectionString.getNumberOfCoordinators() > 0) { // Failed to connect to all coordinators from the current connection string, // so it is not possible to get any new updates from the cluster. It can be that // all the coordinators have changed, but the client missed that, because it had @@ -920,7 +896,7 @@ ACTOR Future monitorProxiesOneGeneration( info.intermediateConnRecord = connRecord; return info; } else { - req.issues.push_back_deep(req.issues.arena(), LiteralStringRef("incorrect_cluster_file_contents")); + req.issues.push_back_deep(req.issues.arena(), "incorrect_cluster_file_contents"_sr); std::string connectionString = connRecord->getConnectionString().toString(); if (!incorrectTime.present()) { incorrectTime = now(); @@ -964,6 +940,7 @@ ACTOR Future monitorProxiesOneGeneration( .detail("OldConnStr", info.intermediateConnRecord->getConnectionString().toString()); info.intermediateConnRecord = connRecord->makeIntermediateRecord( ClusterConnectionString(rep.get().read().forward.get().toString())); + ASSERT(info.intermediateConnRecord->getConnectionString().getNumberOfCoordinators() > 0); return info; } if (connRecord != info.intermediateConnRecord) { @@ -987,8 +964,8 @@ ACTOR Future monitorProxiesOneGeneration( successIndex = index; allConnectionsFailed = false; } else { - TEST(rep.getError().code() == error_code_failed_to_progress); // Coordinator cant talk to cluster controller - TEST(rep.getError().code() == error_code_lookup_failed); // Coordinator hostname resolving failure + CODE_PROBE(rep.getError().code() == error_code_failed_to_progress, + "Coordinator cant talk to cluster controller"); TraceEvent("MonitorProxiesConnectFailed") .detail("Error", rep.getError().name()) .detail("Coordinator", clientLeaderServer.getAddressString()); @@ -1009,6 +986,7 @@ ACTOR Future monitorProxies( Key traceLogGroup) { state MonitorLeaderInfo info(connRecord->get()); loop { + ASSERT(connRecord->get().isValid()); choose { when(MonitorLeaderInfo _info = wait(monitorProxiesOneGeneration( connRecord->get(), clientInfo, coordinator, info, supportedVersions, traceLogGroup))) { diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index b08457a3ac..775cfa54d0 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ 
b/fdbclient/MultiVersionTransaction.actor.cpp @@ -257,13 +257,14 @@ ThreadFuture>> DLTransaction::getRangeSplitPoints(c }); } -ThreadFuture>> DLTransaction::getBlobGranuleRanges(const KeyRangeRef& keyRange) { +ThreadFuture>> DLTransaction::getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rangeLimit) { if (!api->transactionGetBlobGranuleRanges) { return unsupported_operation(); } FdbCApi::FDBFuture* f = api->transactionGetBlobGranuleRanges( - tr, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size()); + tr, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), rangeLimit); return toThreadFuture>>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { const FdbCApi::FDBKeyRange* keyRanges; int keyRangesLength; @@ -279,10 +280,46 @@ ThreadResult DLTransaction::readBlobGranules(const KeyRangeRef& key Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) { - if (!api->transactionReadBlobGranules) { + return unsupported_operation(); +} + +ThreadFuture>> DLTransaction::readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) { + if (!api->transactionReadBlobGranulesStart) { return unsupported_operation(); } + int64_t rv = readVersion.present() ? readVersion.get() : latestVersion; + + FdbCApi::FDBFuture* f = api->transactionReadBlobGranulesStart(tr, + keyRange.begin.begin(), + keyRange.begin.size(), + keyRange.end.begin(), + keyRange.end.size(), + beginVersion, + rv, + readVersionOut); + + return ThreadFuture>>( + (ThreadSingleAssignmentVar>>*)(f)); +}; + +ThreadResult DLTransaction::readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) { + if (!api->transactionReadBlobGranulesFinish) { + return unsupported_operation(); + } + + // convert back to fdb future for API + FdbCApi::FDBFuture* f = (FdbCApi::FDBFuture*)(startFuture.extractPtr()); + // FIXME: better way to convert here? FdbCApi::FDBReadBlobGranuleContext context; context.userContext = granuleContext.userContext; @@ -292,17 +329,40 @@ ThreadResult DLTransaction::readBlobGranules(const KeyRangeRef& key context.debugNoMaterialize = granuleContext.debugNoMaterialize; context.granuleParallelism = granuleContext.granuleParallelism; - int64_t rv = readVersion.present() ? readVersion.get() : latestVersion; + FdbCApi::FDBResult* r = api->transactionReadBlobGranulesFinish(tr, + f, + keyRange.begin.begin(), + keyRange.begin.size(), + keyRange.end.begin(), + keyRange.end.size(), + beginVersion, + readVersion, + &context); - FdbCApi::FDBResult* r = api->transactionReadBlobGranules(tr, - keyRange.begin.begin(), - keyRange.begin.size(), - keyRange.end.begin(), - keyRange.end.size(), - beginVersion, - rv, - context); return ThreadResult((ThreadSingleAssignmentVar*)(r)); +}; + +ThreadFuture>> +DLTransaction::summarizeBlobGranules(const KeyRangeRef& keyRange, Optional summaryVersion, int rangeLimit) { + if (!api->transactionSummarizeBlobGranules) { + return unsupported_operation(); + } + + int64_t sv = summaryVersion.present() ? 
summaryVersion.get() : latestVersion; + + FdbCApi::FDBFuture* f = api->transactionSummarizeBlobGranules( + tr, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), sv, rangeLimit); + + return toThreadFuture>>( + api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + const FdbCApi::FDBGranuleSummary* summaries; + int summariesLength; + FdbCApi::fdb_error_t error = api->futureGetGranuleSummaryArray(f, &summaries, &summariesLength); + ASSERT(!error); + // The memory for this is stored in the FDBFuture and is released when the future gets destroyed + return Standalone>( + VectorRef((BlobGranuleSummaryRef*)summaries, summariesLength), Arena()); + }); } void DLTransaction::addReadConflictRange(const KeyRangeRef& keys) { @@ -583,6 +643,73 @@ ThreadFuture DLDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { return Void(); }); } +ThreadFuture DLDatabase::blobbifyRange(const KeyRangeRef& keyRange) { + if (!api->databaseBlobbifyRange) { + return unsupported_operation(); + } + + FdbCApi::FDBFuture* f = api->databaseBlobbifyRange( + db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size()); + + return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + FdbCApi::fdb_bool_t ret = false; + ASSERT(!api->futureGetBool(f, &ret)); + return ret; + }); +} + +ThreadFuture DLDatabase::unblobbifyRange(const KeyRangeRef& keyRange) { + if (!api->databaseUnblobbifyRange) { + return unsupported_operation(); + } + + FdbCApi::FDBFuture* f = api->databaseUnblobbifyRange( + db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size()); + + return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + FdbCApi::fdb_bool_t ret = false; + ASSERT(!api->futureGetBool(f, &ret)); + return ret; + }); +} + +ThreadFuture>> DLDatabase::listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) { + if (!api->databaseListBlobbifiedRanges) { + return unsupported_operation(); + } + + FdbCApi::FDBFuture* f = api->databaseListBlobbifiedRanges( + db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), rangeLimit); + + return toThreadFuture>>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + const FdbCApi::FDBKeyRange* keyRanges; + int keyRangesLength; + FdbCApi::fdb_error_t error = api->futureGetKeyRangeArray(f, &keyRanges, &keyRangesLength); + ASSERT(!error); + // The memory for this is stored in the FDBFuture and is released when the future gets destroyed. + return Standalone>(VectorRef((KeyRangeRef*)keyRanges, keyRangesLength), + Arena()); + }); +} + +ThreadFuture DLDatabase::verifyBlobRange(const KeyRangeRef& keyRange, Optional version) { + if (!api->databaseVerifyBlobRange) { + return unsupported_operation(); + } + + Version readVersion = version.present() ? 
version.get() : latestVersion; + + FdbCApi::FDBFuture* f = api->databaseVerifyBlobRange( + db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), readVersion); + + return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + Version version = invalidVersion; + ASSERT(!api->futureGetInt64(f, &version)); + return version; + }); +} + // DLApi // Loads the specified function from a dynamic library @@ -626,6 +753,11 @@ void DLApi::init() { loadClientFunction(&api->selectApiVersion, lib, fdbCPath, "fdb_select_api_version_impl", headerVersion >= 0); loadClientFunction(&api->getClientVersion, lib, fdbCPath, "fdb_get_client_version", headerVersion >= 410); + loadClientFunction(&api->useFutureProtocolVersion, + lib, + fdbCPath, + "fdb_use_future_protocol_version", + headerVersion >= ApiVersion::withFutureProtocolVersionApi().version()); loadClientFunction(&api->setNetworkOption, lib, fdbCPath, "fdb_network_set_option", headerVersion >= 0); loadClientFunction(&api->setupNetwork, lib, fdbCPath, "fdb_setup_network", headerVersion >= 0); loadClientFunction(&api->runNetwork, lib, fdbCPath, "fdb_run_network", headerVersion >= 0); @@ -635,7 +767,7 @@ void DLApi::init() { lib, fdbCPath, "fdb_create_database_from_connection_string", - headerVersion >= 720); + headerVersion >= ApiVersion::withCreateDBFromConnString().version()); loadClientFunction(&api->databaseOpenTenant, lib, fdbCPath, "fdb_database_open_tenant", headerVersion >= 710); loadClientFunction( @@ -668,16 +800,39 @@ void DLApi::init() { fdbCPath, "fdb_database_wait_purge_granules_complete", headerVersion >= 710); + loadClientFunction(&api->databaseBlobbifyRange, + lib, + fdbCPath, + "fdb_database_blobbify_range", + headerVersion >= ApiVersion::withBlobRangeApi().version()); + loadClientFunction(&api->databaseUnblobbifyRange, + lib, + fdbCPath, + "fdb_database_unblobbify_range", + headerVersion >= ApiVersion::withBlobRangeApi().version()); + loadClientFunction(&api->databaseListBlobbifiedRanges, + lib, + fdbCPath, + "fdb_database_list_blobbified_ranges", + headerVersion >= ApiVersion::withBlobRangeApi().version()); + loadClientFunction(&api->databaseVerifyBlobRange, + lib, + fdbCPath, + "fdb_database_verify_blob_range", + headerVersion >= ApiVersion::withBlobRangeApi().version()); loadClientFunction( &api->tenantCreateTransaction, lib, fdbCPath, "fdb_tenant_create_transaction", headerVersion >= 710); - loadClientFunction( - &api->tenantPurgeBlobGranules, lib, fdbCPath, "fdb_tenant_purge_blob_granules", headerVersion >= 720); + loadClientFunction(&api->tenantPurgeBlobGranules, + lib, + fdbCPath, + "fdb_tenant_purge_blob_granules", + headerVersion >= ApiVersion::withBlobRangeApi().version()); loadClientFunction(&api->tenantWaitPurgeGranulesComplete, lib, fdbCPath, "fdb_tenant_wait_purge_granules_complete", - headerVersion >= 720); + headerVersion >= ApiVersion::withBlobRangeApi().version()); loadClientFunction(&api->tenantDestroy, lib, fdbCPath, "fdb_tenant_destroy", headerVersion >= 710); loadClientFunction(&api->transactionSetOption, lib, fdbCPath, "fdb_transaction_set_option", headerVersion >= 0); @@ -737,11 +892,31 @@ void DLApi::init() { headerVersion >= 710); loadClientFunction( &api->transactionReadBlobGranules, lib, fdbCPath, "fdb_transaction_read_blob_granules", headerVersion >= 710); + loadClientFunction(&api->transactionReadBlobGranulesStart, + lib, + fdbCPath, + "fdb_transaction_read_blob_granules_start", + headerVersion >= ApiVersion::withBlobRangeApi().version()); + 
loadClientFunction(&api->transactionReadBlobGranulesFinish, + lib, + fdbCPath, + "fdb_transaction_read_blob_granules_finish", + headerVersion >= ApiVersion::withBlobRangeApi().version()); + loadClientFunction(&api->transactionSummarizeBlobGranules, + lib, + fdbCPath, + "fdb_transaction_summarize_blob_granules", + headerVersion >= ApiVersion::withBlobRangeApi().version()); loadClientFunction(&api->futureGetInt64, lib, fdbCPath, headerVersion >= 620 ? "fdb_future_get_int64" : "fdb_future_get_version", headerVersion >= 0); + loadClientFunction(&api->futureGetBool, + lib, + fdbCPath, + "fdb_future_get_bool", + headerVersion >= ApiVersion::withFutureGetBool().version()); loadClientFunction(&api->futureGetUInt64, lib, fdbCPath, "fdb_future_get_uint64", headerVersion >= 700); loadClientFunction(&api->futureGetError, lib, fdbCPath, "fdb_future_get_error", headerVersion >= 0); loadClientFunction(&api->futureGetKey, lib, fdbCPath, "fdb_future_get_key", headerVersion >= 0); @@ -754,6 +929,11 @@ void DLApi::init() { &api->futureGetKeyValueArray, lib, fdbCPath, "fdb_future_get_keyvalue_array", headerVersion >= 0); loadClientFunction( &api->futureGetMappedKeyValueArray, lib, fdbCPath, "fdb_future_get_mappedkeyvalue_array", headerVersion >= 710); + loadClientFunction(&api->futureGetGranuleSummaryArray, + lib, + fdbCPath, + "fdb_future_get_granule_summary_array", + headerVersion >= ApiVersion::withBlobRangeApi().version()); loadClientFunction(&api->futureGetSharedState, lib, fdbCPath, "fdb_future_get_shared_state", headerVersion >= 710); loadClientFunction(&api->futureSetCallback, lib, fdbCPath, "fdb_future_set_callback", headerVersion >= 0); loadClientFunction(&api->futureCancel, lib, fdbCPath, "fdb_future_cancel", headerVersion >= 0); @@ -788,6 +968,14 @@ const char* DLApi::getClientVersion() { return api->getClientVersion(); } +void DLApi::useFutureProtocolVersion() { + if (!api->useFutureProtocolVersion) { + return; + } + + api->useFutureProtocolVersion(); +} + void DLApi::setNetworkOption(FDBNetworkOptions::Option option, Optional value) { throwIfError(api->setNetworkOption(static_cast(option), value.present() ? value.get().begin() : nullptr, @@ -1069,9 +1257,10 @@ ThreadFuture>> MultiVersionTransaction::getRangeSpl } ThreadFuture>> MultiVersionTransaction::getBlobGranuleRanges( - const KeyRangeRef& keyRange) { + const KeyRangeRef& keyRange, + int rangeLimit) { auto tr = getTransaction(); - auto f = tr.transaction ? tr.transaction->getBlobGranuleRanges(keyRange) + auto f = tr.transaction ? tr.transaction->getBlobGranuleRanges(keyRange, rangeLimit) : makeTimeout>>(); return abortableFuture(f, tr.onChange); } @@ -1080,14 +1269,55 @@ ThreadResult MultiVersionTransaction::readBlobGranules(const KeyRan Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) { + // FIXME: prevent from calling this from another main thread? 
auto tr = getTransaction(); if (tr.transaction) { - return tr.transaction->readBlobGranules(keyRange, beginVersion, readVersion, granuleContext); + Version readVersionOut; + auto f = tr.transaction->readBlobGranulesStart(keyRange, beginVersion, readVersion, &readVersionOut); + auto abortableF = abortableFuture(f, tr.onChange); + abortableF.blockUntilReadyCheckOnMainThread(); + if (abortableF.isError()) { + return ThreadResult(abortableF.getError()); + } + if (granuleContext.debugNoMaterialize) { + return ThreadResult(blob_granule_not_materialized()); + } + return tr.transaction->readBlobGranulesFinish( + abortableF, keyRange, beginVersion, readVersionOut, granuleContext); } else { return abortableTimeoutResult(tr.onChange); } } +ThreadFuture>> MultiVersionTransaction::readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) { + // can't call this directly + return ThreadFuture>>(unsupported_operation()); +} + +ThreadResult MultiVersionTransaction::readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) { + // can't call this directly + return ThreadResult(unsupported_operation()); +} + +ThreadFuture>> MultiVersionTransaction::summarizeBlobGranules( + const KeyRangeRef& keyRange, + Optional summaryVersion, + int rangeLimit) { + auto tr = getTransaction(); + auto f = tr.transaction ? tr.transaction->summarizeBlobGranules(keyRange, summaryVersion, rangeLimit) + : makeTimeout>>(); + return abortableFuture(f, tr.onChange); +} + void MultiVersionTransaction::atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) { auto tr = getTransaction(); if (tr.transaction) { @@ -1182,7 +1412,7 @@ void MultiVersionTransaction::setOption(FDBTransactionOptions::Option option, Op throw invalid_option(); } - if (MultiVersionApi::apiVersionAtLeast(610) && itr->second.persistent) { + if (MultiVersionApi::api->getApiVersion().hasPersistentOptions() && itr->second.persistent) { persistentOptions.emplace_back(option, value.castTo>()); } @@ -1579,6 +1809,32 @@ ThreadFuture MultiVersionDatabase::waitPurgeGranulesComplete(const KeyRef& return abortableFuture(f, dbState->dbVar->get().onChange); } +ThreadFuture MultiVersionDatabase::blobbifyRange(const KeyRangeRef& keyRange) { + auto dbVar = dbState->dbVar->get(); + auto f = dbVar.value ? dbVar.value->blobbifyRange(keyRange) : ThreadFuture(Never()); + return abortableFuture(f, dbVar.onChange); +} + +ThreadFuture MultiVersionDatabase::unblobbifyRange(const KeyRangeRef& keyRange) { + auto dbVar = dbState->dbVar->get(); + auto f = dbVar.value ? dbVar.value->unblobbifyRange(keyRange) : ThreadFuture(Never()); + return abortableFuture(f, dbVar.onChange); +} + +ThreadFuture>> MultiVersionDatabase::listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) { + auto dbVar = dbState->dbVar->get(); + auto f = dbVar.value ? dbVar.value->listBlobbifiedRanges(keyRange, rangeLimit) + : ThreadFuture>>(Never()); + return abortableFuture(f, dbVar.onChange); +} + +ThreadFuture MultiVersionDatabase::verifyBlobRange(const KeyRangeRef& keyRange, Optional version) { + auto dbVar = dbState->dbVar->get(); + auto f = dbVar.value ? 
dbVar.value->verifyBlobRange(keyRange, version) : ThreadFuture(Never()); + return abortableFuture(f, dbVar.onChange); +} + // Returns the protocol version reported by the coordinator this client is connected to // If an expected version is given, the future won't return until the protocol version is different than expected // Note: this will never return if the server is running a protocol from FDB 5.0 or older @@ -1644,7 +1900,7 @@ ThreadFuture MultiVersionDatabase::DatabaseState::monitorProtocolVersion() } ProtocolVersion clusterVersion = - !cv.isError() ? cv.get() : self->dbProtocolVersion.orDefault(currentProtocolVersion); + !cv.isError() ? cv.get() : self->dbProtocolVersion.orDefault(currentProtocolVersion()); onMainThreadVoid([self, clusterVersion]() { self->protocolVersionChanged(clusterVersion); }); return ErrorOr(Void()); }); @@ -1674,7 +1930,7 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion .detail("OldProtocolVersion", dbProtocolVersion); // When the protocol version changes, clear the corresponding entry in the shared state map // so it can be re-initialized. Only do so if there was a valid previous protocol version. - if (dbProtocolVersion.present() && MultiVersionApi::apiVersionAtLeast(710)) { + if (dbProtocolVersion.present() && MultiVersionApi::api->getApiVersion().hasClusterSharedStateMap()) { MultiVersionApi::api->clearClusterSharedStateMapEntry(clusterId, dbProtocolVersion.get()); } @@ -1703,7 +1959,7 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion return; } - if (client->external && !MultiVersionApi::apiVersionAtLeast(610)) { + if (client->external && !MultiVersionApi::api->getApiVersion().hasInlineUpdateDatabase()) { // Old API versions return a future when creating the database, so we need to wait for it Reference self = Reference::addRef(this); dbReady = mapThreadFuture( @@ -1787,7 +2043,8 @@ void MultiVersionDatabase::DatabaseState::updateDatabase(Reference ne .detail("ConnectionRecord", connectionRecord); } } - if (db.isValid() && dbProtocolVersion.present() && MultiVersionApi::apiVersionAtLeast(710)) { + if (db.isValid() && dbProtocolVersion.present() && + MultiVersionApi::api->getApiVersion().hasClusterSharedStateMap()) { Future updateResult = MultiVersionApi::api->updateClusterSharedStateMap(connectionRecord, dbProtocolVersion.get(), db); sharedStateUpdater = map(errorOr(updateResult), [this](ErrorOr result) { @@ -1907,11 +2164,6 @@ void MultiVersionDatabase::LegacyVersionMonitor::close() { } // MultiVersionApi -bool MultiVersionApi::apiVersionAtLeast(int minVersion) { - ASSERT_NE(MultiVersionApi::api->apiVersion, 0); - return MultiVersionApi::api->apiVersion >= minVersion || MultiVersionApi::api->apiVersion < 0; -} - void MultiVersionApi::runOnExternalClientsAllThreads(std::function)> func, bool runOnFailedClients) { for (int i = 0; i < threadCount; i++) { @@ -1957,23 +2209,28 @@ Reference MultiVersionApi::getLocalClient() { } void MultiVersionApi::selectApiVersion(int apiVersion) { + ApiVersion newApiVersion(apiVersion); if (!localClient) { localClient = makeReference(getLocalClientAPI()); ASSERT(localClient); } - if (this->apiVersion != 0 && this->apiVersion != apiVersion) { + if (this->apiVersion.isValid() && this->apiVersion != newApiVersion) { throw api_version_already_set(); } localClient->api->selectApiVersion(apiVersion); - this->apiVersion = apiVersion; + this->apiVersion = newApiVersion; } const char* MultiVersionApi::getClientVersion() { return 
localClient->api->getClientVersion(); } +void MultiVersionApi::useFutureProtocolVersion() { + localClient->api->useFutureProtocolVersion(); +} + namespace { void validateOption(Optional value, bool canBePresent, bool canBeAbsent, bool canBeEmpty = true) { @@ -1991,7 +2248,7 @@ void validateOption(Optional value, bool canBePresent, bool canBeAbse void MultiVersionApi::disableMultiVersionClientApi() { MutexHolder holder(lock); - if (networkStartSetup || localClientDisabled) { + if (networkStartSetup || localClientDisabled || disableBypass) { throw invalid_option(); } @@ -2006,7 +2263,7 @@ void MultiVersionApi::setCallbacksOnExternalThreads() { callbackOnMainThread = false; } -void MultiVersionApi::addExternalLibrary(std::string path) { +void MultiVersionApi::addExternalLibrary(std::string path, bool useFutureVersion) { std::string filename = basename(path); if (filename.empty() || !fileExists(path)) { @@ -2023,8 +2280,8 @@ void MultiVersionApi::addExternalLibrary(std::string path) { threadCount = std::max(threadCount, 1); if (externalClientDescriptions.count(filename) == 0) { - TraceEvent("AddingExternalClient").detail("LibraryPath", filename); - externalClientDescriptions.emplace(std::make_pair(filename, ClientDesc(path, true))); + TraceEvent("AddingExternalClient").detail("LibraryPath", filename).detail("UseFutureVersion", useFutureVersion); + externalClientDescriptions.emplace(std::make_pair(filename, ClientDesc(path, true, useFutureVersion))); } } @@ -2044,7 +2301,7 @@ void MultiVersionApi::addExternalLibraryDirectory(std::string path) { std::string lib = abspath(joinPath(path, filename)); if (externalClientDescriptions.count(filename) == 0) { TraceEvent("AddingExternalClient").detail("LibraryPath", filename); - externalClientDescriptions.emplace(std::make_pair(filename, ClientDesc(lib, true))); + externalClientDescriptions.emplace(std::make_pair(filename, ClientDesc(lib, true, false))); } } } @@ -2182,7 +2439,7 @@ void MultiVersionApi::setNetworkOptionInternal(FDBNetworkOptions::Option option, setCallbacksOnExternalThreads(); } else if (option == FDBNetworkOptions::EXTERNAL_CLIENT_LIBRARY) { validateOption(value, true, false, false); - addExternalLibrary(abspath(value.get().toString())); + addExternalLibrary(abspath(value.get().toString()), false); } else if (option == FDBNetworkOptions::EXTERNAL_CLIENT_DIRECTORY) { validateOption(value, true, false, false); addExternalLibraryDirectory(value.get().toString()); @@ -2198,6 +2455,13 @@ void MultiVersionApi::setNetworkOptionInternal(FDBNetworkOptions::Option option, externalClient = true; bypassMultiClientApi = true; forwardOption = true; + } else if (option == FDBNetworkOptions::DISABLE_CLIENT_BYPASS) { + MutexHolder holder(lock); + ASSERT(!networkStartSetup); + if (bypassMultiClientApi) { + throw invalid_option(); + } + disableBypass = true; } else if (option == FDBNetworkOptions::CLIENT_THREADS_PER_VERSION) { MutexHolder holder(lock); validateOption(value, true, false, false); @@ -2213,6 +2477,21 @@ void MultiVersionApi::setNetworkOptionInternal(FDBNetworkOptions::Option option, } else if (option == FDBNetworkOptions::CLIENT_TMP_DIR) { validateOption(value, true, false, false); tmpDir = abspath(value.get().toString()); + } else if (option == FDBNetworkOptions::FUTURE_VERSION_CLIENT_LIBRARY) { + validateOption(value, true, false, false); + addExternalLibrary(abspath(value.get().toString()), true); + } else if (option == FDBNetworkOptions::TRACE_FILE_IDENTIFIER) { + validateOption(value, true, false, true); + traceFileIdentifier = 
value.get().toString(); + { + MutexHolder holder(lock); + // Forward the option unmodified only to the the local client and let it validate it. + // While for external clients the trace file identifiers are determined in setupNetwork + localClient->api->setNetworkOption(option, value); + } + } else if (option == FDBNetworkOptions::TRACE_SHARE_AMONG_CLIENT_THREADS) { + validateOption(value, false, true); + traceShareBaseNameAmongThreads = true; } else { forwardOption = true; } @@ -2251,13 +2530,18 @@ void MultiVersionApi::setupNetwork() { for (auto i : externalClientDescriptions) { std::string path = i.second.libPath; std::string filename = basename(path); + bool useFutureVersion = i.second.useFutureVersion; // Copy external lib for each thread if (externalClients.count(filename) == 0) { externalClients[filename] = {}; - for (const auto& tmp : copyExternalLibraryPerThread(path)) { + auto libCopies = copyExternalLibraryPerThread(path); + for (int idx = 0; idx < libCopies.size(); ++idx) { externalClients[filename].push_back(Reference( - new ClientInfo(new DLApi(tmp.first, tmp.second /*unlink on load*/), path))); + new ClientInfo(new DLApi(libCopies[idx].first, libCopies[idx].second /*unlink on load*/), + path, + useFutureVersion, + idx))); } } } @@ -2276,7 +2560,7 @@ void MultiVersionApi::setupNetwork() { networkStartSetup = true; - if (externalClients.empty()) { + if (externalClients.empty() && !disableBypass) { bypassMultiClientApi = true; // SOMEDAY: we won't be able to set this option once it becomes possible to add // clients after setupNetwork is called } @@ -2296,17 +2580,30 @@ void MultiVersionApi::setupNetwork() { if (!bypassMultiClientApi) { runOnExternalClientsAllThreads([this](Reference client) { TraceEvent("InitializingExternalClient").detail("LibraryPath", client->libPath); - client->api->selectApiVersion(apiVersion); + client->api->selectApiVersion(apiVersion.version()); + if (client->useFutureVersion) { + client->api->useFutureProtocolVersion(); + } client->loadVersion(); }); + std::string baseTraceFileId; + if (apiVersion.hasTraceFileIdentifier()) { + // TRACE_FILE_IDENTIFIER option is supported since 6.3 + baseTraceFileId = traceFileIdentifier.empty() ? format("%d", getpid()) : traceFileIdentifier; + } + MutexHolder holder(lock); - runOnExternalClientsAllThreads([this, transportId](Reference client) { + runOnExternalClientsAllThreads([this, transportId, baseTraceFileId](Reference client) { for (auto option : options) { client->api->setNetworkOption(option.first, option.second.castTo()); } client->api->setNetworkOption(FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID, std::to_string(transportId)); - + if (!baseTraceFileId.empty()) { + client->api->setNetworkOption( + FDBNetworkOptions::TRACE_FILE_IDENTIFIER, + traceShareBaseNameAmongThreads ? 
baseTraceFileId : client->getTraceFileIdentifier(baseTraceFileId)); + } client->api->setupNetwork(); }); @@ -2345,21 +2642,17 @@ void MultiVersionApi::runNetwork() { std::vector handles; if (!bypassMultiClientApi) { - for (int threadNum = 0; threadNum < threadCount; threadNum++) { - runOnExternalClients(threadNum, [&handles, threadNum](Reference client) { - if (client->external) { - std::string threadName = format("fdb-%s-%d", client->releaseVersion.c_str(), threadNum); - if (threadName.size() > 15) { - threadName = format("fdb-%s", client->releaseVersion.c_str()); - if (threadName.size() > 15) { - threadName = "fdb-external"; - } - } - handles.push_back( - g_network->startThread(&runNetworkThread, client.getPtr(), 0, threadName.c_str())); + runOnExternalClientsAllThreads([&handles](Reference client) { + ASSERT(client->external); + std::string threadName = format("fdb-%s-%d", client->releaseVersion.c_str(), client->threadIndex); + if (threadName.size() > 15) { + threadName = format("fdb-%s", client->releaseVersion.c_str()); + if (threadName.size() > 15) { + threadName = "fdb-external"; } - }); - } + } + handles.push_back(g_network->startThread(&runNetworkThread, client.getPtr(), 0, threadName.c_str())); + }); } localClient->api->runNetwork(); @@ -2472,9 +2765,9 @@ ACTOR Future updateClusterSharedStateMapImpl(MultiVersionApi* self, ProtocolVersion dbProtocolVersion, Reference db) { // The cluster ID will be the connection record string (either a filename or the connection string itself) - // in API versions before we could read the cluster ID. + // in versions before we could read the cluster ID. state std::string clusterId = connectionRecord.toString(); - if (MultiVersionApi::apiVersionAtLeast(720)) { + if (dbProtocolVersion.hasClusterIdSpecialKey()) { state Reference tr = db->createTransaction(); loop { try { @@ -2648,8 +2941,8 @@ void MultiVersionApi::loadEnvironmentVariableNetworkOptions() { MultiVersionApi::MultiVersionApi() : callbackOnMainThread(true), localClientDisabled(false), networkStartSetup(false), networkSetup(false), - bypassMultiClientApi(false), externalClient(false), apiVersion(0), threadCount(0), tmpDir("/tmp"), - envOptionsLoaded(false) {} + disableBypass(false), bypassMultiClientApi(false), externalClient(false), apiVersion(0), threadCount(0), + tmpDir("/tmp"), traceShareBaseNameAmongThreads(false), envOptionsLoaded(false) {} MultiVersionApi* MultiVersionApi::api = new MultiVersionApi(); @@ -2686,6 +2979,12 @@ bool ClientInfo::canReplace(Reference other) const { return !protocolVersion.isCompatible(other->protocolVersion); } +std::string ClientInfo::getTraceFileIdentifier(const std::string& baseIdentifier) { + std::string versionStr = releaseVersion; + std::replace(versionStr.begin(), versionStr.end(), '.', '_'); + return format("%s_v%st%d", baseIdentifier.c_str(), versionStr.c_str(), threadIndex); +} + // UNIT TESTS TEST_CASE("/fdbclient/multiversionclient/EnvironmentVariableParsing") { auto vals = parseOptionValues("a"); @@ -2971,7 +3270,7 @@ struct AbortableTest { } }; -TEST_CASE("/fdbclient/multiversionclient/AbortableSingleAssignmentVar") { +TEST_CASE("fdbclient/multiversionclient/AbortableSingleAssignmentVar") { state volatile bool done = false; state THREAD_HANDLE thread = g_network->startThread(runSingleAssignmentVarTest, (void*)&done); @@ -3048,7 +3347,7 @@ struct DLTest { } }; -TEST_CASE("/fdbclient/multiversionclient/DLSingleAssignmentVar") { +TEST_CASE("fdbclient/multiversionclient/DLSingleAssignmentVar") { state volatile bool done = false; 
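// The thread-naming fallback in runNetwork() above keeps worker thread names at or under
// 15 characters (the limit for pthread thread names on Linux): it tries fdb-<version>-<index>,
// then fdb-<version>, then a fixed short name. A standalone sketch of that selection
// (chooseThreadName is a hypothetical helper, not part of this patch):
#include <string>

std::string chooseThreadName(const std::string& releaseVersion, int threadIndex) {
    std::string name = "fdb-" + releaseVersion + "-" + std::to_string(threadIndex);
    if (name.size() > 15) {
        name = "fdb-" + releaseVersion; // drop the thread index first
        if (name.size() > 15) {
            name = "fdb-external";      // last resort: a fixed, always-short name
        }
    }
    return name;
}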
MultiVersionApi::api->callbackOnMainThread = true; @@ -3092,7 +3391,7 @@ struct MapTest { } }; -TEST_CASE("/fdbclient/multiversionclient/MapSingleAssignmentVar") { +TEST_CASE("fdbclient/multiversionclient/MapSingleAssignmentVar") { state volatile bool done = false; state THREAD_HANDLE thread = g_network->startThread(runSingleAssignmentVarTest, (void*)&done); @@ -3131,7 +3430,7 @@ struct FlatMapTest { } }; -TEST_CASE("/fdbclient/multiversionclient/FlatMapSingleAssignmentVar") { +TEST_CASE("fdbclient/multiversionclient/FlatMapSingleAssignmentVar") { state volatile bool done = false; state THREAD_HANDLE thread = g_network->startThread(runSingleAssignmentVarTest, (void*)&done); diff --git a/fdbclient/MutationLogReader.actor.cpp b/fdbclient/MutationLogReader.actor.cpp index 5919fdc66b..d6e3adc8dd 100644 --- a/fdbclient/MutationLogReader.actor.cpp +++ b/fdbclient/MutationLogReader.actor.cpp @@ -67,7 +67,7 @@ ACTOR Future PipelinedReader::getNext_impl(PipelinedReader* self, Database state Transaction tr(cx); state GetRangeLimits limits(GetRangeLimits::ROW_LIMIT_UNLIMITED, - (g_network->isSimulated() && !g_simulator.speedUpSimulation) + (g_network->isSimulated() && !g_simulator->speedUpSimulation) ? CLIENT_KNOBS->BACKUP_SIMULATED_LIMIT_BYTES : CLIENT_KNOBS->BACKUP_GET_RANGE_LIMIT_BYTES); @@ -179,7 +179,7 @@ ACTOR Future> MutationLogReader::getNext_impl(Mutatio namespace { // UNIT TESTS TEST_CASE("/fdbclient/mutationlogreader/VersionKeyRefConversion") { - Key prefix = LiteralStringRef("foos"); + Key prefix = "foos"_sr; ASSERT(keyRefToVersion(versionToKey(0, prefix), prefix.size()) == 0); ASSERT(keyRefToVersion(versionToKey(1, prefix), prefix.size()) == 1); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 9c2e3cb153..c390020b64 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,7 @@ #include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/ClusterInterface.h" #include "fdbclient/ClusterConnectionFile.h" +#include "fdbclient/ClusterConnectionMemoryRecord.h" #include "fdbclient/CoordinationInterface.h" #include "fdbclient/DatabaseContext.h" #include "fdbclient/GlobalConfig.actor.h" @@ -102,6 +104,8 @@ #endif #include "flow/actorcompiler.h" // This must be the last #include. +FDB_DEFINE_BOOLEAN_PARAM(CacheResult); + extern const char* getSourceVersion(); namespace { @@ -152,8 +156,8 @@ NetworkOptions::NetworkOptions() supportedVersions(new ReferencedObject>>()), runLoopProfilingEnabled(false), primaryClient(true) {} -static const Key CLIENT_LATENCY_INFO_PREFIX = LiteralStringRef("client_latency/"); -static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = LiteralStringRef("client_latency_counter/"); +static const Key CLIENT_LATENCY_INFO_PREFIX = "client_latency/"_sr; +static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = "client_latency_counter/"_sr; void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi) { auto result = tssMapping.find(ssi.id()); @@ -167,14 +171,8 @@ void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageSe tssMetrics[tssi.id()] = metrics; tssMapping[ssi.id()] = tssi; } else { - if (result->second.id() == tssi.id()) { - metrics = tssMetrics[tssi.id()]; - } else { - TEST(true); // SS now maps to new TSS! 
This will probably never happen in practice - tssMetrics.erase(result->second.id()); - metrics = makeReference(); - tssMetrics[tssi.id()] = metrics; - } + ASSERT(result->second.id() == tssi.id()); + metrics = tssMetrics[tssi.id()]; result->second = tssi; } @@ -230,8 +228,9 @@ void DatabaseContext::getLatestCommitVersions(const Reference& loc VersionVector& latestCommitVersions) { latestCommitVersions.clear(); - if (info->debugID.present()) { - g_traceBatch.addEvent("TransactionDebug", info->debugID.get().first(), "NativeAPI.getLatestCommitVersions"); + if (info->readOptions.present() && info->readOptions.get().debugID.present()) { + g_traceBatch.addEvent( + "TransactionDebug", info->readOptions.get().debugID.get().first(), "NativeAPI.getLatestCommitVersions"); } if (!info->readVersionObtainedFromGrvProxy) { @@ -242,7 +241,7 @@ void DatabaseContext::getLatestCommitVersions(const Reference& loc return; } - if (ssVersionVectorCache.getMaxVersion() != invalidVersion && readVersion > ssVersionVectorCache.getMaxVersion()) { + if (readVersion > ssVersionVectorCache.getMaxVersion()) { if (!CLIENT_KNOBS->FORCE_GRV_CACHE_OFF && !info->options.skipGrvCache && info->options.useGrvCache) { return; } else { @@ -255,16 +254,32 @@ void DatabaseContext::getLatestCommitVersions(const Reference& loc std::map> versionMap; // order the versions to be returned for (int i = 0; i < locationInfo->locations()->size(); i++) { - UID uid = locationInfo->locations()->getId(i); - if (ssidTagMapping.find(uid) != ssidTagMapping.end()) { - Tag tag = ssidTagMapping[uid]; + bool updatedVersionMap = false; + Version commitVersion = invalidVersion; + Tag tag = invalidTag; + auto iter = ssidTagMapping.find(locationInfo->locations()->getId(i)); + if (iter != ssidTagMapping.end()) { + tag = iter->second; if (ssVersionVectorCache.hasVersion(tag)) { - Version commitVersion = ssVersionVectorCache.getVersion(tag); // latest commit version + commitVersion = ssVersionVectorCache.getVersion(tag); // latest commit version if (commitVersion < readVersion) { + updatedVersionMap = true; versionMap[commitVersion].insert(tag); } } } + // Do not log if commitVersion >= readVersion. + if (!updatedVersionMap && commitVersion == invalidVersion) { + TraceEvent(SevDebug, "CommitVersionNotFoundForSS") + .detail("InSSIDMap", iter != ssidTagMapping.end() ? 1 : 0) + .detail("Tag", tag) + .detail("CommitVersion", commitVersion) + .detail("ReadVersion", readVersion) + .detail("VersionVector", ssVersionVectorCache.toString()) + .setMaxEventLength(11000) + .setMaxFieldLength(10000); + ++transactionCommitVersionNotFoundForSS; + } } // insert the commit versions in the version vector. @@ -444,7 +459,7 @@ void DatabaseContext::validateVersion(Version version) const { throw client_invalid_operation(); } if (switchable && version < minAcceptableReadVersion) { - TEST(true); // Attempted to read a version lower than any this client has seen from the current cluster + CODE_PROBE(true, "Attempted to read a version lower than any this client has seen from the current cluster"); throw transaction_too_old(); } @@ -584,7 +599,8 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { loop { wait(delay(CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace)); - if (!g_network->isSimulated()) { + bool logTraces = !g_network->isSimulated() || BUGGIFY_WITH_PROB(0.01); + if (logTraces) { TraceEvent ev("TransactionMetrics", cx->dbId); ev.detail("Elapsed", (lastLogged == 0) ? 
0 : now() - lastLogged) @@ -637,6 +653,19 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { cx->bgLatencies.clear(); cx->bgGranulesPerRequest.clear(); + if (cx->usedAnyChangeFeeds && logTraces) { + TraceEvent feedEv("ChangeFeedClientMetrics", cx->dbId); + + feedEv.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged) + .detail("Cluster", + cx->getConnectionRecord() + ? cx->getConnectionRecord()->getConnectionString().clusterKeyName().toString() + : "") + .detail("Internal", cx->internal); + + cx->ccFeed.logToTraceEvent(feedEv); + } + lastLogged = now(); } } @@ -710,6 +739,7 @@ ACTOR static Future delExcessClntTxnEntriesActor(Transaction* tr, int64_t tr->clear(KeyRangeRef(txEntries[0].key, strinc(endKey))); TraceEvent(SevInfo, "DeletingExcessCntTxnEntries").detail("BytesToBeDeleted", numBytesToDel); int64_t bytesDel = -numBytesToDel; + tr->atomicOp(clientLatencyAtomicCtr, StringRef((uint8_t*)&bytesDel, 8), MutationRef::AddValue); wait(tr->commit()); } @@ -1114,8 +1144,8 @@ ACTOR static Future handleTssMismatches(DatabaseContext* cx) { state bool quarantine = CLIENT_KNOBS->QUARANTINE_TSS_ON_MISMATCH; TraceEvent(SevWarnAlways, quarantine ? "TSS_QuarantineMismatch" : "TSS_KillMismatch") .detail("TSSID", data.first.toString()); - TEST(quarantine); // Quarantining TSS because it got mismatch - TEST(!quarantine); // Killing TSS because it got mismatch + CODE_PROBE(quarantine, "Quarantining TSS because it got mismatch"); + CODE_PROBE(!quarantine, "Killing TSS because it got mismatch"); tr = makeReference(Database(Reference::addRef(cx))); state int tries = 0; @@ -1124,7 +1154,7 @@ ACTOR static Future handleTssMismatches(DatabaseContext* cx) { tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); if (quarantine) { - tr->set(tssQuarantineKeyFor(data.first), LiteralStringRef("")); + tr->set(tssQuarantineKeyFor(data.first), ""_sr); } else { tr->clear(serverTagKeyFor(data.first)); } @@ -1132,10 +1162,9 @@ ACTOR static Future handleTssMismatches(DatabaseContext* cx) { for (const DetailedTSSMismatch& d : data.second) { // -> mismatch data - tssMismatchDB.set( - tr, - Tuple().append(data.first.toString()).append(d.timestamp).append(d.mismatchId.toString()), - d.traceString); + tssMismatchDB.set(tr, + Tuple::makeTuple(data.first.toString(), d.timestamp, d.mismatchId.toString()), + d.traceString); } wait(tr->commit()); @@ -1154,7 +1183,7 @@ ACTOR static Future handleTssMismatches(DatabaseContext* cx) { // clear out txn so that the extra DatabaseContext ref gets decref'd and we can free cx tr = makeReference(); } else { - TEST(true); // Not handling TSS with mismatch because it's already gone + CODE_PROBE(true, "Not handling TSS with mismatch because it's already gone"); } } } @@ -1253,7 +1282,7 @@ void DatabaseContext::registerSpecialKeysImpl(SpecialKeySpace::MODULE module, std::unique_ptr&& impl, int deprecatedVersion) { // if deprecated, add the implementation when the api version is less than the deprecated version - if (deprecatedVersion == -1 || apiVersion < deprecatedVersion) { + if (deprecatedVersion == -1 || apiVersion.version() < deprecatedVersion) { specialKeySpace->registerKeyRange(module, type, impl->getKeyRange(), impl.get()); specialKeySpaceModules.push_back(std::move(impl)); } @@ -1262,32 +1291,6 @@ void DatabaseContext::registerSpecialKeysImpl(SpecialKeySpace::MODULE module, ACTOR Future getWorkerInterfaces(Reference clusterRecord); ACTOR Future> getJSON(Database db); -struct WorkerInterfacesSpecialKeyImpl : 
SpecialKeyRangeReadImpl { - Future getRange(ReadYourWritesTransaction* ryw, - KeyRangeRef kr, - GetRangeLimits limitsHint) const override { - if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { - Key prefix = Key(getKeyRange().begin); - return map(getWorkerInterfaces(ryw->getDatabase()->getConnectionRecord()), - [prefix = prefix, kr = KeyRange(kr)](const RangeResult& in) { - RangeResult result; - for (const auto& [k_, v] : in) { - auto k = k_.withPrefix(prefix); - if (kr.contains(k)) - result.push_back_deep(result.arena(), KeyValueRef(k, v)); - } - - std::sort(result.begin(), result.end(), KeyValueRef::OrderByKey{}); - return result; - }); - } else { - return RangeResult(); - } - } - - explicit WorkerInterfacesSpecialKeyImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {} -}; - struct SingleSpecialKeyImpl : SpecialKeyRangeReadImpl { Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, @@ -1302,12 +1305,17 @@ struct SingleSpecialKeyImpl : SpecialKeyRangeReadImpl { }); } - SingleSpecialKeyImpl(KeyRef k, const std::function>(ReadYourWritesTransaction*)>& f) - : SpecialKeyRangeReadImpl(singleKeyRange(k)), k(k), f(f) {} + SingleSpecialKeyImpl(KeyRef k, + const std::function>(ReadYourWritesTransaction*)>& f, + bool supportsTenants = false) + : SpecialKeyRangeReadImpl(singleKeyRange(k)), k(k), f(f), tenantSupport(supportsTenants) {} + + bool supportsTenants() const override { return tenantSupport; }; private: Key k; std::function>(ReadYourWritesTransaction*)> f; + bool tenantSupport; }; class HealthMetricsRangeImpl : public SpecialKeyRangeAsyncImpl { @@ -1322,7 +1330,7 @@ static RangeResult healthMetricsToKVPairs(const HealthMetrics& metrics, KeyRange RangeResult result; if (CLIENT_BUGGIFY) return result; - if (kr.contains(LiteralStringRef("\xff\xff/metrics/health/aggregate")) && metrics.worstStorageDurabilityLag != 0) { + if (kr.contains("\xff\xff/metrics/health/aggregate"_sr) && metrics.worstStorageDurabilityLag != 0) { json_spirit::mObject statsObj; statsObj["batch_limited"] = metrics.batchLimited; statsObj["tps_limit"] = metrics.tpsLimit; @@ -1334,15 +1342,13 @@ static RangeResult healthMetricsToKVPairs(const HealthMetrics& metrics, KeyRange std::string statsString = json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8); ValueRef bytes(result.arena(), statsString); - result.push_back(result.arena(), KeyValueRef(LiteralStringRef("\xff\xff/metrics/health/aggregate"), bytes)); + result.push_back(result.arena(), KeyValueRef("\xff\xff/metrics/health/aggregate"_sr, bytes)); } // tlog stats { int phase = 0; // Avoid comparing twice per loop iteration for (const auto& [uid, logStats] : metrics.tLogQueue) { - StringRef k{ - StringRef(uid.toString()).withPrefix(LiteralStringRef("\xff\xff/metrics/health/log/"), result.arena()) - }; + StringRef k{ StringRef(uid.toString()).withPrefix("\xff\xff/metrics/health/log/"_sr, result.arena()) }; if (phase == 0 && k >= kr.begin) { phase = 1; } @@ -1364,8 +1370,7 @@ static RangeResult healthMetricsToKVPairs(const HealthMetrics& metrics, KeyRange { int phase = 0; // Avoid comparing twice per loop iteration for (const auto& [uid, storageStats] : metrics.storageStats) { - StringRef k{ StringRef(uid.toString()) - .withPrefix(LiteralStringRef("\xff\xff/metrics/health/storage/"), result.arena()) }; + StringRef k{ StringRef(uid.toString()).withPrefix("\xff\xff/metrics/health/storage/"_sr, result.arena()) }; if (phase == 0 && k >= kr.begin) { phase = 1; } @@ -1391,10 +1396,9 @@ static 
RangeResult healthMetricsToKVPairs(const HealthMetrics& metrics, KeyRange ACTOR static Future healthMetricsGetRangeActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) { HealthMetrics metrics = wait(ryw->getDatabase()->getHealthMetrics( - /*detailed ("per process")*/ kr.intersects(KeyRangeRef(LiteralStringRef("\xff\xff/metrics/health/storage/"), - LiteralStringRef("\xff\xff/metrics/health/storage0"))) || - kr.intersects(KeyRangeRef(LiteralStringRef("\xff\xff/metrics/health/log/"), - LiteralStringRef("\xff\xff/metrics/health/log0"))))); + /*detailed ("per process")*/ kr.intersects( + KeyRangeRef("\xff\xff/metrics/health/storage/"_sr, "\xff\xff/metrics/health/storage0"_sr)) || + kr.intersects(KeyRangeRef("\xff\xff/metrics/health/log/"_sr, "\xff\xff/metrics/health/log0"_sr)))); return healthMetricsToKVPairs(metrics, kr); } @@ -1432,7 +1436,7 @@ DatabaseContext::DatabaseContext(Reference defaultTenant) : lockAware(lockAware), switchable(switchable), connectionRecord(connectionRecord), proxyProvisional(false), @@ -1467,16 +1471,24 @@ DatabaseContext::DatabaseContext(ReferenceSHARD_STAT_SMOOTH_AMOUNT), + transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), bgReadInputBytes("BGReadInputBytes", cc), + bgReadOutputBytes("BGReadOutputBytes", cc), usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"), + feedStreamStarts("FeedStreamStarts", ccFeed), feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed), + feedErrors("FeedErrors", ccFeed), feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed), + feedPops("FeedPops", ccFeed), feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000), + commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000), + bgGranulesPerRequest(1000), outstandingWatches(0), sharedStatePtr(nullptr), lastGrvTime(0.0), cachedReadVersion(0), + lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0), + transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), + coordinator(coordinator), apiVersion(_apiVersion), mvCacheInsertLocation(0), healthMetricsLastUpdated(0), + detailedHealthMetricsLastUpdated(0), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), specialKeySpace(std::make_unique(specialKeys.begin, specialKeys.end, /* test */ false)), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) { + dbId = deterministicRandom()->randomUniqueID(); + + TraceEvent("DatabaseContextCreated", dbId).backtrace(); + connected = (clientInfo->get().commitProxies.size() && clientInfo->get().grvProxies.size()) ? Void() : clientInfo->onChange(); @@ -1484,7 +1496,7 @@ DatabaseContext::DatabaseContext(ReferenceMETADATA_VERSION_CACHE_SIZE); maxOutstandingWatches = CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES; - snapshotRywEnabled = apiVersionAtLeast(300) ? 1 : 0; + snapshotRywEnabled = apiVersion.hasSnapshotRYW() ? 1 : 0; logger = databaseLogger(this) && tssLogger(this); locationCacheSize = g_network->isSimulated() ? CLIENT_KNOBS->LOCATION_CACHE_EVICTION_SIZE_SIM @@ -1492,8 +1504,8 @@ DatabaseContext::DatabaseContext(ReferenceisSimulated() ? 
CLIENT_KNOBS->TENANT_CACHE_EVICTION_SIZE_SIM : CLIENT_KNOBS->TENANT_CACHE_EVICTION_SIZE; - getValueSubmitted.init(LiteralStringRef("NativeAPI.GetValueSubmitted")); - getValueCompleted.init(LiteralStringRef("NativeAPI.GetValueCompleted")); + getValueSubmitted.init("NativeAPI.GetValueSubmitted"_sr); + getValueCompleted.init("NativeAPI.GetValueCompleted"_sr); clientDBInfoMonitor = monitorClientDBInfoChange(this, clientInfo, &proxiesChangeTrigger); tssMismatchHandler = handleTssMismatches(this); @@ -1503,34 +1515,7 @@ DatabaseContext::DatabaseContext(ReferenceINIT_MID_SHARD_BYTES); globalConfig = std::make_unique(this); - if (apiVersionAtLeast(720)) { - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::CLUSTERID, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique( - LiteralStringRef("\xff\xff/cluster_id"), [](ReadYourWritesTransaction* ryw) -> Future> { - try { - if (ryw->getDatabase().getPtr()) { - return map(getClusterId(ryw->getDatabase()), - [](UID id) { return Optional(StringRef(id.toString())); }); - } - } catch (Error& e) { - return e; - } - return Optional(); - })); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique>(SpecialKeySpace::getManagementApiCommandRange("tenant"))); - } - if (apiVersionAtLeast(710) && !apiVersionAtLeast(720)) { - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::MANAGEMENT, - SpecialKeySpace::IMPLTYPE::READWRITE, - std::make_unique>(SpecialKeySpace::getManagementApiCommandRange("tenantmap"))); - } - if (apiVersionAtLeast(700)) { + if (apiVersion.version() >= 700) { registerSpecialKeysImpl(SpecialKeySpace::MODULE::ERRORMSG, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( @@ -1540,12 +1525,13 @@ DatabaseContext::DatabaseContext(Reference(ryw->getSpecialKeySpaceErrorMsg().get()); else return Optional(); - })); + }, + true)); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - KeyRangeRef(LiteralStringRef("options/"), LiteralStringRef("options0")) + KeyRangeRef("options/"_sr, "options0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, @@ -1567,31 +1553,31 @@ DatabaseContext::DatabaseContext(Reference( - KeyRangeRef(LiteralStringRef("in_progress_exclusion/"), LiteralStringRef("in_progress_exclusion0")) + KeyRangeRef("in_progress_exclusion/"_sr, "in_progress_exclusion0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::CONFIGURATION, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - KeyRangeRef(LiteralStringRef("process/class_type/"), LiteralStringRef("process/class_type0")) + KeyRangeRef("process/class_type/"_sr, "process/class_type0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::CONFIGURATION, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( - KeyRangeRef(LiteralStringRef("process/class_source/"), LiteralStringRef("process/class_source0")) + KeyRangeRef("process/class_source/"_sr, "process/class_source0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - singleKeyRange(LiteralStringRef("db_locked")) + 
singleKeyRange("db_locked"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - singleKeyRange(LiteralStringRef("consistency_check_suspended")) + singleKeyRange("consistency_check_suspended"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::GLOBALCONFIG, @@ -1605,44 +1591,44 @@ DatabaseContext::DatabaseContext(Reference( - KeyRangeRef(LiteralStringRef("coordinators/"), LiteralStringRef("coordinators0")) + KeyRangeRef("coordinators/"_sr, "coordinators0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( - singleKeyRange(LiteralStringRef("auto_coordinators")) + singleKeyRange("auto_coordinators"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - singleKeyRange(LiteralStringRef("min_required_commit_version")) + singleKeyRange("min_required_commit_version"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - singleKeyRange(LiteralStringRef("version_epoch")) + singleKeyRange("version_epoch"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0")) + KeyRangeRef("profiling/"_sr, "profiling0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)), /* deprecated */ 720); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - KeyRangeRef(LiteralStringRef("maintenance/"), LiteralStringRef("maintenance0")) + KeyRangeRef("maintenance/"_sr, "maintenance0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( - KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0")) + KeyRangeRef("data_distribution/"_sr, "data_distribution0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::ACTORLINEAGE, @@ -1653,7 +1639,7 @@ DatabaseContext::DatabaseContext(Reference( SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF))); } - if (apiVersionAtLeast(630)) { + if (apiVersion.version() >= 630) { registerSpecialKeysImpl(SpecialKeySpace::MODULE::TRANSACTION, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique(conflictingKeysRange)); @@ -1666,20 +1652,18 @@ DatabaseContext::DatabaseContext(Reference(ddStatsRange)); - registerSpecialKeysImpl( - SpecialKeySpace::MODULE::METRICS, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique(KeyRangeRef(LiteralStringRef("\xff\xff/metrics/health/"), - LiteralStringRef("\xff\xff/metrics/health0")))); - 
registerSpecialKeysImpl( - SpecialKeySpace::MODULE::WORKERINTERFACE, - SpecialKeySpace::IMPLTYPE::READONLY, - std::make_unique(KeyRangeRef( - LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0")))); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::METRICS, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + KeyRangeRef("\xff\xff/metrics/health/"_sr, "\xff\xff/metrics/health0"_sr))); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::WORKERINTERFACE, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + KeyRangeRef("\xff\xff/worker_interfaces/"_sr, "\xff\xff/worker_interfaces0"_sr))); registerSpecialKeysImpl(SpecialKeySpace::MODULE::STATUSJSON, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( - LiteralStringRef("\xff\xff/status/json"), + "\xff\xff/status/json"_sr, [](ReadYourWritesTransaction* ryw) -> Future> { if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { ++ryw->getDatabase()->transactionStatusRequests; @@ -1687,11 +1671,12 @@ DatabaseContext::DatabaseContext(Reference(); } - })); + }, + true)); registerSpecialKeysImpl(SpecialKeySpace::MODULE::CLUSTERFILEPATH, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( - LiteralStringRef("\xff\xff/cluster_file_path"), + "\xff\xff/cluster_file_path"_sr, [](ReadYourWritesTransaction* ryw) -> Future> { try { if (ryw->getDatabase().getPtr() && @@ -1704,13 +1689,14 @@ DatabaseContext::DatabaseContext(Reference(); - })); + }, + true)); registerSpecialKeysImpl( SpecialKeySpace::MODULE::CONNECTIONSTRING, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( - LiteralStringRef("\xff\xff/connection_string"), + "\xff\xff/connection_string"_sr, [](ReadYourWritesTransaction* ryw) -> Future> { try { if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { @@ -1722,7 +1708,30 @@ DatabaseContext::DatabaseContext(Reference(); - })); + }, + true)); + registerSpecialKeysImpl(SpecialKeySpace::MODULE::CLUSTERID, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique( + "\xff\xff/cluster_id"_sr, + [](ReadYourWritesTransaction* ryw) -> Future> { + try { + if (ryw->getDatabase().getPtr()) { + return map(getClusterId(ryw->getDatabase()), [](UID id) { + return Optional(StringRef(id.toString())); + }); + } + } catch (Error& e) { + return e; + } + return Optional(); + }, + true)); + + registerSpecialKeysImpl( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique(SpecialKeySpace::getManagementApiCommandRange("tenant"))); } throttleExpirer = recurring([this]() { expireThrottles(); }, CLIENT_KNOBS->TAG_THROTTLE_EXPIRATION_INTERVAL); @@ -1762,8 +1771,13 @@ DatabaseContext::DatabaseContext(const Error& err) transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc), transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc), transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc), - latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), - bytesPerCommit(1000), bgLatencies(1000), bgGranulesPerRequest(1000), transactionTracingSample(false), + transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), bgReadInputBytes("BGReadInputBytes", cc), + bgReadOutputBytes("BGReadOutputBytes", cc), usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"), + feedStreamStarts("FeedStreamStarts", ccFeed), feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed), 
+ feedErrors("FeedErrors", ccFeed), feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed), + feedPops("FeedPops", ccFeed), feedPopsFallback("FeedPopsFallback", ccFeed), latencies(1000), readLatencies(1000), + commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000), + bgGranulesPerRequest(1000), sharedStatePtr(nullptr), transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {} @@ -1805,9 +1819,17 @@ DatabaseContext::~DatabaseContext() { it->second->notifyContextDestroyed(); ASSERT_ABORT(server_interf.empty()); locationCache.insert(allKeys, Reference()); + for (auto& it : notAtLatestChangeFeeds) { + it.second->context = nullptr; + } + for (auto& it : changeFeedUpdaters) { + it.second->context = nullptr; + } + + TraceEvent("DatabaseContextDestructed", dbId).backtrace(); } -Optional DatabaseContext::getCachedLocation(const Optional& tenantName, +Optional DatabaseContext::getCachedLocation(const Optional& tenantName, const KeyRef& key, Reverse isBackward) { TenantMapEntry tenantEntry; @@ -1833,7 +1855,7 @@ Optional DatabaseContext::getCachedLocation(const Optional return Optional(); } -bool DatabaseContext::getCachedLocations(const Optional& tenantName, +bool DatabaseContext::getCachedLocations(const Optional& tenantName, const KeyRangeRef& range, std::vector& result, int limit, @@ -1860,7 +1882,7 @@ bool DatabaseContext::getCachedLocations(const Optional& tenantName, loop { auto r = reverse ? end : begin; if (!r->value()) { - TEST(result.size()); // had some but not all cached locations + CODE_PROBE(result.size(), "had some but not all cached locations"); result.clear(); return false; } @@ -1890,7 +1912,7 @@ void DatabaseContext::cacheTenant(const TenantName& tenant, const TenantMapEntry } } -Reference DatabaseContext::setCachedLocation(const Optional& tenant, +Reference DatabaseContext::setCachedLocation(const Optional& tenant, const TenantMapEntry& tenantEntry, const KeyRangeRef& absoluteKeys, const std::vector& servers) { @@ -1907,7 +1929,7 @@ Reference DatabaseContext::setCachedLocation(const Optional(serverRefs); while (locationCache.size() > locationCacheSize && attempts < maxEvictionAttempts) { - TEST(true); // NativeAPI storage server locationCache entry evicted + CODE_PROBE(true, "NativeAPI storage server locationCache entry evicted"); attempts++; auto r = locationCache.randomRange(); Key begin = r.begin(), end = r.end(); // insert invalidates r, so can't be passed a mere reference into it @@ -2091,7 +2113,7 @@ Future DatabaseContext::onConnected() { ACTOR static Future switchConnectionRecordImpl(Reference connRecord, DatabaseContext* self) { - TEST(true); // Switch connection file + CODE_PROBE(true, "Switch connection file"); TraceEvent("SwitchConnectionRecord") .detail("ClusterFile", connRecord->toString()) .detail("ConnectionString", connRecord->getConnectionString().toString()); @@ -2152,7 +2174,7 @@ void DatabaseContext::expireThrottles() { for (auto& priorityItr : throttledTags) { for (auto tagItr = priorityItr.second.begin(); tagItr != priorityItr.second.end();) { if (tagItr->second.expired()) { - TEST(true); // Expiring client throttle + CODE_PROBE(true, "Expiring client throttle"); tagItr = priorityItr.second.erase(tagItr); } else { ++tagItr; @@ -2285,6 +2307,13 @@ Database Database::createDatabase(std::string connFileName, return Database::createDatabase(rccr, apiVersion, internal, 
clientLocality); } +Database Database::createSimulatedExtraDatabase(std::string connectionString, Optional defaultTenant) { + auto extraFile = makeReference(ClusterConnectionString(connectionString)); + Database db = Database::createDatabase(extraFile, ApiVersion::LATEST_VERSION); + db->defaultTenant = defaultTenant; + return db; +} + Reference DatabaseContext::getWatchMetadata(int64_t tenantId, KeyRef key) const { const auto it = watchMap.find(std::make_pair(tenantId, key)); if (it == watchMap.end()) @@ -2458,7 +2487,7 @@ void setNetworkOption(FDBNetworkOptions::Option option, Optional valu ASSERT(value.present()); Standalone> supportedVersions; - std::vector supportedVersionsStrings = value.get().splitAny(LiteralStringRef(";")); + std::vector supportedVersionsStrings = value.get().splitAny(";"_sr); for (StringRef versionString : supportedVersionsStrings) { #ifdef ADDRESS_SANITIZER __lsan_disable(); @@ -2638,7 +2667,7 @@ bool DatabaseContext::isCurrentGrvProxy(UID proxyId) const { if (proxy.id() == proxyId) return true; } - TEST(true); // stale GRV proxy detected + CODE_PROBE(true, "stale GRV proxy detected"); return false; } @@ -2831,7 +2860,7 @@ void updateTagMappings(Database cx, const GetKeyServerLocationsReply& reply) { // If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key). // Otherwise returns the shard containing key ACTOR Future getKeyLocation_internal(Database cx, - Optional tenant, + TenantInfo tenant, Key key, SpanContext spanContext, Optional debugID, @@ -2854,27 +2883,22 @@ ACTOR Future getKeyLocation_internal(Database cx, ++cx->transactionKeyServerLocationRequests; choose { when(wait(cx->onProxiesChanged())) {} - when(GetKeyServerLocationsReply rep = - wait(basicLoadBalance(cx->getCommitProxies(useProvisionalProxies), - &CommitProxyInterface::getKeyServersLocations, - GetKeyServerLocationsRequest(span.context, - tenant.castTo(), - key, - Optional(), - 100, - isBackward, - version, - key.arena()), - TaskPriority::DefaultPromiseEndpoint))) { + when(GetKeyServerLocationsReply rep = wait(basicLoadBalance( + cx->getCommitProxies(useProvisionalProxies), + &CommitProxyInterface::getKeyServersLocations, + GetKeyServerLocationsRequest( + span.context, tenant, key, Optional(), 100, isBackward, version, key.arena()), + TaskPriority::DefaultPromiseEndpoint))) { ++cx->transactionKeyServerLocationRequestsCompleted; if (debugID.present()) g_traceBatch.addEvent( "TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocation.After"); ASSERT(rep.results.size() == 1); - auto locationInfo = - cx->setCachedLocation(tenant, rep.tenantEntry, rep.results[0].first, rep.results[0].second); + auto locationInfo = cx->setCachedLocation( + tenant.name, rep.tenantEntry, rep.results[0].first, rep.results[0].second); updateTssMappings(cx, rep); + updateTagMappings(cx, rep); return KeyRangeLocationInfo( rep.tenantEntry, @@ -2885,8 +2909,8 @@ ACTOR Future getKeyLocation_internal(Database cx, } } catch (Error& e) { if (e.code() == error_code_tenant_not_found) { - ASSERT(tenant.present()); - cx->invalidateCachedTenant(tenant.get()); + ASSERT(tenant.name.present()); + cx->invalidateCachedTenant(tenant.name.get()); } throw; @@ -2924,7 +2948,7 @@ bool checkOnlyEndpointFailed(const Database& cx, const Endpoint& endpoint) { template Future getKeyLocation(Database const& cx, - Optional const& tenant, + TenantInfo const& tenant, Key const& key, F StorageServerInterface::*member, SpanContext spanContext, @@ -2933,7 +2957,7 @@ Future 
getKeyLocation(Database const& cx, Reverse isBackward, Version version) { // we first check whether this range is cached - Optional locationInfo = cx->getCachedLocation(tenant, key, isBackward); + Optional locationInfo = cx->getCachedLocation(tenant.name, key, isBackward); if (!locationInfo.present()) { return getKeyLocation_internal( cx, tenant, key, spanContext, debugID, useProvisionalProxies, isBackward, version); @@ -2965,18 +2989,18 @@ Future getKeyLocation(Reference trState, UseTenant useTenant, Version version) { auto f = getKeyLocation(trState->cx, - useTenant ? trState->tenant() : Optional(), + useTenant ? trState->getTenantInfo(AllowInvalidTenantID::True) : TenantInfo(), key, member, trState->spanContext, - trState->debugID, + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies, isBackward, version); - if (trState->tenant().present() && useTenant) { + if (trState->tenant().present() && useTenant && trState->tenantId() == TenantInfo::INVALID_TENANT) { return map(f, [trState](const KeyRangeLocationInfo& locationInfo) { - trState->tenantId = locationInfo.tenantEntry.id; + trState->trySetTenantId(locationInfo.tenantEntry.id); return locationInfo; }); } else { @@ -2986,7 +3010,7 @@ Future getKeyLocation(Reference trState, ACTOR Future> getKeyRangeLocations_internal( Database cx, - Optional tenant, + TenantInfo tenant, KeyRange keys, int limit, Reverse reverse, @@ -3003,18 +3027,12 @@ ACTOR Future> getKeyRangeLocations_internal( ++cx->transactionKeyServerLocationRequests; choose { when(wait(cx->onProxiesChanged())) {} - when(GetKeyServerLocationsReply _rep = - wait(basicLoadBalance(cx->getCommitProxies(useProvisionalProxies), - &CommitProxyInterface::getKeyServersLocations, - GetKeyServerLocationsRequest(span.context, - tenant.castTo(), - keys.begin, - keys.end, - limit, - reverse, - version, - keys.arena()), - TaskPriority::DefaultPromiseEndpoint))) { + when(GetKeyServerLocationsReply _rep = wait(basicLoadBalance( + cx->getCommitProxies(useProvisionalProxies), + &CommitProxyInterface::getKeyServersLocations, + GetKeyServerLocationsRequest( + span.context, tenant, keys.begin, keys.end, limit, reverse, version, keys.arena()), + TaskPriority::DefaultPromiseEndpoint))) { ++cx->transactionKeyServerLocationRequestsCompleted; state GetKeyServerLocationsReply rep = _rep; if (debugID.present()) @@ -3031,7 +3049,7 @@ ACTOR Future> getKeyRangeLocations_internal( rep.tenantEntry, (toRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys), cx->setCachedLocation( - tenant, rep.tenantEntry, rep.results[shard].first, rep.results[shard].second)); + tenant.name, rep.tenantEntry, rep.results[shard].first, rep.results[shard].second)); wait(yield()); } updateTssMappings(cx, rep); @@ -3043,8 +3061,8 @@ ACTOR Future> getKeyRangeLocations_internal( } } catch (Error& e) { if (e.code() == error_code_tenant_not_found) { - ASSERT(tenant.present()); - cx->invalidateCachedTenant(tenant.get()); + ASSERT(tenant.name.present()); + cx->invalidateCachedTenant(tenant.name.get()); } throw; @@ -3059,7 +3077,7 @@ ACTOR Future> getKeyRangeLocations_internal( // [([a, b1), locationInfo), ([b1, c), locationInfo), ([c, d1), locationInfo)]. 
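The comment above describes how a requested key range is split against the shard map so that each returned piece maps to exactly one location. A minimal standalone sketch of that clipping step, using plain std::string keys and a placeholder Shard type rather than the FDB location types:

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

struct Shard {
    std::string begin, end; // shard owns [begin, end)
    std::string location;   // stand-in for the location info
};

struct PieceOfRange {
    std::string begin, end;
    std::string location;
};

// Clip the requested range against each shard; every non-empty intersection becomes one
// (sub-range, location) pair, in shard order.
std::vector<PieceOfRange> splitRangeByShards(const std::string& reqBegin,
                                             const std::string& reqEnd,
                                             const std::vector<Shard>& shards) {
    std::vector<PieceOfRange> out;
    for (const auto& s : shards) {
        std::string b = std::max(reqBegin, s.begin);
        std::string e = std::min(reqEnd, s.end);
        if (b < e)
            out.push_back({ b, e, s.location });
    }
    return out;
}

int main() {
    std::vector<Shard> shards = { { "a", "b1", "loc1" }, { "b1", "c", "loc2" }, { "c", "d1", "loc3" } };
    for (const auto& p : splitRangeByShards("a", "d", shards))
        std::cout << "[" << p.begin << ", " << p.end << ") -> " << p.location << "\n";
}

With the shard boundaries from the comment, this prints one piece per shard, each clipped to the requested range, which is the shape the real getKeyRangeLocations caches per location.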
template Future> getKeyRangeLocations(Database const& cx, - Optional tenant, + TenantInfo const& tenant, KeyRange const& keys, int limit, Reverse reverse, @@ -3072,7 +3090,7 @@ Future> getKeyRangeLocations(Database const& c ASSERT(!keys.empty()); std::vector locations; - if (!cx->getCachedLocations(tenant, keys, locations, limit, reverse)) { + if (!cx->getCachedLocations(tenant.name, keys, locations, limit, reverse)) { return getKeyRangeLocations_internal( cx, tenant, keys, limit, reverse, spanContext, debugID, useProvisionalProxies, version); } @@ -3110,20 +3128,20 @@ Future> getKeyRangeLocations(Referencecx, - useTenant ? trState->tenant() : Optional(), + useTenant ? trState->getTenantInfo(AllowInvalidTenantID::True) : TenantInfo(), keys, limit, reverse, member, trState->spanContext, - trState->debugID, + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies, version); - if (trState->tenant().present() && useTenant) { + if (trState->tenant().present() && useTenant && trState->tenantId() == TenantInfo::INVALID_TENANT) { return map(f, [trState](const std::vector& locationInfo) { ASSERT(!locationInfo.empty()); - trState->tenantId = locationInfo[0].tenantEntry.id; + trState->trySetTenantId(locationInfo[0].tenantEntry.id); return locationInfo; }); } else { @@ -3138,16 +3156,16 @@ ACTOR Future warmRange_impl(Reference trState, KeyRange state Version version = wait(fVersion); loop { - std::vector locations = - wait(getKeyRangeLocations_internal(trState->cx, - trState->tenant(), - keys, - CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT, - Reverse::False, - trState->spanContext, - trState->debugID, - trState->useProvisionalProxies, - version)); + std::vector locations = wait(getKeyRangeLocations_internal( + trState->cx, + trState->getTenantInfo(), + keys, + CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT, + Reverse::False, + trState->spanContext, + trState->readOptions.present() ? 
trState->readOptions.get().debugID : Optional(), + trState->useProvisionalProxies, + version)); totalRanges += CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT; totalRequests++; if (locations.size() == 0 || totalRanges >= trState->cx->locationCacheSize || @@ -3190,6 +3208,8 @@ SpanContext generateSpanID(bool transactionTracingSample, SpanContext parentCont deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUInt64(), TraceFlags::unsampled); } +FDB_DEFINE_BOOLEAN_PARAM(AllowInvalidTenantID); + TransactionState::TransactionState(Database cx, Optional tenant, TaskPriority taskID, @@ -3213,28 +3233,44 @@ Reference TransactionState::cloneAndReset(ReferencestartTime = startTime; newState->committedVersion = committedVersion; newState->conflictingKeys = conflictingKeys; + newState->authToken = authToken; newState->tenantSet = tenantSet; return newState; } -TenantInfo TransactionState::getTenantInfo() { +TenantInfo TransactionState::getTenantInfo(AllowInvalidTenantID allowInvalidId /* = false */) { Optional const& t = tenant(); if (options.rawAccess) { return TenantInfo(); + } else if (!cx->internal && cx->clientInfo->get().clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + throw management_cluster_invalid_access(); } else if (!cx->internal && cx->clientInfo->get().tenantMode == TenantMode::REQUIRED && !t.present()) { throw tenant_name_required(); } else if (!t.present()) { return TenantInfo(); } else if (cx->clientInfo->get().tenantMode == TenantMode::DISABLED && t.present()) { - throw tenants_disabled(); + // If we are running provisional proxies, we allow a tenant request to go through since we don't know the tenant + // mode. Such a transaction would not be allowed to commit without enabling provisional commits because either + // the commit proxies will be provisional or the read version will be too old. + if (!cx->clientInfo->get().grvProxies.empty() && !cx->clientInfo->get().grvProxies[0].provisional) { + throw tenants_disabled(); + } else { + ASSERT(!useProvisionalProxies); + } } - ASSERT(tenantId != TenantInfo::INVALID_TENANT); - return TenantInfo(t.get(), tenantId); + ASSERT(allowInvalidId || tenantId_ != TenantInfo::INVALID_TENANT); + return TenantInfo(t, authToken, tenantId_); } +// Returns the tenant used in this transaction. If the tenant is unset and raw access isn't specified, then the default +// tenant from DatabaseContext is applied to this transaction (note: the default tenant is typically unset, but in +// simulation could be something different). +// +// This function should not be called in the transaction constructor or in the setOption function to allow a user the +// opportunity to set raw access. Optional const& TransactionState::tenant() { if (tenantSet) { return tenant_; @@ -3247,10 +3283,20 @@ Optional const& TransactionState::tenant() { } } +// Returns true if the tenant has been set, but does not cause default tenant resolution. This is useful in setOption +// (where we do not want to call tenant()) if we want to enforce that an option not be set on a Tenant transaction (e.g. +// for raw access). 
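getTenantInfo() above takes the new AllowInvalidTenantID flag, a strongly typed boolean introduced with FDB_DEFINE_BOOLEAN_PARAM so that call sites name the intent, e.g. getTenantInfo(AllowInvalidTenantID::True) on the location-lookup path where the tenant id may not be resolved yet. A simplified stand-in for that pattern is sketched below; it uses an enum class rather than the real flow macro, and MiniTransactionState and its behavior are illustrative only.

#include <iostream>
#include <string>

// Strong boolean: call sites must write AllowInvalidTenantID::True/False, never a bare bool.
enum class AllowInvalidTenantID : bool { False = false, True = true };

struct MiniTransactionState {
    long tenantId = -1; // -1 plays the role of TenantInfo::INVALID_TENANT in this sketch

    std::string getTenantInfo(AllowInvalidTenantID allowInvalidId = AllowInvalidTenantID::False) const {
        // Callers that run before the tenant id has been resolved must opt in explicitly.
        if (allowInvalidId == AllowInvalidTenantID::False && tenantId < 0)
            return "error: tenant id not resolved yet";
        return "tenant#" + std::to_string(tenantId);
    }
};

int main() {
    MiniTransactionState state;
    std::cout << state.getTenantInfo(AllowInvalidTenantID::True) << "\n"; // allowed: id still unresolved
    std::cout << state.getTenantInfo() << "\n";                           // default path rejects an invalid id
}

The readability benefit is the same one the patch relies on: a reader of the call site can tell which boolean is being passed without checking the parameter list.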
bool TransactionState::hasTenant() const { return tenantSet && tenant_.present(); } +Future TransactionState::handleUnknownTenant() { + tenantId_ = TenantInfo::INVALID_TENANT; + ASSERT(tenant().present()); + cx->invalidateCachedTenant(tenant().get()); + return delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, taskID); +} + Future Transaction::warmRange(KeyRange keys) { return warmRange_impl(trState, keys, getReadVersion()); } @@ -3277,12 +3323,16 @@ ACTOR Future> getValue(Reference trState, state uint64_t startTime; state double startTimeD; state VersionVector ssLatestCommitVersions; + state Optional readOptions = trState->readOptions; + trState->cx->getLatestCommitVersions(locationInfo.locations, ver, trState, ssLatestCommitVersions); try { - if (trState->debugID.present()) { + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { getValueID = nondeterministicRandom()->randomUniqueID(); + readOptions.get().debugID = getValueID; - g_traceBatch.addAttach("GetValueAttachID", trState->debugID.get().first(), getValueID.get().first()); + g_traceBatch.addAttach( + "GetValueAttachID", trState->readOptions.get().debugID.get().first(), getValueID.get().first()); g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.Before"); //.detail("TaskID", g_network->getCurrentTask()); @@ -3315,7 +3365,7 @@ ACTOR Future> getValue(Reference trState, ver, trState->cx->sampleReadTags() ? trState->options.readTags : Optional(), - getValueID, + readOptions, ssLatestCommitVersions), TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, @@ -3369,9 +3419,8 @@ ACTOR Future> getValue(Reference trState, trState->cx->invalidateCache(locationInfo.tenantEntry.prefix, key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + ASSERT(useTenant); + wait(trState->handleUnknownTenant()); } else { if (trState->trLogInfo && recordLogInfo) trState->trLogInfo->addLog(FdbClientLogEvents::EventGetError(startTimeD, @@ -3391,12 +3440,16 @@ ACTOR Future getKey(Reference trState, UseTenant useTenant = UseTenant::True) { wait(success(version)); - state Optional getKeyID = Optional(); - state Span span("NAPI:getKey"_loc, trState->spanContext); - if (trState->debugID.present()) { - getKeyID = nondeterministicRandom()->randomUniqueID(); + state Optional getKeyID; + state Optional readOptions = trState->readOptions; - g_traceBatch.addAttach("GetKeyAttachID", trState->debugID.get().first(), getKeyID.get().first()); + state Span span("NAPI:getKey"_loc, trState->spanContext); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + getKeyID = nondeterministicRandom()->randomUniqueID(); + readOptions.get().debugID = getKeyID; + + g_traceBatch.addAttach( + "GetKeyAttachID", trState->readOptions.get().debugID.get().first(), getKeyID.get().first()); g_traceBatch.addEvent( "GetKeyDebug", getKeyID.get().first(), @@ -3439,7 +3492,7 @@ ACTOR Future getKey(Reference trState, k, version.get(), trState->cx->sampleReadTags() ? 
trState->options.readTags : Optional(), - getKeyID, + readOptions, ssLatestCommitVersions); req.arena.dependsOn(k.arena()); @@ -3480,9 +3533,8 @@ ACTOR Future getKey(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + ASSERT(useTenant); + wait(trState->handleUnknownTenant()); } else { TraceEvent(SevInfo, "GetKeyError").error(e).detail("AtKey", k.getKey()).detail("Offset", k.offset); throw e; @@ -3493,8 +3545,8 @@ ACTOR Future getKey(Reference trState, ACTOR Future waitForCommittedVersion(Database cx, Version version, SpanContext spanContext) { state Span span("NAPI:waitForCommittedVersion"_loc, spanContext); - try { - loop { + loop { + try { choose { when(wait(cx->onProxiesChanged())) {} when(GetReadVersionReply v = wait(basicLoadBalance( @@ -3520,10 +3572,16 @@ ACTOR Future waitForCommittedVersion(Database cx, Version version, Span wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, cx->taskID)); } } + } catch (Error& e) { + if (e.code() == error_code_batch_transaction_throttled || + e.code() == error_code_grv_proxy_memory_limit_exceeded) { + // GRV Proxy returns an error + wait(delayJittered(CLIENT_KNOBS->GRV_ERROR_RETRY_DELAY)); + } else { + TraceEvent(SevError, "WaitForCommittedVersionError").error(e); + throw; + } } - } catch (Error& e) { - TraceEvent(SevError, "WaitForCommittedVersionError").error(e); - throw; } } @@ -3566,7 +3624,7 @@ ACTOR Future watchValue(Database cx, Reference p loop { state KeyRangeLocationInfo locationInfo = wait(getKeyLocation(cx, - parameters->tenant.name, + parameters->tenant, parameters->key, &StorageServerInterface::watchValue, parameters->spanContext, @@ -3629,13 +3687,13 @@ ACTOR Future watchValue(Database cx, Reference p wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, parameters->taskID)); } else if (e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind) { // clang-format off - TEST(e.code() == error_code_watch_cancelled); // Too many watches on the storage server, poll for changes instead - TEST(e.code() == error_code_process_behind); // The storage servers are all behind + CODE_PROBE(e.code() == error_code_watch_cancelled, "Too many watches on the storage server, poll for changes instead"); + CODE_PROBE(e.code() == error_code_process_behind, "The storage servers are all behind"); // clang-format on wait(delay(CLIENT_KNOBS->WATCH_POLLING_TIME, parameters->taskID)); } else if (e.code() == error_code_timed_out) { // The storage server occasionally times out watches in case // it was cancelled - TEST(true); // A watch timed out + CODE_PROBE(true, "A watch timed out"); wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, parameters->taskID)); } else { state Error err = e; @@ -3667,7 +3725,8 @@ ACTOR Future watchStorageServerResp(int64_t tenantId, Key key, Database cx } // ABA happens else { - TEST(true); // ABA issue where the version returned from the server is less than the version in the map + CODE_PROBE(true, + "ABA issue where the version returned from the server is less than the version in the map"); // case 2: version_1 < version_2 and future_count == 1 if (metadata->watchPromise.getFutureReferenceCount() == 1) { @@ -3696,7 +3755,7 @@ ACTOR Future watchStorageServerResp(int64_t tenantId, Key key, Database cx } ACTOR Future 
sameVersionDiffValue(Database cx, Reference parameters) { - state ReadYourWritesTransaction tr(cx, parameters->tenant.name); + state ReadYourWritesTransaction tr(cx, parameters->tenant.name.castTo()); loop { try { if (!parameters->tenant.name.present()) { @@ -3715,7 +3774,7 @@ ACTOR Future sameVersionDiffValue(Database cx, Reference } // val_3 == val_2 (storage server value matches value passed into the function -> new watch) - if (valSS == parameters->value && tr.getTransactionState()->tenantId == parameters->tenant.tenantId) { + if (valSS == parameters->value && tr.getTransactionState()->tenantId() == parameters->tenant.tenantId) { metadata = makeReference(parameters); cx->setWatchMetadata(metadata); @@ -3758,7 +3817,8 @@ Future getWatchFuture(Database cx, Reference parameters) // case 3: val_1 != val_2 && version_2 > version_1 (received watch with different value and a higher version so // recreate in SS) else if (parameters->version > metadata->parameters->version) { - TEST(true); // Setting a watch that has a different value than the one in the map but a higher version (newer) + CODE_PROBE(true, + "Setting a watch that has a different value than the one in the map but a higher version (newer)"); cx->deleteWatchMetadata(parameters->tenant.tenantId, parameters->key); metadata->watchPromise.send(parameters->version); @@ -3773,10 +3833,10 @@ Future getWatchFuture(Database cx, Reference parameters) } // case 5: val_1 != val_2 && version_1 == version_2 (received watch with different value but same version) else if (metadata->parameters->version == parameters->version) { - TEST(true); // Setting a watch which has a different value than the one in the map but the same version + CODE_PROBE(true, "Setting a watch which has a different value than the one in the map but the same version"); return sameVersionDiffValue(cx, parameters); } - TEST(true); // Setting a watch which has a different value than the one in the map but a lower version (older) + CODE_PROBE(true, "Setting a watch which has a different value than the one in the map but a lower version (older)"); // case 4: val_1 != val_2 && version_2 < version_1 return Void(); @@ -3897,13 +3957,15 @@ Future getExactRange(Reference trState, // FIXME: buggify byte limits on internal functions that use them, instead of globally req.tags = trState->cx->sampleReadTags() ? 
trState->options.readTags : Optional(); - req.debugID = trState->debugID; + + req.options = trState->readOptions; try { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.Before"); - /*TraceEvent("TransactionDebugGetExactRangeInfo", trState->debugID.get()) + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getExactRange.Before"); + /*TraceEvent("TransactionDebugGetExactRangeInfo", trState->readOptions.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("ReqLimit", req.limit) @@ -3933,9 +3995,10 @@ Future getExactRange(Reference trState, ++trState->cx->transactionPhysicalReadsCompleted; throw; } - if (trState->debugID.present()) - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.After"); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getExactRange.After"); output.arena().dependsOn(rep.arena); output.append(output.arena(), rep.data.begin(), rep.data.size()); @@ -3970,7 +4033,7 @@ Future getExactRange(Reference trState, .detail("BlockBytes", rep.data.expectedSize()); ASSERT(false); } - TEST(true); // GetKeyValuesFamilyReply.more in getExactRange + CODE_PROBE(true, "GetKeyValuesFamilyReply.more in getExactRange"); // Make next request to the same shard with a beginning key just after the last key returned if (reverse) locations[shard].range = @@ -3981,7 +4044,7 @@ Future getExactRange(Reference trState, } if (!more || locations[shard].range.empty()) { - TEST(true); // getExactrange (!more || locations[shard].first.empty()) + CODE_PROBE(true, "getExactrange (!more || locations[shard].first.empty())"); if (shard == locations.size() - 1) { const KeyRangeRef& range = locations[shard].range; KeyRef begin = reverse ? keys.begin : range.end; @@ -3991,7 +4054,7 @@ Future getExactRange(Reference trState, output.more = false; return output; } - TEST(true); // Multiple requests of key locations + CODE_PROBE(true, "Multiple requests of key locations"); keys = KeyRangeRef(begin, end); break; @@ -4022,9 +4085,8 @@ Future getExactRange(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + ASSERT(useTenant); + wait(trState->handleUnknownTenant()); break; } else { TraceEvent(SevInfo, "GetExactRangeError") @@ -4264,7 +4326,7 @@ Future getRange(Reference trState, req.arena.dependsOn(mapper.arena()); setMatchIndex(req, matchIndex); req.tenantInfo = useTenant ? trState->getTenantInfo() : TenantInfo(); - req.isFetchKeys = (trState->taskID == TaskPriority::FetchKeys); + req.options = trState->readOptions; req.version = readVersion; trState->cx->getLatestCommitVersions( @@ -4302,13 +4364,13 @@ Future getRange(Reference trState, ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); req.tags = trState->cx->sampleReadTags() ? 
trState->options.readTags : Optional(); - req.debugID = trState->debugID; req.spanContext = span.context; try { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getRange.Before"); - /*TraceEvent("TransactionDebugGetRangeInfo", trState->debugID.get()) + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getRange.Before"); + /*TraceEvent("TransactionDebugGetRangeInfo", trState->readOptions.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("OriginalBegin", originalBegin.toString()) @@ -4347,11 +4409,11 @@ Future getRange(Reference trState, throw; } - if (trState->debugID.present()) { + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { g_traceBatch.addEvent("TransactionDebug", - trState->debugID.get().first(), + trState->readOptions.get().debugID.get().first(), "NativeAPI.getRange.After"); //.detail("SizeOf", rep.data.size()); - /*TraceEvent("TransactionDebugGetRangeDone", trState->debugID.get()) + /*TraceEvent("TransactionDebugGetRangeDone", trState->readOptions.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("RepIsMore", rep.more) @@ -4431,7 +4493,7 @@ Future getRange(Reference trState, if (!rep.more) { ASSERT(modifiedSelectors); - TEST(true); // !GetKeyValuesFamilyReply.more and modifiedSelectors in getRange + CODE_PROBE(true, "!GetKeyValuesFamilyReply.more and modifiedSelectors in getRange"); if (!rep.data.size()) { RangeResultFamily result = wait( @@ -4455,7 +4517,7 @@ Future getRange(Reference trState, else begin = firstGreaterOrEqual(shard.end); } else { - TEST(true); // GetKeyValuesFamilyReply.more in getRange + CODE_PROBE(true, "GetKeyValuesFamilyReply.more in getRange"); if (reverse) end = firstGreaterOrEqual(output[output.size() - 1].key); else @@ -4463,10 +4525,11 @@ Future getRange(Reference trState, } } catch (Error& e) { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getRange.Error"); - TraceEvent("TransactionDebugError", trState->debugID.get()).error(e); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getRange.Error"); + TraceEvent("TransactionDebugError", trState->readOptions.get().debugID.get()).error(e); } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || (e.code() == error_code_transaction_too_old && readVersion == latestVersion)) { @@ -4493,9 +4556,8 @@ Future getRange(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + ASSERT(useTenant); + wait(trState->handleUnknownTenant()); } else { if (trState->trLogInfo) trState->trLogInfo->addLog( @@ -4574,7 +4636,7 @@ static Future tssStreamComparison(Request request, } else { tssData.metrics->ssError(e.code()); } - TEST(e.code() != error_code_end_of_stream); // SS got error in TSS stream comparison + CODE_PROBE(e.code() 
!= error_code_end_of_stream, "SS got error in TSS stream comparison"); } state double sleepTime = std::max(startTime + FLOW_KNOBS->LOAD_BALANCE_TSS_TIMEOUT - now(), 0.0); @@ -4586,7 +4648,7 @@ static Future tssStreamComparison(Request request, } when(wait(delay(sleepTime))) { ++tssData.metrics->tssTimeouts; - TEST(true); // Got TSS timeout in stream comparison + CODE_PROBE(true, "Got TSS timeout in stream comparison"); } } } catch (Error& e) { @@ -4601,7 +4663,7 @@ static Future tssStreamComparison(Request request, } else { tssData.metrics->tssError(e.code()); } - TEST(e.code() != error_code_end_of_stream); // TSS got error in TSS stream comparison + CODE_PROBE(e.code() != error_code_end_of_stream, "TSS got error in TSS stream comparison"); } if (!ssEndOfStream || !tssEndOfStream) { @@ -4614,13 +4676,12 @@ static Future tssStreamComparison(Request request, // FIXME: this code is pretty much identical to LoadBalance.h // TODO could add team check logic in if we added synchronous way to turn this into a fixed getRange request // and send it to the whole team and compare? I think it's fine to skip that for streaming though - TEST(ssEndOfStream != tssEndOfStream); // SS or TSS stream finished early! // skip tss comparison if both are end of stream if ((!ssEndOfStream || !tssEndOfStream) && !TSS_doCompare(ssReply.get(), tssReply.get())) { - TEST(true); // TSS mismatch in stream comparison + CODE_PROBE(true, "TSS mismatch in stream comparison"); TraceEvent mismatchEvent( - (g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations) + (g_network->isSimulated() && g_simulator->tssMode == ISimulator::TSSMode::EnabledDropMutations) ? SevWarnAlways : SevError, TSS_mismatchTraceName(request)); @@ -4630,10 +4691,10 @@ static Future tssStreamComparison(Request request, if (tssData.metrics->shouldRecordDetailedMismatch()) { TSS_traceMismatch(mismatchEvent, request, ssReply.get(), tssReply.get()); - TEST(FLOW_KNOBS - ->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL); // Tracing Full TSS Mismatch in stream comparison - TEST(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL); // Tracing Partial TSS Mismatch in stream - // comparison and storing the rest in FDB + CODE_PROBE(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL, + "Tracing Full TSS Mismatch in stream comparison"); + CODE_PROBE(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL, + "Tracing Partial TSS Mismatch in stream comparison and storing the rest in FDB"); if (!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL) { mismatchEvent.disable(); @@ -4642,7 +4703,7 @@ static Future tssStreamComparison(Request request, // record a summarized trace event instead TraceEvent summaryEvent((g_network->isSimulated() && - g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations) + g_simulator->tssMode == ISimulator::TSSMode::EnabledDropMutations) ? SevWarnAlways : SevError, TSS_mismatchTraceName(request)); @@ -4673,7 +4734,7 @@ maybeDuplicateTSSStreamFragment(Request& req, QueueModel* model, RequestStream tssData = model->getTssData(ssStream->getEndpoint().token.first()); if (tssData.present()) { - TEST(true); // duplicating stream to TSS + CODE_PROBE(true, "duplicating stream to TSS"); resetReply(req); // FIXME: optimize to avoid creating new netNotifiedQueueWithAcknowledgements for each stream duplication RequestStream tssRequestStream(tssData.get().endpoint); @@ -4719,9 +4780,8 @@ ACTOR Future getRangeStreamFragment(Reference trState, req.spanContext = spanContext; req.limit = reverse ? 
-CLIENT_KNOBS->REPLY_BYTE_LIMIT : CLIENT_KNOBS->REPLY_BYTE_LIMIT; req.limitBytes = std::numeric_limits::max(); - // leaving the flag off for now to prevent data fetches stall under heavy load - // it is used to inform the storage that the rangeRead is for Fetch - // req.isFetchKeys = (trState->taskID == TaskPriority::FetchKeys); + req.options = trState->readOptions; + trState->cx->getLatestCommitVersions( locations[shard].locations, req.version, trState, req.ssLatestCommitVersions); @@ -4732,12 +4792,12 @@ ACTOR Future getRangeStreamFragment(Reference trState, // FIXME: buggify byte limits on internal functions that use them, instead of globally req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); - req.debugID = trState->debugID; try { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.RangeStream.Before"); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.RangeStream.Before"); } ++trState->cx->transactionPhysicalReads; state GetKeyValuesStreamReply rep; @@ -4831,9 +4891,10 @@ ACTOR Future getRangeStreamFragment(Reference trState, } rep = GetKeyValuesStreamReply(); } - if (trState->debugID.present()) - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.After"); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getExactRange.After"); RangeResult output(RangeResultRef(rep.data, rep.more), rep.arena); if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { @@ -4873,7 +4934,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, .detail("BlockBytes", rep.data.expectedSize()); ASSERT(false); } - TEST(true); // GetKeyValuesStreamReply.more in getRangeStream + CODE_PROBE(true, "GetKeyValuesStreamReply.more in getRangeStream"); // Make next request to the same shard with a beginning key just after the last key returned if (reverse) locations[shard].range = @@ -4941,7 +5002,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, throw; } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || - e.code() == error_code_connection_failed) { + e.code() == error_code_connection_failed || e.code() == error_code_request_maybe_delivered) { const KeyRangeRef& range = locations[shard].range; if (reverse) @@ -4954,9 +5015,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; } else if (e.code() == error_code_unknown_tenant) { - ASSERT(trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + wait(trState->handleUnknownTenant()); break; } else { results->sendError(e); @@ -5239,7 +5298,7 @@ ACTOR Future getTenantMetadata(Reference trState, Future populateAndGetTenant(Reference trState, Key const& key, Version version) { if (!trState->tenant().present() || key == metadataVersionKey) { return TenantInfo(); - } else if (trState->tenantId != TenantInfo::INVALID_TENANT) { + } else if (trState->tenantId() != TenantInfo::INVALID_TENANT) { return trState->getTenantInfo(); } else { return getTenantMetadata(trState, key, 
version); @@ -5271,7 +5330,7 @@ ACTOR Future watch(Reference watch, when(wait(watch->watchFuture)) { break; } when(wait(cx->connectionFileChanged())) { - TEST(true); // Recreated a watch after switch + CODE_PROBE(true, "Recreated a watch after switch"); cx->clearWatchMetadata(); watch->watchFuture = watchValueMap(cx->minAcceptableReadVersion, tenantInfo, @@ -5314,7 +5373,7 @@ Future Transaction::watch(Reference watch) { trState->options.readTags, trState->spanContext, trState->taskID, - trState->debugID, + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies); } @@ -5444,18 +5503,18 @@ Future Transaction::getRangeInternal(const KeySelector& begin KeySelector b = begin; if (b.orEqual) { - TEST(true); // Native begin orEqual==true + CODE_PROBE(true, "Native begin orEqual==true"); b.removeOrEqual(b.arena()); } KeySelector e = end; if (e.orEqual) { - TEST(true); // Native end orEqual==true + CODE_PROBE(true, "Native end orEqual==true"); e.removeOrEqual(e.arena()); } if (b.offset >= e.offset && b.getKey() >= e.getKey()) { - TEST(true); // Native range inverted + CODE_PROBE(true, "Native range inverted"); return RangeResultFamily(); } @@ -5504,7 +5563,7 @@ Future Transaction::getRange(const KeySelector& begin, // A method for streaming data from the storage server that is more efficient than getRange when reading large amounts // of data -Future Transaction::getRangeStream(const PromiseStream& results, +Future Transaction::getRangeStream(PromiseStream& results, const KeySelector& begin, const KeySelector& end, GetRangeLimits limits, @@ -5518,18 +5577,18 @@ Future Transaction::getRangeStream(const PromiseStream& resul KeySelector b = begin; if (b.orEqual) { - TEST(true); // Native stream begin orEqual==true + CODE_PROBE(true, "Native stream begin orEqual==true"); b.removeOrEqual(b.arena()); } KeySelector e = end; if (e.orEqual) { - TEST(true); // Native stream end orEqual==true + CODE_PROBE(true, "Native stream end orEqual==true"); e.removeOrEqual(e.arena()); } if (b.offset >= e.offset && b.getKey() >= e.getKey()) { - TEST(true); // Native stream range inverted + CODE_PROBE(true, "Native stream range inverted"); results.sendError(end_of_stream()); return Void(); } @@ -5543,7 +5602,7 @@ Future Transaction::getRangeStream(const PromiseStream& resul ::getRangeStream(trState, results, getReadVersion(), b, e, limits, conflictRange, snapshot, reverse), results); } -Future Transaction::getRangeStream(const PromiseStream& results, +Future Transaction::getRangeStream(PromiseStream& results, const KeySelector& begin, const KeySelector& end, int limit, @@ -5580,7 +5639,7 @@ void Transaction::addReadConflictRange(KeyRangeRef const& keys) { void Transaction::makeSelfConflicting() { BinaryWriter wr(Unversioned()); - wr.serializeBytes(LiteralStringRef("\xFF/SC/")); + wr.serializeBytes("\xFF/SC/"_sr); wr << deterministicRandom()->randomUniqueID(); auto r = singleKeyRange(wr.toValue(), tr.arena); tr.transaction.read_conflict_ranges.push_back(tr.arena, r); @@ -5632,7 +5691,7 @@ void Transaction::atomicOp(const KeyRef& key, if (addConflictRange && operationType != MutationRef::SetVersionstampedKey) t.write_conflict_ranges.push_back(req.arena, r); - TEST(true); // NativeAPI atomic operation + CODE_PROBE(true, "NativeAPI atomic operation"); } void Transaction::clear(const KeyRangeRef& range, AddConflictRange addConflictRange) { @@ -5718,7 +5777,7 @@ double Transaction::getBackoff(int errCode) { if (priorityItr != trState->cx->throttledTags.end()) { auto 
tagItr = priorityItr->second.find(tag); if (tagItr != priorityItr->second.end()) { - TEST(true); // Returning throttle backoff + CODE_PROBE(true, "Returning throttle backoff"); returnedBackoff = std::max( returnedBackoff, std::min(CLIENT_KNOBS->TAG_THROTTLE_RECHECK_INTERVAL, tagItr->second.throttleDuration())); @@ -5733,7 +5792,9 @@ double Transaction::getBackoff(int errCode) { returnedBackoff *= deterministicRandom()->random01(); // Set backoff for next time - if (errCode == error_code_proxy_memory_limit_exceeded) { + if (errCode == error_code_commit_proxy_memory_limit_exceeded || + errCode == error_code_grv_proxy_memory_limit_exceeded) { + backoff = std::min(backoff * CLIENT_KNOBS->BACKOFF_GROWTH_RATE, CLIENT_KNOBS->RESOURCE_CONSTRAINED_MAX_BACKOFF); } else { backoff = std::min(backoff * CLIENT_KNOBS->BACKOFF_GROWTH_RATE, trState->options.maxBackoff); @@ -5924,8 +5985,12 @@ ACTOR void checkWrites(Reference trState, } } -ACTOR static Future commitDummyTransaction(Reference trState, KeyRange range) { - state Transaction tr(trState->cx); +FDB_BOOLEAN_PARAM(TenantPrefixPrepended); + +ACTOR static Future commitDummyTransaction(Reference trState, + KeyRange range, + TenantPrefixPrepended tenantPrefixPrepended) { + state Transaction tr(trState->cx, trState->tenant()); state int retries = 0; state Span span("NAPI:dummyTransaction"_loc, trState->spanContext); tr.span.setParent(span.context); @@ -5934,7 +5999,14 @@ ACTOR static Future commitDummyTransaction(Reference trS TraceEvent("CommitDummyTransaction").detail("Key", range.begin).detail("Retries", retries); tr.trState->options = trState->options; tr.trState->taskID = trState->taskID; - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.trState->authToken = trState->authToken; + tr.trState->trySetTenantId(trState->tenantId()); + if (!trState->hasTenant()) { + tr.setOption(FDBTransactionOptions::RAW_ACCESS); + } else { + tr.trState->skipApplyTenantPrefix = tenantPrefixPrepended; + CODE_PROBE(true, "Commit of a dummy transaction in tenant keyspace"); + } tr.setOption(FDBTransactionOptions::CAUSAL_WRITE_RISKY); tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.addReadConflictRange(range); @@ -5942,6 +6014,10 @@ ACTOR static Future commitDummyTransaction(Reference trS wait(tr.commit()); return Void(); } catch (Error& e) { + // If the tenant is gone, then our original transaction won't be able to commit + if (e.code() == error_code_unknown_tenant) { + return Void(); + } TraceEvent("CommitDummyTransactionError") .errorUnsuppressed(e) .detail("Key", range.begin) @@ -5965,16 +6041,17 @@ void Transaction::setupWatches() { Future watchVersion = getCommittedVersion() > 0 ? getCommittedVersion() : getReadVersion(); for (int i = 0; i < watches.size(); ++i) - watches[i]->setWatch(watchValueMap(watchVersion, - trState->getTenantInfo(), - watches[i]->key, - watches[i]->value, - trState->cx, - trState->options.readTags, - trState->spanContext, - trState->taskID, - trState->debugID, - trState->useProvisionalProxies)); + watches[i]->setWatch( + watchValueMap(watchVersion, + trState->getTenantInfo(), + watches[i]->key, + watches[i]->value, + trState->cx, + trState->options.readTags, + trState->spanContext, + trState->taskID, + trState->readOptions.present() ? 
trState->readOptions.get().debugID : Optional(), + trState->useProvisionalProxies)); watches.clear(); } catch (Error&) { @@ -6063,30 +6140,46 @@ ACTOR Future> estimateCommitCosts(Referen // TODO: send the prefix as part of the commit request and ship it all the way // through to the storage servers void applyTenantPrefix(CommitTransactionRequest& req, Key tenantPrefix) { + VectorRef updatedMutations; + updatedMutations.reserve(req.arena, req.transaction.mutations.size()); for (auto& m : req.transaction.mutations) { + StringRef param1 = m.param1; + StringRef param2 = m.param2; if (m.param1 != metadataVersionKey) { - m.param1 = m.param1.withPrefix(tenantPrefix, req.arena); + param1 = m.param1.withPrefix(tenantPrefix, req.arena); if (m.type == MutationRef::ClearRange) { - m.param2 = m.param2.withPrefix(tenantPrefix, req.arena); + param2 = m.param2.withPrefix(tenantPrefix, req.arena); } else if (m.type == MutationRef::SetVersionstampedKey) { - uint8_t* key = mutateString(m.param1); - int* offset = reinterpret_cast(&key[m.param1.size() - 4]); + uint8_t* key = mutateString(param1); + int* offset = reinterpret_cast(&key[param1.size() - 4]); *offset += tenantPrefix.size(); } } + updatedMutations.push_back(req.arena, MutationRef(MutationRef::Type(m.type), param1, param2)); } + req.transaction.mutations = updatedMutations; - for (auto& rc : req.transaction.read_conflict_ranges) { + VectorRef updatedReadConflictRanges; + updatedReadConflictRanges.reserve(req.arena, req.transaction.read_conflict_ranges.size()); + for (auto const& rc : req.transaction.read_conflict_ranges) { if (rc.begin != metadataVersionKey) { - rc = rc.withPrefix(tenantPrefix, req.arena); + updatedReadConflictRanges.push_back(req.arena, rc.withPrefix(tenantPrefix, req.arena)); + } else { + updatedReadConflictRanges.push_back(req.arena, rc); } } + req.transaction.read_conflict_ranges = updatedReadConflictRanges; + VectorRef updatedWriteConflictRanges; + updatedWriteConflictRanges.reserve(req.arena, req.transaction.write_conflict_ranges.size()); for (auto& wc : req.transaction.write_conflict_ranges) { if (wc.begin != metadataVersionKey) { - wc = wc.withPrefix(tenantPrefix, req.arena); + updatedWriteConflictRanges.push_back(req.arena, wc.withPrefix(tenantPrefix, req.arena)); + } else { + updatedWriteConflictRanges.push_back(req.arena, wc); } } + req.transaction.write_conflict_ranges = updatedWriteConflictRanges; } ACTOR static Future tryCommit(Reference trState, @@ -6095,14 +6188,18 @@ ACTOR static Future tryCommit(Reference trState, state TraceInterval interval("TransactionCommit"); state double startTime = now(); state Span span("NAPI:tryCommit"_loc, trState->spanContext); - state Optional debugID = trState->debugID; + state Optional debugID = trState->readOptions.present() ? 
trState->readOptions.get().debugID : Optional(); + state TenantPrefixPrepended tenantPrefixPrepended = TenantPrefixPrepended::False; if (debugID.present()) { TraceEvent(interval.begin()).detail("Parent", debugID.get()); } try { if (CLIENT_BUGGIFY) { - throw deterministicRandom()->randomChoice(std::vector{ - not_committed(), transaction_too_old(), proxy_memory_limit_exceeded(), commit_unknown_result() }); + throw deterministicRandom()->randomChoice(std::vector{ not_committed(), + transaction_too_old(), + commit_proxy_memory_limit_exceeded(), + grv_proxy_memory_limit_exceeded(), + commit_unknown_result() }); } if (req.tagSet.present() && trState->options.priority < TransactionPriority::IMMEDIATE) { @@ -6113,7 +6210,9 @@ ACTOR static Future tryCommit(Reference trState, } state Key tenantPrefix; - if (trState->tenant().present()) { + // skipApplyTenantPrefix is set only in the context of a commitDummyTransaction() + // (see member declaration) + if (trState->tenant().present() && !trState->skipApplyTenantPrefix) { KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, ""_sr, &StorageServerInterface::getValue, @@ -6121,11 +6220,11 @@ ACTOR static Future tryCommit(Reference trState, UseTenant::True, req.transaction.read_snapshot)); applyTenantPrefix(req, locationInfo.tenantEntry.prefix); + tenantPrefixPrepended = TenantPrefixPrepended::True; tenantPrefix = locationInfo.tenantEntry.prefix; } - + CODE_PROBE(trState->skipApplyTenantPrefix, "Tenant prefix prepend skipped for dummy transaction"); req.tenantInfo = trState->getTenantInfo(); - startTime = now(); state Optional commitID = Optional(); @@ -6249,21 +6348,25 @@ ACTOR static Future tryCommit(Reference trState, KeyRangeRef selfConflictingRange = intersects(req.transaction.write_conflict_ranges, req.transaction.read_conflict_ranges).get(); - TEST(true); // Waiting for dummy transaction to report commit_unknown_result + CODE_PROBE(true, "Waiting for dummy transaction to report commit_unknown_result"); - wait(commitDummyTransaction(trState, singleKeyRange(selfConflictingRange.begin))); + wait( + commitDummyTransaction(trState, singleKeyRange(selfConflictingRange.begin), tenantPrefixPrepended)); } // The user needs to be informed that we aren't sure whether the commit happened. Standard retry loops // retry it anyway (relying on transaction idempotence) but a client might do something else. 
throw commit_unknown_result(); } else if (e.code() == error_code_unknown_tenant) { + // Rather than reset the tenant and retry just the commit, we need to throw this error to the user and let + // them retry the whole transaction ASSERT(trState->tenant().present()); trState->cx->invalidateCachedTenant(trState->tenant().get()); throw; } else { if (e.code() != error_code_transaction_too_old && e.code() != error_code_not_committed && - e.code() != error_code_database_locked && e.code() != error_code_proxy_memory_limit_exceeded && + e.code() != error_code_database_locked && e.code() != error_code_commit_proxy_memory_limit_exceeded && + e.code() != error_code_grv_proxy_memory_limit_exceeded && e.code() != error_code_batch_transaction_throttled && e.code() != error_code_tag_throttled && e.code() != error_code_process_behind && e.code() != error_code_future_version && e.code() != error_code_tenant_not_found) { @@ -6489,10 +6592,10 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional(value.get().printable(), TransactionLogInfo::DONT_LOG); trState->trLogInfo->maxFieldLength = trState->options.maxTransactionLoggingFieldLength; } - if (trState->debugID.present()) { + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { TraceEvent(SevInfo, "TransactionBeingTraced") .detail("DebugTransactionID", trState->trLogInfo->identifier) - .detail("ServerTraceID", trState->debugID.get()); + .detail("ServerTraceID", trState->readOptions.get().debugID.get()); } break; @@ -6524,10 +6627,11 @@ void Transaction::setOption(FDBTransactionOptions::Option option, OptionalrandomUniqueID()); - if (trState->trLogInfo && !trState->trLogInfo->identifier.empty()) { + if (trState->trLogInfo && !trState->trLogInfo->identifier.empty() && trState->readOptions.present() && + trState->readOptions.get().debugID.present()) { TraceEvent(SevInfo, "TransactionBeingTraced") .detail("DebugTransactionID", trState->trLogInfo->identifier) - .detail("ServerTraceID", trState->debugID.get()); + .detail("ServerTraceID", trState->readOptions.get().debugID.get()); } break; @@ -6562,6 +6666,11 @@ void Transaction::setOption(FDBTransactionOptions::Option option, OptionalhasTenant()) { + Error e = invalid_option(); + TraceEvent(SevWarn, "TenantTransactionUseProvisionalProxies").error(e).detail("Tenant", trState->tenant()); + throw e; + } trState->options.getReadVersionFlags |= GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES; trState->useProvisionalProxies = UseProvisionalProxies::True; break; @@ -6587,7 +6696,7 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional(value.get(), IncludeVersion())); break; @@ -6603,6 +6712,9 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optionalcx->sharedStatePtr) { + throw invalid_option(); + } if (trState->numErrors == 0) { trState->options.useGrvCache = true; } @@ -6626,6 +6738,13 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optionaloptions.rawAccess = true; break; + case FDBTransactionOptions::AUTHORIZATION_TOKEN: + if (value.present()) + trState->authToken = Standalone(value.get()); + else + trState->authToken.reset(); + break; + default: break; } @@ -6667,10 +6786,10 @@ ACTOR Future getConsistentReadVersion(SpanContext parentSpa for (auto& tag : tags) { auto itr = v.tagThrottleInfo.find(tag.first); if (itr == v.tagThrottleInfo.end()) { - TEST(true); // Removing client throttle + CODE_PROBE(true, "Removing client throttle"); priorityThrottledTags.erase(tag.first); } else { - TEST(true); 
// Setting client throttle + CODE_PROBE(true, "Setting client throttle"); auto result = priorityThrottledTags.try_emplace(tag.first, itr->second); if (!result.second) { result.first->second.update(itr->second); @@ -6695,10 +6814,15 @@ ACTOR Future getConsistentReadVersion(SpanContext parentSpa } } } catch (Error& e) { - if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled) + if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled && + e.code() != error_code_grv_proxy_memory_limit_exceeded) TraceEvent(SevError, "GetConsistentReadVersionError").error(e); if (e.code() == error_code_batch_transaction_throttled && !cx->apiVersionAtLeast(630)) { wait(delayJittered(5.0)); + } else if (e.code() == error_code_grv_proxy_memory_limit_exceeded) { + // FIXME(xwang): the better way is to let this error broadcast to transaction.onError(e), otherwise the + // txn->cx counter doesn't make sense + wait(delayJittered(CLIENT_KNOBS->GRV_ERROR_RETRY_DELAY)); } else { throw; } @@ -6716,26 +6840,22 @@ ACTOR Future readVersionBatcher(DatabaseContext* cx, state Future timeout; state Optional debugID; state bool send_batch; - state Reference batchSizeDist = Histogram::getHistogram(LiteralStringRef("GrvBatcher"), - LiteralStringRef("ClientGrvBatchSize"), - Histogram::Unit::countLinear, - 0, - CLIENT_KNOBS->MAX_BATCH_SIZE * 2); + state Reference batchSizeDist = Histogram::getHistogram( + "GrvBatcher"_sr, "ClientGrvBatchSize"_sr, Histogram::Unit::countLinear, 0, CLIENT_KNOBS->MAX_BATCH_SIZE * 2); state Reference batchIntervalDist = - Histogram::getHistogram(LiteralStringRef("GrvBatcher"), - LiteralStringRef("ClientGrvBatchInterval"), + Histogram::getHistogram("GrvBatcher"_sr, + "ClientGrvBatchInterval"_sr, Histogram::Unit::microseconds, 0, CLIENT_KNOBS->GRV_BATCH_TIMEOUT * 1000000 * 2); - state Reference grvReplyLatencyDist = Histogram::getHistogram( - LiteralStringRef("GrvBatcher"), LiteralStringRef("ClientGrvReplyLatency"), Histogram::Unit::microseconds); + state Reference grvReplyLatencyDist = + Histogram::getHistogram("GrvBatcher"_sr, "ClientGrvReplyLatency"_sr, Histogram::Unit::microseconds); state double lastRequestTime = now(); state TransactionTagMap tags; // dynamic batching state PromiseStream replyTimes; - state PromiseStream _errorStream; state double batchTime = 0; state Span span("NAPI:readVersionBatcher"_loc); loop { @@ -6853,7 +6973,7 @@ ACTOR Future extractReadVersion(Reference trState, if (itr->second.expired()) { priorityThrottledTags.erase(itr); } else if (itr->second.throttleDuration() > 0) { - TEST(true); // throttling transaction after getting read version + CODE_PROBE(true, "throttling transaction after getting read version"); ++trState->cx->transactionReadVersionsThrottled; throw tag_throttled(); } @@ -6959,12 +7079,12 @@ Future Transaction::getReadVersion(uint32_t flags) { } if (maxThrottleDelay > 0.0 && !canRecheck) { // TODO: allow delaying? 
- TEST(true); // Throttling tag before GRV request + CODE_PROBE(true, "Throttling tag before GRV request"); ++trState->cx->transactionReadVersionsThrottled; readVersion = tag_throttled(); return readVersion; } else { - TEST(maxThrottleDelay > 0.0); // Rechecking throttle + CODE_PROBE(maxThrottleDelay > 0.0, "Rechecking throttle"); } for (auto& tag : trState->options.tags) { @@ -6983,7 +7103,9 @@ Future Transaction::getReadVersion(uint32_t flags) { Location location = "NAPI:getReadVersion"_loc; SpanContext spanContext = generateSpanID(trState->cx->transactionTracingSample, trState->spanContext); - auto const req = DatabaseContext::VersionRequest(spanContext, trState->options.tags, trState->debugID); + Optional versionDebugID = + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(); + auto const req = DatabaseContext::VersionRequest(spanContext, trState->options.tags, versionDebugID); batcher.stream.send(req); trState->startTime = now(); readVersion = extractReadVersion(trState, location, spanContext, req.reply.getFuture(), metadataVersion); @@ -7141,14 +7263,16 @@ Future Transaction::onError(Error const& e) { return client_invalid_operation(); } if (e.code() == error_code_not_committed || e.code() == error_code_commit_unknown_result || - e.code() == error_code_database_locked || e.code() == error_code_proxy_memory_limit_exceeded || - e.code() == error_code_process_behind || e.code() == error_code_batch_transaction_throttled || - e.code() == error_code_tag_throttled) { + e.code() == error_code_database_locked || e.code() == error_code_commit_proxy_memory_limit_exceeded || + e.code() == error_code_grv_proxy_memory_limit_exceeded || e.code() == error_code_process_behind || + e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled || + e.code() == error_code_blob_granule_request_failed) { if (e.code() == error_code_not_committed) ++trState->cx->transactionsNotCommitted; else if (e.code() == error_code_commit_unknown_result) ++trState->cx->transactionsMaybeCommitted; - else if (e.code() == error_code_proxy_memory_limit_exceeded) + else if (e.code() == error_code_commit_proxy_memory_limit_exceeded || + e.code() == error_code_grv_proxy_memory_limit_exceeded) ++trState->cx->transactionsResourceConstrained; else if (e.code() == error_code_process_behind) ++trState->cx->transactionsProcessBehind; @@ -7205,7 +7329,7 @@ ACTOR Future doGetStorageMetrics(Database cx, KeyRange keys, Ref ACTOR Future getStorageMetricsLargeKeyRange(Database cx, KeyRange keys) { state Span span("NAPI:GetStorageMetricsLargeKeyRange"_loc); std::vector locations = wait(getKeyRangeLocations(cx, - Optional(), + TenantInfo(), keys, std::numeric_limits::max(), Reverse::False, @@ -7307,7 +7431,7 @@ ACTOR Future>> getReadHotRanges(Da // to find the read-hot sub ranges within a read-hot shard. 
std::vector locations = wait(getKeyRangeLocations(cx, - Optional(), + TenantInfo(), keys, shardLimit, Reverse::False, @@ -7343,10 +7467,10 @@ ACTOR Future>> getReadHotRanges(Da wait(waitForAll(fReplies)); if (nLocs == 1) { - TEST(true); // Single-shard read hot range request + CODE_PROBE(true, "Single-shard read hot range request"); return fReplies[0].get().readHotRanges; } else { - TEST(true); // Multi-shard read hot range request + CODE_PROBE(true, "Multi-shard read hot range request"); Standalone> results; for (int i = 0; i < nLocs; i++) { results.append(results.arena(), @@ -7378,7 +7502,7 @@ ACTOR Future, int>> waitStorageMetrics(Databa state Span span("NAPI:WaitStorageMetrics"_loc, generateSpanID(cx->transactionTracingSample)); loop { std::vector locations = wait(getKeyRangeLocations(cx, - Optional(), + TenantInfo(), keys, shardLimit, Reverse::False, @@ -7536,9 +7660,7 @@ ACTOR Future>> getRangeSplitPoints(Referencecx->invalidateCache(locations[0].tenantEntry.prefix, keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + wait(trState->handleUnknownTenant()); } else { TraceEvent(SevError, "GetRangeSplitPoints").error(e); throw; @@ -7552,18 +7674,19 @@ ACTOR Future blobGranuleGetTenantEntry(Transaction* self, Key ra Optional cachedLocationInfo = self->trState->cx->getCachedLocation(self->getTenant().get(), rangeStartKey, Reverse::False); if (!cachedLocationInfo.present()) { - KeyRangeLocationInfo l = wait(getKeyLocation_internal(self->trState->cx, - self->getTenant().get(), - rangeStartKey, - self->trState->spanContext, - self->trState->debugID, - self->trState->useProvisionalProxies, - Reverse::False, - latestVersion)); - self->trState->tenantId = l.tenantEntry.id; + KeyRangeLocationInfo l = wait(getKeyLocation_internal( + self->trState->cx, + self->trState->getTenantInfo(AllowInvalidTenantID::True), + rangeStartKey, + self->trState->spanContext, + self->trState->readOptions.present() ? 
self->trState->readOptions.get().debugID : Optional(), + self->trState->useProvisionalProxies, + Reverse::False, + latestVersion)); + self->trState->trySetTenantId(l.tenantEntry.id); return l.tenantEntry; } else { - self->trState->tenantId = cachedLocationInfo.get().tenantEntry.id; + self->trState->trySetTenantId(cachedLocationInfo.get().tenantEntry.id); return cachedLocationInfo.get().tenantEntry; } } @@ -7577,7 +7700,9 @@ Future>> Transaction::getRangeSplitPoints(KeyRange // the blob granule requests are a bit funky because they piggyback off the existing transaction to read from the system // keyspace -ACTOR Future>> getBlobGranuleRangesActor(Transaction* self, KeyRange keyRange) { +ACTOR Future>> getBlobGranuleRangesActor(Transaction* self, + KeyRange keyRange, + int rangeLimit) { // FIXME: use streaming range read state KeyRange currentRange = keyRange; state Standalone> results; @@ -7598,23 +7723,24 @@ ACTOR Future>> getBlobGranuleRangesActor(Trans if (tenantPrefix.present()) { state Standalone mappingPrefix = tenantPrefix.get().withPrefix(blobGranuleMappingKeys.begin); - // basically krmGetRange, but enable it to not use tenant without RAW_ACCESS by doing manual getRange with - // UseTenant::False - GetRangeLimits limits(1000); + // basically krmGetRangeUnaligned, but enable it to not use tenant without RAW_ACCESS by doing manual + // getRange with UseTenant::False + GetRangeLimits limits(2 * rangeLimit + 2); limits.minRows = 2; + RangeResult rawMapping = wait(getRange(self->trState, self->getReadVersion(), lastLessOrEqual(keyRange.begin.withPrefix(mappingPrefix)), - firstGreaterThan(keyRange.end.withPrefix(mappingPrefix)), + KeySelectorRef(keyRange.end.withPrefix(mappingPrefix), false, +2), limits, Reverse::False, UseTenant::False)); // strip off mapping prefix - blobGranuleMapping = krmDecodeRanges(mappingPrefix, currentRange, rawMapping); + blobGranuleMapping = krmDecodeRanges(mappingPrefix, currentRange, rawMapping, false); } else { wait(store( blobGranuleMapping, - krmGetRanges( + krmGetRangesUnaligned( self, blobGranuleMappingKeys.begin, currentRange, 1000, GetRangeLimits::BYTE_LIMIT_UNLIMITED))); } @@ -7622,6 +7748,9 @@ ACTOR Future>> getBlobGranuleRangesActor(Trans if (blobGranuleMapping[i].value.size()) { results.push_back(results.arena(), KeyRangeRef(blobGranuleMapping[i].key, blobGranuleMapping[i + 1].key)); + if (results.size() == rangeLimit) { + return results; + } } } results.arena().dependsOn(blobGranuleMapping.arena()); @@ -7633,8 +7762,8 @@ ACTOR Future>> getBlobGranuleRangesActor(Trans } } -Future>> Transaction::getBlobGranuleRanges(const KeyRange& range) { - return ::getBlobGranuleRangesActor(this, range); +Future>> Transaction::getBlobGranuleRanges(const KeyRange& range, int rangeLimit) { + return ::getBlobGranuleRangesActor(this, range, rangeLimit); } // hack (for now) to get blob worker interface into load balance @@ -7648,7 +7777,11 @@ ACTOR Future>> readBlobGranulesActor( KeyRange range, Version begin, Optional read, - Version* readVersionOut) { // read not present is "use transaction version" + Version* readVersionOut, + int chunkLimit, + bool summarize) { // read not present is "use transaction version" + + ASSERT(chunkLimit > 0); state RangeResult blobGranuleMapping; state Key granuleStartKey; @@ -7689,7 +7822,7 @@ ACTOR Future>> readBlobGranulesActor( // basically krmGetRange, but enable it to not use tenant without RAW_ACCESS by doing manual getRange with // UseTenant::False - GetRangeLimits limits(1000); + GetRangeLimits 
limits(CLIENT_KNOBS->BG_TOO_MANY_GRANULES); limits.minRows = 2; RangeResult rawMapping = wait(getRange(self->trState, self->getReadVersion(), @@ -7704,19 +7837,24 @@ ACTOR Future>> readBlobGranulesActor( blobGranuleMapping = krmDecodeRanges(prefix, range, rawMapping); } else { self->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - wait(store( - blobGranuleMapping, - krmGetRanges(self, blobGranuleMappingKeys.begin, keyRange, 1000, GetRangeLimits::BYTE_LIMIT_UNLIMITED))); + wait(store(blobGranuleMapping, + krmGetRanges(self, + blobGranuleMappingKeys.begin, + keyRange, + CLIENT_KNOBS->BG_TOO_MANY_GRANULES, + GetRangeLimits::BYTE_LIMIT_UNLIMITED))); } if (blobGranuleMapping.more) { if (BG_REQUEST_DEBUG) { fmt::print( "BG Mapping for [{0} - %{1}) too large!\n", keyRange.begin.printable(), keyRange.end.printable()); } - TraceEvent(SevWarn, "BGMappingTooLarge").detail("Range", range).detail("Max", 1000); + TraceEvent(SevWarn, "BGMappingTooLarge") + .detail("Range", range) + .detail("Max", CLIENT_KNOBS->BG_TOO_MANY_GRANULES); throw unsupported_operation(); } - ASSERT(!blobGranuleMapping.more && blobGranuleMapping.size() < CLIENT_KNOBS->TOO_MANY); + ASSERT(!blobGranuleMapping.more && blobGranuleMapping.size() <= CLIENT_KNOBS->BG_TOO_MANY_GRANULES); if (blobGranuleMapping.size() < 2) { throw blob_granule_transaction_too_old(); @@ -7735,7 +7873,6 @@ ACTOR Future>> readBlobGranulesActor( fmt::print("Key range [{0} - {1}) missing worker assignment!\n", granuleStartKey.printable(), granuleEndKey.printable()); - // TODO probably new exception type instead } throw blob_granule_transaction_too_old(); } @@ -7762,11 +7899,9 @@ ACTOR Future>> readBlobGranulesActor( getValue(self->trState, blobWorkerListKeyFor(workerId), self->getReadVersion(), UseTenant::False))); // from the time the mapping was read from the db, the associated blob worker // could have died and so its interface wouldn't be present as part of the blobWorkerList - // we persist in the db. So throw wrong_shard_server to get the new mapping + // we persist in the db. So throw blob_granule_request_failed to get the new mapping if (!workerInterface.present()) { - // need to re-read mapping, throw transaction_too_old so client retries. TODO better error? - // throw wrong_shard_server(); - throw transaction_too_old(); + throw blob_granule_request_failed(); } // FIXME: maybe just want to insert here if there are racing queries for the same worker or something? self->trState->cx->blobWorker_interf[workerId] = decodeBlobWorkerListValue(workerInterface.get()); @@ -7799,6 +7934,7 @@ ACTOR Future>> readBlobGranulesActor( req.readVersion = rv; req.tenantInfo = self->getTenant().present() ? 
self->trState->getTenantInfo() : TenantInfo(); req.canCollapseBegin = true; // TODO make this a parameter once we support it + req.summarize = summarize; std::vector>> v; v.push_back( @@ -7855,7 +7991,7 @@ ACTOR Future>> readBlobGranulesActor( if (!results.empty() && results.back().keyRange.end != chunk.keyRange.begin) { ASSERT(results.back().keyRange.end > chunk.keyRange.begin); ASSERT(results.back().keyRange.end <= chunk.keyRange.end); - TEST(true); // Merge while reading granule range + CODE_PROBE(true, "Merge while reading granule range"); while (!results.empty() && results.back().keyRange.begin >= chunk.keyRange.begin) { // TODO: we can't easily un-depend the arenas for these guys, but that's ok as this // should be rare @@ -7869,6 +8005,12 @@ ACTOR Future>> readBlobGranulesActor( chunkEndKey = chunkEndKey.removePrefix(tenantPrefix.get()); } keyRange = KeyRangeRef(std::min(chunkEndKey, keyRange.end), keyRange.end); + if (summarize && results.size() == chunkLimit) { + break; + } + } + if (summarize && results.size() == chunkLimit) { + break; } } // if we detect that this blob worker fails, cancel the request, as otherwise load balance will @@ -7894,10 +8036,8 @@ ACTOR Future>> readBlobGranulesActor( e.name()); } // worker is up but didn't actually have granule, or connection failed - if (e.code() == error_code_wrong_shard_server || e.code() == error_code_connection_failed || - e.code() == error_code_unknown_tenant) { - // need to re-read mapping, throw transaction_too_old so client retries. TODO better error? - throw transaction_too_old(); + if (e.code() == error_code_wrong_shard_server || e.code() == error_code_connection_failed) { + throw blob_granule_request_failed(); } throw e; } @@ -7917,7 +8057,36 @@ Future>> Transaction::readBlobGranules Version begin, Optional readVersion, Version* readVersionOut) { - return readBlobGranulesActor(this, range, begin, readVersion, readVersionOut); + return readBlobGranulesActor( + this, range, begin, readVersion, readVersionOut, std::numeric_limits::max(), false); +} + +ACTOR Future>> summarizeBlobGranulesActor(Transaction* self, + KeyRange range, + Optional summaryVersion, + int rangeLimit) { + state Version readVersionOut; + Standalone> chunks = + wait(readBlobGranulesActor(self, range, 0, summaryVersion, &readVersionOut, rangeLimit, true)); + ASSERT(chunks.size() <= rangeLimit); + ASSERT(!summaryVersion.present() || readVersionOut == summaryVersion.get()); + Standalone> summaries; + summaries.reserve(summaries.arena(), chunks.size()); + for (auto& it : chunks) { + summaries.push_back(summaries.arena(), summarizeGranuleChunk(summaries.arena(), it)); + } + + return summaries; +} + +Future>> +Transaction::summarizeBlobGranules(const KeyRange& range, Optional summaryVersion, int rangeLimit) { + return summarizeBlobGranulesActor(this, range, summaryVersion, rangeLimit); +} + +void Transaction::addGranuleMaterializeStats(const GranuleMaterializeStats& stats) { + trState->cx->bgReadInputBytes += stats.inputBytes; + trState->cx->bgReadOutputBytes += stats.outputBytes; } ACTOR Future setPerpetualStorageWiggle(Database cx, bool enable, LockAware lockAware) { @@ -7941,6 +8110,112 @@ ACTOR Future setPerpetualStorageWiggle(Database cx, bool enable, LockAw return version; } +ACTOR Future checkBlobSubrange(Database db, KeyRange keyRange, Optional version) { + state Transaction tr(db); + loop { + try { + state Version summaryVersion; + if (version.present()) { + summaryVersion = version.get(); + } else { + wait(store(summaryVersion, tr.getReadVersion())); + } 
+ // same properties as a read for validating granule is readable, just much less memory and network bandwidth + // used + wait(success(tr.summarizeBlobGranules(keyRange, summaryVersion, std::numeric_limits::max()))); + return summaryVersion; + } catch (Error& e) { + wait(tr.onError(e)); + } + } +} + +ACTOR Future verifyBlobRangeActor(Reference cx, KeyRange range, Optional version) { + state Database db(cx); + state Transaction tr(db); + state Standalone> allRanges; + state KeyRange curRegion = KeyRangeRef(range.begin, range.begin); + state Version readVersionOut = invalidVersion; + state int batchSize = BUGGIFY ? deterministicRandom()->randomInt(2, 10) : CLIENT_KNOBS->BG_TOO_MANY_GRANULES / 2; + state int loadSize = (BUGGIFY ? deterministicRandom()->randomInt(1, 20) : 20) * batchSize; + + if (version.present()) { + if (version.get() == latestVersion) { + loop { + try { + Version _version = wait(tr.getReadVersion()); + version = _version; + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + if (version.get() <= 0) { + TraceEvent("VerifyBlobInvalidVersion").detail("Range", range).detail("Version", version); + throw unsupported_operation(); + } + } + + loop { + if (curRegion.begin >= range.end) { + return readVersionOut; + } + loop { + try { + wait(store(allRanges, tr.getBlobGranuleRanges(KeyRangeRef(curRegion.begin, range.end), loadSize))); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + if (allRanges.empty()) { + if (curRegion.begin < range.end) { + return invalidVersion; + } + return readVersionOut; + } + + state std::vector> checkParts; + // Chunk up to smaller ranges than this limit. Must be smaller than BG_TOO_MANY_GRANULES to not hit the limit + int batchCount = 0; + for (auto& it : allRanges) { + if (it.begin > curRegion.end) { + return invalidVersion; + } + + curRegion = KeyRangeRef(curRegion.begin, it.end); + batchCount++; + + if (batchCount == batchSize) { + checkParts.push_back(checkBlobSubrange(db, curRegion, version)); + batchCount = 0; + curRegion = KeyRangeRef(curRegion.end, curRegion.end); + } + } + if (!curRegion.empty()) { + checkParts.push_back(checkBlobSubrange(db, curRegion, version)); + } + + try { + wait(waitForAll(checkParts)); + } catch (Error& e) { + if (e.code() == error_code_blob_granule_transaction_too_old) { + return invalidVersion; + } + throw e; + } + ASSERT(!checkParts.empty()); + readVersionOut = checkParts.back().get(); + curRegion = KeyRangeRef(curRegion.end, curRegion.end); + } +} + +Future DatabaseContext::verifyBlobRange(const KeyRange& range, Optional version) { + return verifyBlobRangeActor(Reference::addRef(this), range, version); +} + ACTOR Future>> readStorageWiggleValues(Database cx, bool primary, bool use_system_priority) { @@ -7948,7 +8223,8 @@ ACTOR Future>> readStorageWiggleV state KeyBackedObjectMap metadataMap(readKey, IncludeVersion()); state Reference tr(new ReadYourWritesTransaction(cx)); - state std::vector> res; + state KeyBackedRangeResult> res; + // read the wiggling pairs loop { try { @@ -7964,7 +8240,7 @@ ACTOR Future>> readStorageWiggleV wait(tr->onError(e)); } } - return res; + return res.results; } ACTOR Future splitStorageMetricsStream(PromiseStream resultStream, @@ -7982,7 +8258,7 @@ ACTOR Future splitStorageMetricsStream(PromiseStream resultStream, loop { state std::vector locations = wait(getKeyRangeLocations(cx, - Optional(), + TenantInfo(), KeyRangeRef(beginKey, keys.end), CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, Reverse::False, @@ -8082,7 +8358,7 @@ ACTOR Future>> 
splitStorageMetrics(Database cx, loop { state std::vector locations = wait(getKeyRangeLocations(cx, - Optional(), + TenantInfo(), keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, Reverse::False, @@ -8167,7 +8443,7 @@ Reference Transaction::createTrLogInfoProbabilistically(cons cx->globalConfig->get(fdbClientInfoTxnSampleRate, CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY); if (((networkOptions.logClientInfo.present() && networkOptions.logClientInfo.get()) || BUGGIFY) && deterministicRandom()->random01() < clientSamplingProbability && - (!g_network->isSimulated() || !g_simulator.speedUpSimulation)) { + (!g_network->isSimulated() || !g_simulator->speedUpSimulation)) { return makeReference(TransactionLogInfo::DATABASE); } } @@ -8327,7 +8603,7 @@ ACTOR Future> getCheckpointMetaData(Database cx, try { state std::vector locations = wait(getKeyRangeLocations(cx, - Optional(), + TenantInfo(), keys, CLIENT_KNOBS->TOO_MANY, Reverse::False, @@ -8488,16 +8764,13 @@ ACTOR static Future rebootWorkerActor(DatabaseContext* cx, ValueRef add for (const auto& it : kvs) { ClientWorkerInterface workerInterf = BinaryReader::fromStringRef(it.value, IncludeVersion()); - Key primaryAddress = - it.key.endsWith(LiteralStringRef(":tls")) ? it.key.removeSuffix(LiteralStringRef(":tls")) : it.key; + Key primaryAddress = it.key.endsWith(":tls"_sr) ? it.key.removeSuffix(":tls"_sr) : it.key; workerInterfaces[primaryAddress] = workerInterf; // Also add mapping from a worker's second address(if present) to its interface if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { Key secondAddress = StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); - secondAddress = secondAddress.endsWith(LiteralStringRef(":tls")) - ? secondAddress.removeSuffix(LiteralStringRef(":tls")) - : secondAddress; + secondAddress = secondAddress.endsWith(":tls"_sr) ? 
secondAddress.removeSuffix(":tls"_sr) : secondAddress; workerInterfaces[secondAddress] = workerInterf; } } @@ -8569,38 +8842,28 @@ Future DatabaseContext::initSharedState() { } void DatabaseContext::setSharedState(DatabaseSharedState* p) { - ASSERT(p->protocolVersion == currentProtocolVersion); + ASSERT(p->protocolVersion == currentProtocolVersion()); sharedStatePtr = p; sharedStatePtr->refCount++; } ACTOR Future storageFeedVersionUpdater(StorageServerInterface interf, ChangeFeedStorageData* self) { - state Promise destroyed = self->destroyed; loop { - if (destroyed.isSet()) { - return Void(); - } if (self->version.get() < self->desired.get()) { wait(delay(CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) || self->version.whenAtLeast(self->desired.get())); - if (destroyed.isSet()) { - return Void(); - } if (self->version.get() < self->desired.get()) { try { ChangeFeedVersionUpdateReply rep = wait(brokenPromiseToNever( interf.changeFeedVersionUpdate.getReply(ChangeFeedVersionUpdateRequest(self->desired.get())))); - if (rep.version > self->version.get()) { self->version.set(rep.version); } } catch (Error& e) { - if (e.code() == error_code_server_overloaded) { - if (FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY > CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) { - wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY - - CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME)); - } - } else { - throw e; + if (e.code() != error_code_server_overloaded) { + throw; + } + if (FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY > CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) { + wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY - CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME)); } } } @@ -8619,10 +8882,53 @@ Reference DatabaseContext::getStorageData(StorageServerIn newStorageUpdater->id = interf.id(); newStorageUpdater->interfToken = token; newStorageUpdater->updater = storageFeedVersionUpdater(interf, newStorageUpdater.getPtr()); - changeFeedUpdaters[token] = newStorageUpdater; + newStorageUpdater->context = this; + newStorageUpdater->created = now(); + changeFeedUpdaters[token] = newStorageUpdater.getPtr(); return newStorageUpdater; } - return it->second; + return Reference::addRef(it->second); +} + +Version DatabaseContext::getMinimumChangeFeedVersion() { + Version minVersion = std::numeric_limits::max(); + for (auto& it : changeFeedUpdaters) { + if (now() - it.second->created > CLIENT_KNOBS->CHANGE_FEED_START_INTERVAL) { + minVersion = std::min(minVersion, it.second->version.get()); + } + } + for (auto& it : notAtLatestChangeFeeds) { + if (now() - it.second->created > CLIENT_KNOBS->CHANGE_FEED_START_INTERVAL) { + minVersion = std::min(minVersion, it.second->getVersion()); + } + } + return minVersion; +} + +void DatabaseContext::setDesiredChangeFeedVersion(Version v) { + for (auto& it : changeFeedUpdaters) { + if (it.second->version.get() < v && it.second->desired.get() < v) { + it.second->desired.set(v); + } + } +} + +ChangeFeedStorageData::~ChangeFeedStorageData() { + if (context) { + context->changeFeedUpdaters.erase(interfToken); + } +} + +ChangeFeedData::ChangeFeedData(DatabaseContext* context) + : dbgid(deterministicRandom()->randomUniqueID()), context(context), notAtLatest(1), created(now()) { + if (context) { + context->notAtLatestChangeFeeds[dbgid] = this; + } +} +ChangeFeedData::~ChangeFeedData() { + if (context) { + context->notAtLatestChangeFeeds.erase(dbgid); + } } Version ChangeFeedData::getVersion() { @@ -8816,6 +9122,9 @@ ACTOR Future partialChangeFeedStream(StorageServerInterface interf, if (refresh.canBeSet() && !atLatestVersion && 
rep.atLatestVersion) { atLatestVersion = true; feedData->notAtLatest.set(feedData->notAtLatest.get() - 1); + if (feedData->notAtLatest.get() == 0 && feedData->context) { + feedData->context->notAtLatestChangeFeeds.erase(feedData->dbgid); + } } if (refresh.canBeSet() && rep.minStreamVersion > storageData->version.get()) { storageData->version.set(rep.minStreamVersion); @@ -8980,8 +9289,8 @@ ACTOR Future mergeChangeFeedStream(Reference db, state std::vector> onErrors(interfs.size()); state std::vector streams(interfs.size()); - TEST(interfs.size() > 10); // Large change feed merge cursor - TEST(interfs.size() > 100); // Very large change feed merge cursor + CODE_PROBE(interfs.size() > 10, "Large change feed merge cursor"); + CODE_PROBE(interfs.size() > 100, "Very large change feed merge cursor"); state UID mergeCursorUID = UID(); state std::vector debugUIDs; @@ -9006,11 +9315,6 @@ ACTOR Future mergeChangeFeedStream(Reference db, results->streams.push_back(it.first.changeFeedStream.getReplyStream(req)); } - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } results->maxSeenVersion = invalidVersion; results->storageData.clear(); Promise refresh = results->refresh; @@ -9019,6 +9323,10 @@ ACTOR Future mergeChangeFeedStream(Reference db, results->storageData.push_back(db->getStorageData(interfs[i].first)); } results->notAtLatest.set(interfs.size()); + if (results->context) { + results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr(); + results->created = now(); + } refresh.send(Void()); for (int i = 0; i < interfs.size(); i++) { @@ -9061,6 +9369,8 @@ ACTOR Future getChangeFeedRange(Reference db, Databas loop { try { tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); Version readVer = wait(tr.getReadVersion()); if (readVer < begin) { wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); @@ -9107,10 +9417,21 @@ ACTOR Future singleChangeFeedStreamInternal(KeyRange range, // update lastReturned once the previous mutation has been consumed if (*begin - 1 > results->lastReturnedVersion.get()) { results->lastReturnedVersion.set(*begin - 1); + if (!refresh.canBeSet()) { + try { + // refresh is set if and only if this actor is cancelled + wait(Future(Void())); + // Catch any unexpected behavior if the above contract is broken + ASSERT(false); + } catch (Error& e) { + ASSERT(e.code() == error_code_actor_cancelled); + throw; + } + } } loop { - + ASSERT(refresh.canBeSet()); state ChangeFeedStreamReply feedReply = waitNext(results->streams[0].getFuture()); *begin = feedReply.mutations.back().version + 1; @@ -9160,6 +9481,9 @@ ACTOR Future singleChangeFeedStreamInternal(KeyRange range, if (!atLatest && feedReply.atLatestVersion) { atLatest = true; results->notAtLatest.set(0); + if (results->context) { + results->context->notAtLatestChangeFeeds.erase(results->dbgid); + } } if (feedReply.minStreamVersion > results->storageData[0]->version.get()) { @@ -9198,11 +9522,6 @@ ACTOR Future singleChangeFeedStream(Reference db, results->streams.clear(); - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } results->streams.push_back(interf.changeFeedStream.getReplyStream(req)); results->maxSeenVersion = invalidVersion; @@ -9211,6 +9530,10 @@ ACTOR Future singleChangeFeedStream(Reference db, Promise refresh = 
results->refresh; results->refresh = Promise(); results->notAtLatest.set(1); + if (results->context) { + results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr(); + results->created = now(); + } refresh.send(Void()); wait(results->streams[0].onError() || singleChangeFeedStreamInternal(range, results, rangeID, begin, end)); @@ -9228,6 +9551,7 @@ ACTOR Future getChangeFeedStreamActor(Reference db, bool canReadPopped) { state Database cx(db); state Span span("NAPI:GetChangeFeedStream"_loc); + db->usedAnyChangeFeeds = true; results->endVersion = end; @@ -9242,7 +9566,7 @@ ACTOR Future getChangeFeedStreamActor(Reference db, keys = fullRange & range; state std::vector locations = wait(getKeyRangeLocations(cx, - Optional(), + TenantInfo(), keys, CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT, Reverse::False, @@ -9279,6 +9603,10 @@ ACTOR Future getChangeFeedStreamActor(Reference db, if (useIdx >= 0) { chosenLocations[loc] = useIdx; loc++; + if (g_network->isSimulated() && !g_simulator->speedUpSimulation && BUGGIFY_WITH_PROB(0.01)) { + // simulate as if we had to wait for all alternatives delayed, before the next one + wait(delay(deterministicRandom()->random01())); + } continue; } @@ -9299,19 +9627,22 @@ ACTOR Future getChangeFeedStreamActor(Reference db, loc = 0; } + ++db->feedStreamStarts; + if (locations.size() > 1) { + ++db->feedMergeStreamStarts; std::vector> interfs; for (int i = 0; i < locations.size(); i++) { interfs.emplace_back(locations[i].locations->getInterface(chosenLocations[i]), locations[i].range & range); } - TEST(true); // Change feed merge cursor + CODE_PROBE(true, "Change feed merge cursor"); // TODO (jslocum): validate connectionFileChanged behavior wait( mergeChangeFeedStream(db, interfs, results, rangeID, &begin, end, replyBufferSize, canReadPopped) || cx->connectionFileChanged()); } else { - TEST(true); // Change feed single cursor + CODE_PROBE(true, "Change feed single cursor"); StorageServerInterface interf = locations[0].locations->getInterface(chosenLocations[0]); wait(singleChangeFeedStream( db, interf, range, results, rangeID, &begin, end, replyBufferSize, canReadPopped) || @@ -9319,15 +9650,11 @@ ACTOR Future getChangeFeedStreamActor(Reference db, } } catch (Error& e) { if (e.code() == error_code_actor_cancelled || e.code() == error_code_change_feed_popped) { - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } results->streams.clear(); results->storageData.clear(); if (e.code() == error_code_change_feed_popped) { - TEST(true); // getChangeFeedStreamActor got popped + ++db->feedNonRetriableErrors; + CODE_PROBE(true, "getChangeFeedStreamActor got popped"); results->mutations.sendError(e); results->refresh.sendError(e); } else { @@ -9337,29 +9664,40 @@ ACTOR Future getChangeFeedStreamActor(Reference db, } if (results->notAtLatest.get() == 0) { results->notAtLatest.set(1); + if (results->context) { + results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr(); + results->created = now(); + } } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_connection_failed || e.code() == error_code_unknown_change_feed || - e.code() == error_code_broken_promise) { + e.code() == error_code_broken_promise || e.code() == error_code_future_version || + e.code() == error_code_request_maybe_delivered || + e.code() == error_code_storage_too_many_feed_streams) { + ++db->feedErrors; 
db->changeFeedCache.erase(rangeID); cx->invalidateCache(Key(), keys); - if (begin == lastBeginVersion) { + if (begin == lastBeginVersion || e.code() == error_code_storage_too_many_feed_streams) { // We didn't read anything since the last failure before failing again. - // Do exponential backoff, up to 1 second - sleepWithBackoff = std::min(1.0, sleepWithBackoff * 1.5); + // Back off quickly and exponentially, up to 2 seconds + sleepWithBackoff = std::min(2.0, sleepWithBackoff * 5); + sleepWithBackoff = std::max(0.1, sleepWithBackoff); } else { sleepWithBackoff = CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY; } + TraceEvent("ChangeFeedClientError") + .errorUnsuppressed(e) + .suppressFor(30.0) + .detail("AnyProgress", begin != lastBeginVersion); wait(delay(sleepWithBackoff)); } else { + if (e.code() != error_code_end_of_stream) { + ++db->feedNonRetriableErrors; + TraceEvent("ChangeFeedClientErrorNonRetryable").errorUnsuppressed(e).suppressFor(5.0); + } results->mutations.sendError(e); results->refresh.sendError(change_feed_cancelled()); - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } results->streams.clear(); results->storageData.clear(); return Void(); @@ -9379,11 +9717,20 @@ Future DatabaseContext::getChangeFeedStream(Reference resu Reference::addRef(this), results, rangeID, begin, end, range, replyBufferSize, canReadPopped); } -ACTOR Future> singleLocationOverlappingChangeFeeds( - Database cx, - Reference location, - KeyRangeRef range, - Version minVersion) { +Version OverlappingChangeFeedsInfo::getFeedMetadataVersion(const KeyRangeRef& range) const { + Version v = invalidVersion; + for (auto& it : feedMetadataVersions) { + if (it.second > v && it.first.intersects(range)) { + v = it.second; + } + } + return v; +} + +ACTOR Future singleLocationOverlappingChangeFeeds(Database cx, + Reference location, + KeyRangeRef range, + Version minVersion) { state OverlappingChangeFeedsRequest req; req.range = range; req.minVersion = minVersion; @@ -9395,16 +9742,16 @@ ACTOR Future> singleLocationOverlappingC TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, cx->enableLocalityLoadBalance ?
&cx->queueModel : nullptr)); - return rep.rangeIds; + return rep; } bool compareChangeFeedResult(const OverlappingChangeFeedEntry& i, const OverlappingChangeFeedEntry& j) { - return i.rangeId < j.rangeId; + return i.feedId < j.feedId; } -ACTOR Future> getOverlappingChangeFeedsActor(Reference db, - KeyRangeRef range, - Version minVersion) { +ACTOR Future getOverlappingChangeFeedsActor(Reference db, + KeyRangeRef range, + Version minVersion) { state Database cx(db); state Span span("NAPI:GetOverlappingChangeFeeds"_loc); @@ -9412,7 +9759,7 @@ ACTOR Future> getOverlappingChangeFeedsA try { state std::vector locations = wait(getKeyRangeLocations(cx, - Optional(), + TenantInfo(), range, CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT, Reverse::False, @@ -9430,22 +9777,37 @@ ACTOR Future> getOverlappingChangeFeedsA throw all_alternatives_failed(); } - state std::vector>> allOverlappingRequests; + state std::vector> allOverlappingRequests; for (auto& it : locations) { allOverlappingRequests.push_back( singleLocationOverlappingChangeFeeds(cx, it.locations, it.range & range, minVersion)); } wait(waitForAll(allOverlappingRequests)); - std::vector result; - for (auto& it : allOverlappingRequests) { - result.insert(result.end(), it.get().begin(), it.get().end()); + OverlappingChangeFeedsInfo result; + std::unordered_map latestFeedMetadata; + for (int i = 0; i < locations.size(); i++) { + result.arena.dependsOn(allOverlappingRequests[i].get().arena); + result.arena.dependsOn(locations[i].range.arena()); + result.feedMetadataVersions.push_back( + { locations[i].range, allOverlappingRequests[i].get().feedMetadataVersion }); + for (auto& it : allOverlappingRequests[i].get().feeds) { + auto res = latestFeedMetadata.insert({ it.feedId, it }); + if (!res.second) { + CODE_PROBE(true, "deduping fetched overlapping feed by higher metadata version"); + if (res.first->second.feedMetadataVersion < it.feedMetadataVersion) { + res.first->second = it; + } + } + } + } + for (auto& it : latestFeedMetadata) { + result.feeds.push_back(result.arena, it.second); } - std::sort(result.begin(), result.end(), compareChangeFeedResult); - result.resize(std::unique(result.begin(), result.end()) - result.begin()); return result; } catch (Error& e) { - if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { + if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || + e.code() == error_code_future_version) { cx->invalidateCache(Key(), range); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY)); } else { @@ -9455,16 +9817,18 @@ ACTOR Future> getOverlappingChangeFeedsA } } -Future> DatabaseContext::getOverlappingChangeFeeds(KeyRangeRef range, - Version minVersion) { +Future DatabaseContext::getOverlappingChangeFeeds(KeyRangeRef range, Version minVersion) { return getOverlappingChangeFeedsActor(Reference::addRef(this), range, minVersion); } ACTOR static Future popChangeFeedBackup(Database cx, Key rangeID, Version version) { + ++cx->feedPopsFallback; state Transaction tr(cx); loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix); Optional val = wait(tr.get(rangeIDKey)); if (val.present()) { @@ -9496,12 +9860,14 @@ ACTOR Future popChangeFeedMutationsActor(Reference db, Ke state Database cx(db); state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix); state Span 
span("NAPI:PopChangeFeedMutations"_loc); + db->usedAnyChangeFeeds = true; + ++db->feedPops; state KeyRange keys = wait(getChangeFeedRange(db, cx, rangeID)); state std::vector locations = wait(getKeyRangeLocations(cx, - Optional(), + TenantInfo(), keys, 3, Reverse::False, @@ -9568,6 +9934,7 @@ Reference DatabaseContext::createTransaction() { return makeReference(Database(Reference::addRef(this))); } +// BlobGranule API. ACTOR Future purgeBlobGranulesActor(Reference db, KeyRange range, Version purgeVersion, @@ -9579,8 +9946,21 @@ ACTOR Future purgeBlobGranulesActor(Reference db, state KeyRange purgeRange = range; state bool loadedTenantPrefix = false; - // FIXME: implement force - if (!force) { + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + if (purgeVersion == latestVersion) { + loop { + try { + Version _purgeVersion = wait(tr.getReadVersion()); + purgeVersion = _purgeVersion; + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + tr.reset(); + } + if (purgeVersion <= 0) { + TraceEvent("PurgeInvalidVersion").detail("Range", range).detail("Version", purgeVersion).detail("Force", force); throw unsupported_operation(); } @@ -9588,6 +9968,7 @@ ACTOR Future purgeBlobGranulesActor(Reference db, try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); if (tenant.present() && !loadedTenantPrefix) { TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(&tr, range.begin)); @@ -9595,6 +9976,18 @@ ACTOR Future purgeBlobGranulesActor(Reference db, purgeRange = purgeRange.withPrefix(tenantEntry.prefix); } + // must be aligned to blob range(s) + state Future> beginPresent = tr.get(purgeRange.begin.withPrefix(blobRangeKeys.begin)); + state Future> endPresent = tr.get(purgeRange.end.withPrefix(blobRangeKeys.begin)); + wait(success(beginPresent) && success(endPresent)); + if (!beginPresent.get().present() || !endPresent.get().present()) { + TraceEvent("UnalignedPurge") + .detail("Range", range) + .detail("Version", purgeVersion) + .detail("Force", force); + throw unsupported_operation(); + } + Value purgeValue = blobGranulePurgeValueFor(purgeVersion, range, force); tr.atomicOp( addVersionStampAtEnd(blobGranulePurgeKeys.begin), purgeValue, MutationRef::SetVersionstampedKey); @@ -9664,6 +10057,108 @@ Future DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) { return waitPurgeGranulesCompleteActor(Reference::addRef(this), purgeKey); } +ACTOR Future>> getBlobRanges(Reference tr, + KeyRange range, + int batchLimit) { + state Standalone> blobRanges; + state Key beginKey = range.begin; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state RangeResult results = wait( + krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2)); + + blobRanges.arena().dependsOn(results.arena()); + for (int i = 0; i < results.size() - 1; i++) { + if (results[i].value == blobRangeActive) { + blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key)); + } + if (blobRanges.size() == batchLimit) { + return blobRanges; + } + } + + if (!results.more) { + return blobRanges; + } + beginKey = results.back().key; + } catch (Error& e) { + wait(tr->onError(e)); + } + } +} + +ACTOR Future setBlobRangeActor(Reference cx, KeyRange range, bool active) { + state Database db(cx); + state Reference tr = makeReference(db); + + state Value value = active ? 
blobRangeActive : blobRangeInactive; + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + + Standalone> startBlobRanges = wait(getBlobRanges(tr, range, 1)); + + if (active) { + // Idempotent request. + if (!startBlobRanges.empty()) { + return startBlobRanges.front().begin == range.begin && startBlobRanges.front().end == range.end; + } + } else { + // An unblobbify request must be aligned to boundaries. + // It is okay to unblobbify multiple regions all at once. + if (startBlobRanges.empty()) { + // already unblobbified + return true; + } else if (startBlobRanges.front().begin != range.begin) { + // The first blob range overlaps the requested range but does not start at its begin key, so the request is not aligned + return false; + } + // If the blob range does start at the specified key, we also need to make sure the requested end key is a + // blob range boundary + Optional endPresent = wait(tr->get(range.end.withPrefix(blobRangeKeys.begin))); + if (!endPresent.present()) { + return false; + } + } + + tr->set(blobRangeChangeKey, deterministicRandom()->randomUniqueID().toString()); + // This is not coalescing because we want to keep each range logically separate. + wait(krmSetRange(tr, blobRangeKeys.begin, range, value)); + wait(tr->commit()); + return true; + } catch (Error& e) { + wait(tr->onError(e)); + } + } +} + +Future DatabaseContext::blobbifyRange(KeyRange range) { + return setBlobRangeActor(Reference::addRef(this), range, true); +} + +Future DatabaseContext::unblobbifyRange(KeyRange range) { + return setBlobRangeActor(Reference::addRef(this), range, false); +} + +ACTOR Future>> listBlobbifiedRangesActor(Reference cx, + KeyRange range, + int rangeLimit) { + state Database db(cx); + state Reference tr = makeReference(db); + + state Standalone> blobRanges = wait(getBlobRanges(tr, range, rangeLimit)); + + return blobRanges; +} + +Future>> DatabaseContext::listBlobbifiedRanges(KeyRange range, int rowLimit) { + return listBlobbifiedRangesActor(Reference::addRef(this), range, rowLimit); +} + int64_t getMaxKeySize(KeyRef const& key) { return getMaxWriteKeySize(key, true); } @@ -9673,7 +10168,7 @@ int64_t getMaxReadKeySize(KeyRef const& key) { } int64_t getMaxWriteKeySize(KeyRef const& key, bool hasRawAccess) { - int64_t tenantSize = hasRawAccess ? CLIENT_KNOBS->TENANT_PREFIX_SIZE_LIMIT : 0; + int64_t tenantSize = hasRawAccess ? TenantMapEntry::PREFIX_SIZE : 0; return key.startsWith(systemKeys.begin) ?
CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT + tenantSize; } diff --git a/fdbclient/PaxosConfigTransaction.actor.cpp b/fdbclient/PaxosConfigTransaction.actor.cpp index 4b7c19c05a..b940aced7f 100644 --- a/fdbclient/PaxosConfigTransaction.actor.cpp +++ b/fdbclient/PaxosConfigTransaction.actor.cpp @@ -19,6 +19,7 @@ */ #include "fdbclient/DatabaseContext.h" +#include "fdbclient/MonitorLeader.h" #include "fdbclient/PaxosConfigTransaction.h" #include "flow/actorcompiler.h" // must be last include @@ -34,8 +35,9 @@ class CommitQuorum { Standalone> mutations; ConfigCommitAnnotation annotation; - ConfigTransactionCommitRequest getCommitRequest(ConfigGeneration generation) const { - return ConfigTransactionCommitRequest(generation, mutations, annotation); + ConfigTransactionCommitRequest getCommitRequest(ConfigGeneration generation, + CoordinatorsHash coordinatorsHash) const { + return ConfigTransactionCommitRequest(coordinatorsHash, generation, mutations, annotation); } void updateResult() { @@ -62,14 +64,16 @@ class CommitQuorum { ACTOR static Future addRequestActor(CommitQuorum* self, ConfigGeneration generation, + CoordinatorsHash coordinatorsHash, ConfigTransactionInterface cti) { try { if (cti.hostname.present()) { - wait(timeoutError(retryGetReplyFromHostname( - self->getCommitRequest(generation), cti.hostname.get(), WLTOKEN_CONFIGTXN_COMMIT), + wait(timeoutError(retryGetReplyFromHostname(self->getCommitRequest(generation, coordinatorsHash), + cti.hostname.get(), + WLTOKEN_CONFIGTXN_COMMIT), CLIENT_KNOBS->COMMIT_QUORUM_TIMEOUT)); } else { - wait(timeoutError(cti.commit.getReply(self->getCommitRequest(generation)), + wait(timeoutError(cti.commit.getReply(self->getCommitRequest(generation, coordinatorsHash)), CLIENT_KNOBS->COMMIT_QUORUM_TIMEOUT)); } ++self->successful; @@ -109,11 +113,11 @@ public: } void setTimestamp() { annotation.timestamp = now(); } size_t expectedSize() const { return annotation.expectedSize() + mutations.expectedSize(); } - Future commit(ConfigGeneration generation) { + Future commit(ConfigGeneration generation, CoordinatorsHash coordinatorsHash) { // Send commit message to all replicas, even those that did not return the used replica. // This way, slow replicas are kept up to date.
for (const auto& cti : ctis) { - actors.add(addRequestActor(this, generation, cti)); + actors.add(addRequestActor(this, generation, coordinatorsHash, cti)); } return result.getFuture(); } @@ -122,11 +126,13 @@ public: class GetGenerationQuorum { ActorCollection actors{ false }; + CoordinatorsHash coordinatorsHash{ 0 }; std::vector ctis; std::map> seenGenerations; Promise result; size_t totalRepliesReceived{ 0 }; size_t maxAgreement{ 0 }; + Future coordinatorsChangedFuture; Optional lastSeenLiveVersion; Future getGenerationFuture; @@ -137,14 +143,15 @@ class GetGenerationQuorum { if (cti.hostname.present()) { wait(timeoutError(store(reply, retryGetReplyFromHostname( - ConfigTransactionGetGenerationRequest{ self->lastSeenLiveVersion }, + ConfigTransactionGetGenerationRequest{ self->coordinatorsHash, + self->lastSeenLiveVersion }, cti.hostname.get(), WLTOKEN_CONFIGTXN_GETGENERATION)), CLIENT_KNOBS->GET_GENERATION_QUORUM_TIMEOUT)); } else { wait(timeoutError(store(reply, - cti.getGeneration.getReply( - ConfigTransactionGetGenerationRequest{ self->lastSeenLiveVersion })), + cti.getGeneration.getReply(ConfigTransactionGetGenerationRequest{ + self->coordinatorsHash, self->lastSeenLiveVersion })), CLIENT_KNOBS->GET_GENERATION_QUORUM_TIMEOUT)); } @@ -155,6 +162,14 @@ class GetGenerationQuorum { auto& replicas = self->seenGenerations[gen]; replicas.push_back(cti); self->maxAgreement = std::max(replicas.size(), self->maxAgreement); + // TraceEvent("ConfigTransactionGotGenerationReply") + // .detail("From", cti.getGeneration.getEndpoint().getPrimaryAddress()) + // .detail("TotalRepliesReceived", self->totalRepliesReceived) + // .detail("ReplyGeneration", gen.toString()) + // .detail("Replicas", replicas.size()) + // .detail("Coordinators", self->ctis.size()) + // .detail("MaxAgreement", self->maxAgreement) + // .detail("LastSeenLiveVersion", self->lastSeenLiveVersion); if (replicas.size() >= self->ctis.size() / 2 + 1 && !self->result.isSet()) { self->result.send(gen); } else if (self->maxAgreement + (self->ctis.size() - self->totalRepliesReceived) < @@ -199,9 +214,19 @@ class GetGenerationQuorum { } } catch (Error& e) { if (e.code() == error_code_failed_to_reach_quorum) { - TEST(true); // Failed to reach quorum getting generation - wait(delayJittered( - std::clamp(0.005 * (1 << retries), 0.0, CLIENT_KNOBS->TIMEOUT_RETRY_UPPER_BOUND))); + CODE_PROBE(true, "Failed to reach quorum getting generation"); + if (self->coordinatorsChangedFuture.isReady()) { + throw coordinators_changed(); + } + wait(delayJittered(std::clamp( + 0.005 * (1 << std::min(retries, 30)), 0.0, CLIENT_KNOBS->TIMEOUT_RETRY_UPPER_BOUND))); + if (deterministicRandom()->random01() < 0.05) { + // Randomly inject a delay of at least the generation + // reply timeout, to try to prevent contention between + // clients. 
+ wait(delay(CLIENT_KNOBS->GET_GENERATION_QUORUM_TIMEOUT * + (deterministicRandom()->random01() + 1.0))); + } ++retries; self->actors.clear(false); self->seenGenerations.clear(); @@ -217,9 +242,12 @@ class GetGenerationQuorum { public: GetGenerationQuorum() = default; - explicit GetGenerationQuorum(std::vector const& ctis, + explicit GetGenerationQuorum(CoordinatorsHash coordinatorsHash, + std::vector const& ctis, + Future coordinatorsChangedFuture, Optional const& lastSeenLiveVersion = {}) - : ctis(ctis), lastSeenLiveVersion(lastSeenLiveVersion) {} + : coordinatorsHash(coordinatorsHash), ctis(ctis), coordinatorsChangedFuture(coordinatorsChangedFuture), + lastSeenLiveVersion(lastSeenLiveVersion) {} Future getGeneration() { if (!getGenerationFuture.isValid()) { getGenerationFuture = getGenerationActor(this); @@ -240,12 +268,14 @@ public: }; class PaxosConfigTransactionImpl { + CoordinatorsHash coordinatorsHash{ 0 }; std::vector ctis; GetGenerationQuorum getGenerationQuorum; CommitQuorum commitQuorum; int numRetries{ 0 }; Optional dID; Database cx; + Future watchClusterFileFuture; ACTOR static Future> get(PaxosConfigTransactionImpl* self, Key key) { state ConfigKey configKey = ConfigKey::decodeKey(key); @@ -263,18 +293,19 @@ class PaxosConfigTransactionImpl { } wait(waitForAll(fs)); state Reference configNodes(new ConfigTransactionInfo(readReplicas)); - ConfigTransactionGetReply reply = - wait(timeoutError(basicLoadBalance(configNodes, - &ConfigTransactionInterface::get, - ConfigTransactionGetRequest{ generation, configKey }), - CLIENT_KNOBS->GET_KNOB_TIMEOUT)); + ConfigTransactionGetReply reply = wait(timeoutError( + basicLoadBalance(configNodes, + &ConfigTransactionInterface::get, + ConfigTransactionGetRequest{ self->coordinatorsHash, generation, configKey }), + CLIENT_KNOBS->GET_KNOB_TIMEOUT)); if (reply.value.present()) { return reply.value.get().toValue(); } else { return Optional{}; } } catch (Error& e) { - if (e.code() != error_code_timed_out && e.code() != error_code_broken_promise) { + if (e.code() != error_code_timed_out && e.code() != error_code_broken_promise && + e.code() != error_code_coordinators_changed) { throw; } self->reset(); @@ -283,58 +314,87 @@ class PaxosConfigTransactionImpl { } ACTOR static Future getConfigClasses(PaxosConfigTransactionImpl* self) { - state ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); - state std::vector readReplicas = self->getGenerationQuorum.getReadReplicas(); - std::vector> fs; - for (ConfigTransactionInterface& readReplica : readReplicas) { - if (readReplica.hostname.present()) { - fs.push_back(tryInitializeRequestStream( - &readReplica.getClasses, readReplica.hostname.get(), WLTOKEN_CONFIGTXN_GETCLASSES)); + loop { + try { + state ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); + state std::vector readReplicas = + self->getGenerationQuorum.getReadReplicas(); + std::vector> fs; + for (ConfigTransactionInterface& readReplica : readReplicas) { + if (readReplica.hostname.present()) { + fs.push_back(tryInitializeRequestStream( + &readReplica.getClasses, readReplica.hostname.get(), WLTOKEN_CONFIGTXN_GETCLASSES)); + } + } + wait(waitForAll(fs)); + state Reference configNodes(new ConfigTransactionInfo(readReplicas)); + ConfigTransactionGetConfigClassesReply reply = wait( + basicLoadBalance(configNodes, + &ConfigTransactionInterface::getClasses, + ConfigTransactionGetConfigClassesRequest{ self->coordinatorsHash, generation })); + RangeResult result; + result.reserve(result.arena(), 
reply.configClasses.size()); + for (const auto& configClass : reply.configClasses) { + result.push_back_deep(result.arena(), KeyValueRef(configClass, ""_sr)); + } + return result; + } catch (Error& e) { + if (e.code() != error_code_coordinators_changed) { + throw; + } + self->reset(); } } - wait(waitForAll(fs)); - state Reference configNodes(new ConfigTransactionInfo(readReplicas)); - ConfigTransactionGetConfigClassesReply reply = - wait(basicLoadBalance(configNodes, - &ConfigTransactionInterface::getClasses, - ConfigTransactionGetConfigClassesRequest{ generation })); - RangeResult result; - result.reserve(result.arena(), reply.configClasses.size()); - for (const auto& configClass : reply.configClasses) { - result.push_back_deep(result.arena(), KeyValueRef(configClass, ""_sr)); - } - return result; } ACTOR static Future getKnobs(PaxosConfigTransactionImpl* self, Optional configClass) { - state ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); - state std::vector readReplicas = self->getGenerationQuorum.getReadReplicas(); - std::vector> fs; - for (ConfigTransactionInterface& readReplica : readReplicas) { - if (readReplica.hostname.present()) { - fs.push_back(tryInitializeRequestStream( - &readReplica.getKnobs, readReplica.hostname.get(), WLTOKEN_CONFIGTXN_GETKNOBS)); + loop { + try { + state ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); + state std::vector readReplicas = + self->getGenerationQuorum.getReadReplicas(); + std::vector> fs; + for (ConfigTransactionInterface& readReplica : readReplicas) { + if (readReplica.hostname.present()) { + fs.push_back(tryInitializeRequestStream( + &readReplica.getKnobs, readReplica.hostname.get(), WLTOKEN_CONFIGTXN_GETKNOBS)); + } + } + wait(waitForAll(fs)); + state Reference configNodes(new ConfigTransactionInfo(readReplicas)); + ConfigTransactionGetKnobsReply reply = wait(basicLoadBalance( + configNodes, + &ConfigTransactionInterface::getKnobs, + ConfigTransactionGetKnobsRequest{ self->coordinatorsHash, generation, configClass })); + RangeResult result; + result.reserve(result.arena(), reply.knobNames.size()); + for (const auto& knobName : reply.knobNames) { + result.push_back_deep(result.arena(), KeyValueRef(knobName, ""_sr)); + } + return result; + } catch (Error& e) { + if (e.code() != error_code_coordinators_changed) { + throw; + } + self->reset(); } } - wait(waitForAll(fs)); - state Reference configNodes(new ConfigTransactionInfo(readReplicas)); - ConfigTransactionGetKnobsReply reply = - wait(basicLoadBalance(configNodes, - &ConfigTransactionInterface::getKnobs, - ConfigTransactionGetKnobsRequest{ generation, configClass })); - RangeResult result; - result.reserve(result.arena(), reply.knobNames.size()); - for (const auto& knobName : reply.knobNames) { - result.push_back_deep(result.arena(), KeyValueRef(knobName, ""_sr)); - } - return result; } ACTOR static Future commit(PaxosConfigTransactionImpl* self) { - ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); - self->commitQuorum.setTimestamp(); - wait(self->commitQuorum.commit(generation)); - return Void(); + loop { + try { + ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); + self->commitQuorum.setTimestamp(); + wait(self->commitQuorum.commit(generation, self->coordinatorsHash)); + return Void(); + } catch (Error& e) { + if (e.code() != error_code_coordinators_changed) { + throw; + } + self->reset(); + } + } } ACTOR static Future onError(PaxosConfigTransactionImpl* self, Error e) 
{ @@ -350,6 +410,20 @@ class PaxosConfigTransactionImpl { throw e; } + // Returns when the cluster interface updates with a new connection string. + ACTOR static Future watchClusterFile(Database cx) { + state Future leaderMonitor = + monitorLeader(cx->getConnectionRecord(), cx->statusClusterInterface); + state std::string connectionString = cx->getConnectionRecord()->getConnectionString().toString(); + + loop { + wait(cx->statusClusterInterface->onChange()); + if (cx->getConnectionRecord()->getConnectionString().toString() != connectionString) { + return Void(); + } + } + } + public: Future getReadVersion() { return map(getGenerationQuorum.getGeneration(), [](auto const& gen) { return gen.committedVersion; }); @@ -395,7 +469,25 @@ public: void debugTransaction(UID dID) { this->dID = dID; } void reset() { - getGenerationQuorum = GetGenerationQuorum{ ctis }; + ctis.clear(); + // Re-read connection string. If the cluster file changed, this will + // return the updated value. + const ClusterConnectionString& cs = cx->getConnectionRecord()->getConnectionString(); + ctis.reserve(cs.hostnames.size() + cs.coords.size()); + for (const auto& h : cs.hostnames) { + ctis.emplace_back(h); + } + for (const auto& c : cs.coords) { + ctis.emplace_back(c); + } + coordinatorsHash = std::hash()(cx->getConnectionRecord()->getConnectionString().toString()); + if (!cx->statusLeaderMon.isValid() || cx->statusLeaderMon.isReady()) { + cx->statusClusterInterface = makeReference>>(); + cx->statusLeaderMon = watchClusterFile(cx); + } + getGenerationQuorum = GetGenerationQuorum{ + coordinatorsHash, ctis, cx->statusLeaderMon, getGenerationQuorum.getLastSeenLiveVersion() + }; commitQuorum = CommitQuorum{ ctis }; } @@ -416,21 +508,10 @@ public: Future commit() { return commit(this); } - PaxosConfigTransactionImpl(Database const& cx) : cx(cx) { - const ClusterConnectionString& cs = cx->getConnectionRecord()->getConnectionString(); - ctis.reserve(cs.hostnames.size() + cs.coords.size()); - for (const auto& h : cs.hostnames) { - ctis.emplace_back(h); - } - for (const auto& c : cs.coords) { - ctis.emplace_back(c); - } - getGenerationQuorum = GetGenerationQuorum{ ctis }; - commitQuorum = CommitQuorum{ ctis }; - } + PaxosConfigTransactionImpl(Database const& cx) : cx(cx) { reset(); } PaxosConfigTransactionImpl(std::vector const& ctis) - : ctis(ctis), getGenerationQuorum(ctis), commitQuorum(ctis) {} + : ctis(ctis), getGenerationQuorum(0, ctis, Future()), commitQuorum(ctis) {} }; Future PaxosConfigTransaction::getReadVersion() { diff --git a/fdbclient/RESTClient.actor.cpp b/fdbclient/RESTClient.actor.cpp index 015ed1a8b0..e0cb416dec 100644 --- a/fdbclient/RESTClient.actor.cpp +++ b/fdbclient/RESTClient.actor.cpp @@ -20,7 +20,7 @@ #include "fdbclient/RESTClient.h" -#include "fdbclient/HTTP.h" +#include "fdbrpc/HTTP.h" #include "flow/IRateControl.h" #include "fdbclient/RESTUtils.h" #include "flow/Arena.h" diff --git a/fdbclient/RYWIterator.cpp b/fdbclient/RYWIterator.cpp index 949f164485..3966df3748 100644 --- a/fdbclient/RYWIterator.cpp +++ b/fdbclient/RYWIterator.cpp @@ -231,28 +231,28 @@ void testSnapshotCache() { WriteMap writes(&arena); Standalone> keys; - keys.push_back_deep(keys.arena(), KeyValueRef(LiteralStringRef("d"), LiteralStringRef("doo"))); - keys.push_back_deep(keys.arena(), KeyValueRef(LiteralStringRef("e"), LiteralStringRef("eoo"))); - keys.push_back_deep(keys.arena(), KeyValueRef(LiteralStringRef("e\x00"), LiteralStringRef("zoo"))); - keys.push_back_deep(keys.arena(), KeyValueRef(LiteralStringRef("f"), 
LiteralStringRef("foo"))); - cache.insert(KeyRangeRef(LiteralStringRef("d"), LiteralStringRef("f\x00")), keys); + keys.push_back_deep(keys.arena(), KeyValueRef("d"_sr, "doo"_sr)); + keys.push_back_deep(keys.arena(), KeyValueRef("e"_sr, "eoo"_sr)); + keys.push_back_deep(keys.arena(), KeyValueRef("e\x00"_sr, "zoo"_sr)); + keys.push_back_deep(keys.arena(), KeyValueRef("f"_sr, "foo"_sr)); + cache.insert(KeyRangeRef("d"_sr, "f\x00"_sr), keys); - cache.insert(KeyRangeRef(LiteralStringRef("g"), LiteralStringRef("h")), Standalone>()); + cache.insert(KeyRangeRef("g"_sr, "h"_sr), Standalone>()); Standalone> keys2; - keys2.push_back_deep(keys2.arena(), KeyValueRef(LiteralStringRef("k"), LiteralStringRef("koo"))); - keys2.push_back_deep(keys2.arena(), KeyValueRef(LiteralStringRef("l"), LiteralStringRef("loo"))); - cache.insert(KeyRangeRef(LiteralStringRef("j"), LiteralStringRef("m")), keys2); + keys2.push_back_deep(keys2.arena(), KeyValueRef("k"_sr, "koo"_sr)); + keys2.push_back_deep(keys2.arena(), KeyValueRef("l"_sr, "loo"_sr)); + cache.insert(KeyRangeRef("j"_sr, "m"_sr), keys2); - writes.mutate(LiteralStringRef("c"), MutationRef::SetValue, LiteralStringRef("c--"), true); - writes.clear(KeyRangeRef(LiteralStringRef("c\x00"), LiteralStringRef("e")), true); - writes.mutate(LiteralStringRef("c\x00"), MutationRef::SetValue, LiteralStringRef("c00--"), true); + writes.mutate("c"_sr, MutationRef::SetValue, "c--"_sr, true); + writes.clear(KeyRangeRef("c\x00"_sr, "e"_sr), true); + writes.mutate("c\x00"_sr, MutationRef::SetValue, "c00--"_sr, true); WriteMap::iterator it3(&writes); - writes.mutate(LiteralStringRef("d"), MutationRef::SetValue, LiteralStringRef("d--"), true); - writes.mutate(LiteralStringRef("e"), MutationRef::SetValue, LiteralStringRef("e++"), true); - writes.mutate(LiteralStringRef("i"), MutationRef::SetValue, LiteralStringRef("i--"), true); + writes.mutate("d"_sr, MutationRef::SetValue, "d--"_sr, true); + writes.mutate("e"_sr, MutationRef::SetValue, "e++"_sr, true); + writes.mutate("i"_sr, MutationRef::SetValue, "i--"_sr, true); - KeyRange searchKeys = KeyRangeRef(LiteralStringRef("a"), LiteralStringRef("z")); + KeyRange searchKeys = KeyRangeRef("a"_sr, "z"_sr); RYWIterator it(&cache, &writes); it.skip(searchKeys.begin); @@ -425,7 +425,7 @@ TEST_CASE("/fdbclient/WriteMap/emptiness") { Arena arena = Arena(); WriteMap writes = WriteMap(&arena); ASSERT(writes.empty()); - writes.mutate(LiteralStringRef("apple"), MutationRef::SetValue, LiteralStringRef("red"), true); + writes.mutate("apple"_sr, MutationRef::SetValue, "red"_sr, true); ASSERT(!writes.empty()); return Void(); } @@ -457,11 +457,11 @@ TEST_CASE("/fdbclient/WriteMap/clear") { ASSERT(writes.empty()); ASSERT(getWriteMapCount(&writes) == 1); - writes.mutate(LiteralStringRef("apple"), MutationRef::SetValue, LiteralStringRef("red"), true); + writes.mutate("apple"_sr, MutationRef::SetValue, "red"_sr, true); ASSERT(!writes.empty()); ASSERT(getWriteMapCount(&writes) == 3); - KeyRangeRef range = KeyRangeRef(LiteralStringRef("a"), LiteralStringRef("j")); + KeyRangeRef range = KeyRangeRef("a"_sr, "j"_sr); writes.clear(range, true); ASSERT(getWriteMapCount(&writes) == 3); @@ -474,22 +474,19 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") { ASSERT(writes.empty()); ASSERT(getWriteMapCount(&writes) == 1); - writes.mutate(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00"), - MutationRef::SetVersionstampedKey, - LiteralStringRef("1"), - true); + writes.mutate("stamp:XXXXXXXX\x06\x00\x00\x00"_sr, MutationRef::SetVersionstampedKey, "1"_sr, true); 
ASSERT(!writes.empty()); ASSERT(getWriteMapCount(&writes) == 3); - writes.mutate(LiteralStringRef("stamp:ZZZZZZZZZZ"), MutationRef::AddValue, LiteralStringRef("2"), true); + writes.mutate("stamp:ZZZZZZZZZZ"_sr, MutationRef::AddValue, "2"_sr, true); ASSERT(getWriteMapCount(&writes) == 5); WriteMap::iterator it(&writes); it.skip(allKeys.begin); ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00")) == 0); + ASSERT(it.beginKey().compare(""_sr) == 0); + ASSERT(it.endKey().compare("stamp:XXXXXXXX\x06\x00\x00\x00"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(!it.is_conflict_range()); ASSERT(!it.is_operation()); @@ -498,8 +495,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00\x00")) == 0); + ASSERT(it.beginKey().compare("stamp:XXXXXXXX\x06\x00\x00\x00"_sr) == 0); + ASSERT(it.endKey().compare("stamp:XXXXXXXX\x06\x00\x00\x00\x00"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(it.is_conflict_range()); ASSERT(it.is_operation()); @@ -509,8 +506,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp:XXXXXXXX\x06\x00\x00\x00\x00")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ")) == 0); + ASSERT(it.beginKey().compare("stamp:XXXXXXXX\x06\x00\x00\x00\x00"_sr) == 0); + ASSERT(it.endKey().compare("stamp:ZZZZZZZZZZ"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(!it.is_conflict_range()); ASSERT(!it.is_operation()); @@ -519,8 +516,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ\x00")) == 0); + ASSERT(it.beginKey().compare("stamp:ZZZZZZZZZZ"_sr) == 0); + ASSERT(it.endKey().compare("stamp:ZZZZZZZZZZ\x00"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(it.is_conflict_range()); ASSERT(it.is_operation()); @@ -530,8 +527,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedKey") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp:ZZZZZZZZZZ\x00")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("\xff\xff")) == 0); + ASSERT(it.beginKey().compare("stamp:ZZZZZZZZZZ\x00"_sr) == 0); + ASSERT(it.endKey().compare("\xff\xff"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(!it.is_conflict_range()); ASSERT(!it.is_operation()); @@ -550,22 +547,19 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") { ASSERT(writes.empty()); ASSERT(getWriteMapCount(&writes) == 1); - writes.mutate(LiteralStringRef("stamp"), - MutationRef::SetVersionstampedValue, - LiteralStringRef("XXXXXXXX\x00\x00\x00\x00\x00\x00"), - true); + writes.mutate("stamp"_sr, MutationRef::SetVersionstampedValue, "XXXXXXXX\x00\x00\x00\x00\x00\x00"_sr, true); ASSERT(!writes.empty()); ASSERT(getWriteMapCount(&writes) == 3); - writes.mutate(LiteralStringRef("stamp123"), MutationRef::AddValue, LiteralStringRef("1"), true); + writes.mutate("stamp123"_sr, MutationRef::AddValue, "1"_sr, true); ASSERT(getWriteMapCount(&writes) == 5); WriteMap::iterator it(&writes); it.skip(allKeys.begin); ASSERT(it.beginKey() < allKeys.end); - 
ASSERT(it.beginKey().compare(LiteralStringRef("")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp")) == 0); + ASSERT(it.beginKey().compare(""_sr) == 0); + ASSERT(it.endKey().compare("stamp"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(!it.is_conflict_range()); ASSERT(!it.is_operation()); @@ -574,8 +568,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp\x00")) == 0); + ASSERT(it.beginKey().compare("stamp"_sr) == 0); + ASSERT(it.endKey().compare("stamp\x00"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(it.is_conflict_range()); ASSERT(it.is_operation()); @@ -585,8 +579,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp\x00")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp123")) == 0); + ASSERT(it.beginKey().compare("stamp\x00"_sr) == 0); + ASSERT(it.endKey().compare("stamp123"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(!it.is_conflict_range()); ASSERT(!it.is_operation()); @@ -595,8 +589,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp123")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("stamp123\x00")) == 0); + ASSERT(it.beginKey().compare("stamp123"_sr) == 0); + ASSERT(it.endKey().compare("stamp123\x00"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(it.is_conflict_range()); ASSERT(it.is_operation()); @@ -606,8 +600,8 @@ TEST_CASE("/fdbclient/WriteMap/setVersionstampedValue") { ++it; ASSERT(it.beginKey() < allKeys.end); - ASSERT(it.beginKey().compare(LiteralStringRef("stamp123\x00")) == 0); - ASSERT(it.endKey().compare(LiteralStringRef("\xff\xff")) == 0); + ASSERT(it.beginKey().compare("stamp123\x00"_sr) == 0); + ASSERT(it.endKey().compare("\xff\xff"_sr) == 0); ASSERT(!it.is_cleared_range()); ASSERT(!it.is_conflict_range()); ASSERT(!it.is_operation()); @@ -626,10 +620,10 @@ TEST_CASE("/fdbclient/WriteMap/addValue") { ASSERT(writes.empty()); ASSERT(getWriteMapCount(&writes) == 1); - writes.mutate(LiteralStringRef("apple123"), MutationRef::SetValue, LiteralStringRef("17"), true); + writes.mutate("apple123"_sr, MutationRef::SetValue, "17"_sr, true); ASSERT(getWriteMapCount(&writes) == 3); - writes.mutate(LiteralStringRef("apple123"), MutationRef::AddValue, LiteralStringRef("1"), true); + writes.mutate("apple123"_sr, MutationRef::AddValue, "1"_sr, true); ASSERT(getWriteMapCount(&writes) == 3); return Void(); diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index a64a65e58a..c3016c87e3 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -459,7 +459,7 @@ public: if (!it.is_unreadable() && !it.is_unknown_range() && key.offset > 1) { *readThroughEnd = true; - key.setKey(maxKey); // maxKey is a KeyRef, but points to a LiteralStringRef. TODO: how can we ASSERT this? + key.setKey(maxKey); // maxKey is a KeyRef, but points to a literal. TODO: how can we ASSERT this? 
key.offset = 1; return; } @@ -681,7 +681,8 @@ public: break; if (it.is_unknown_range()) { - if (limits.hasByteLimit() && result.size() && itemsPastEnd >= 1 - end.offset) { + if (limits.hasByteLimit() && limits.hasSatisfiedMinRows() && result.size() && + itemsPastEnd >= 1 - end.offset) { result.more = true; break; } @@ -1213,7 +1214,7 @@ public: // isolation support. But it is not default and is rarely used. So we disallow it until we have thorough test // coverage for it.) if (snapshot) { - TEST(true); // getMappedRange not supported for snapshot. + CODE_PROBE(true, "getMappedRange not supported for snapshot."); throw unsupported_operation(); } // For now, getMappedRange requires read-your-writes being NOT disabled. But the support of RYW is limited @@ -1222,7 +1223,7 @@ public: // which returns the written value transparently. In another word, it makes sure not break RYW semantics without // actually implementing reading from the writes. if (ryw->options.readYourWritesDisabled) { - TEST(true); // getMappedRange not supported for read-your-writes disabled. + CODE_PROBE(true, "getMappedRange not supported for read-your-writes disabled."); throw unsupported_operation(); } @@ -1242,7 +1243,7 @@ public: ++it; ASSERT(itCopy->value.size()); - TEST(itCopy->value.size() > 1); // Multiple watches on the same key triggered by RYOW + CODE_PROBE(itCopy->value.size() > 1, "Multiple watches on the same key triggered by RYOW"); for (int i = 0; i < itCopy->value.size(); i++) { if (itCopy->value[i]->onChangeTrigger.isSet()) { @@ -1535,15 +1536,15 @@ ACTOR Future getWorkerInterfaces(Reference> ReadYourWritesTransaction::get(const Key& key, Snapshot snapshot) { - TEST(true); // ReadYourWritesTransaction::get + CODE_PROBE(true, "ReadYourWritesTransaction::get"); if (getDatabase()->apiVersionAtLeast(630)) { if (specialKeys.contains(key)) { - TEST(true); // Special keys get + CODE_PROBE(true, "Special keys get"); return getDatabase()->specialKeySpace->get(this, key); } } else { - if (key == LiteralStringRef("\xff\xff/status/json")) { + if (key == "\xff\xff/status/json"_sr) { if (tr.getDatabase().getPtr() && tr.getDatabase()->getConnectionRecord()) { ++tr.getDatabase()->transactionStatusRequests; return getJSON(tr.getDatabase()); @@ -1552,7 +1553,7 @@ Future> ReadYourWritesTransaction::get(const Key& key, Snapshot } } - if (key == LiteralStringRef("\xff\xff/cluster_file_path")) { + if (key == "\xff\xff/cluster_file_path"_sr) { try { if (tr.getDatabase().getPtr() && tr.getDatabase()->getConnectionRecord()) { Optional output = StringRef(tr.getDatabase()->getConnectionRecord()->getLocation()); @@ -1564,7 +1565,7 @@ Future> ReadYourWritesTransaction::get(const Key& key, Snapshot return Optional(); } - if (key == LiteralStringRef("\xff\xff/connection_string")) { + if (key == "\xff\xff/connection_string"_sr) { try { if (tr.getDatabase().getPtr() && tr.getDatabase()->getConnectionRecord()) { Reference f = tr.getDatabase()->getConnectionRecord(); @@ -1622,11 +1623,11 @@ Future ReadYourWritesTransaction::getRange(KeySelector begin, if (getDatabase()->apiVersionAtLeast(630)) { if (specialKeys.contains(begin.getKey()) && specialKeys.begin <= end.getKey() && end.getKey() <= specialKeys.end) { - TEST(true); // Special key space get range + CODE_PROBE(true, "Special key space get range"); return getDatabase()->specialKeySpace->getRange(this, begin, end, limits, reverse); } } else { - if (begin.getKey() == LiteralStringRef("\xff\xff/worker_interfaces")) { + if (begin.getKey() == "\xff\xff/worker_interfaces"_sr) { if 
(tr.getDatabase().getPtr() && tr.getDatabase()->getConnectionRecord()) { return getWorkerInterfaces(tr.getDatabase()->getConnectionRecord()); } else { @@ -1648,7 +1649,7 @@ Future ReadYourWritesTransaction::getRange(KeySelector begin, // This optimization prevents nullptr operations from being added to the conflict range if (limits.isReached()) { - TEST(true); // RYW range read limit 0 + CODE_PROBE(true, "RYW range read limit 0"); return RangeResult(); } @@ -1662,7 +1663,7 @@ Future ReadYourWritesTransaction::getRange(KeySelector begin, end.removeOrEqual(end.arena()); if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) { - TEST(true); // RYW range inverted + CODE_PROBE(true, "RYW range inverted"); return RangeResult(); } @@ -1692,11 +1693,11 @@ Future ReadYourWritesTransaction::getMappedRange(KeySelector if (getDatabase()->apiVersionAtLeast(630)) { if (specialKeys.contains(begin.getKey()) && specialKeys.begin <= end.getKey() && end.getKey() <= specialKeys.end) { - TEST(true); // Special key space get range (getMappedRange) + CODE_PROBE(true, "Special key space get range (getMappedRange)"); throw client_invalid_operation(); // Not support special keys. } } else { - if (begin.getKey() == LiteralStringRef("\xff\xff/worker_interfaces")) { + if (begin.getKey() == "\xff\xff/worker_interfaces"_sr) { throw client_invalid_operation(); // Not support special keys. } } @@ -1714,7 +1715,7 @@ Future ReadYourWritesTransaction::getMappedRange(KeySelector // This optimization prevents nullptr operations from being added to the conflict range if (limits.isReached()) { - TEST(true); // RYW range read limit 0 (getMappedRange) + CODE_PROBE(true, "RYW range read limit 0 (getMappedRange)"); return MappedRangeResult(); } @@ -1728,7 +1729,7 @@ Future ReadYourWritesTransaction::getMappedRange(KeySelector end.removeOrEqual(end.arena()); if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) { - TEST(true); // RYW range inverted (getMappedRange) + CODE_PROBE(true, "RYW range inverted (getMappedRange)"); return MappedRangeResult(); } @@ -1783,7 +1784,8 @@ Future>> ReadYourWritesTransaction::getRangeSplitPo return waitOrError(tr.getRangeSplitPoints(range, chunkSize), resetPromise.getFuture()); } -Future>> ReadYourWritesTransaction::getBlobGranuleRanges(const KeyRange& range) { +Future>> ReadYourWritesTransaction::getBlobGranuleRanges(const KeyRange& range, + int rangeLimit) { if (checkUsedDuringCommit()) { return used_during_commit(); } @@ -1794,7 +1796,7 @@ Future>> ReadYourWritesTransaction::getBlobGra if (range.begin > maxKey || range.end > maxKey) return key_outside_legal_range(); - return waitOrError(tr.getBlobGranuleRanges(range), resetPromise.getFuture()); + return waitOrError(tr.getBlobGranuleRanges(range, rangeLimit), resetPromise.getFuture()); } Future>> ReadYourWritesTransaction::readBlobGranules( @@ -1821,6 +1823,32 @@ Future>> ReadYourWritesTransaction::re return waitOrError(tr.readBlobGranules(range, begin, readVersion, readVersionOut), resetPromise.getFuture()); } +Future>> ReadYourWritesTransaction::summarizeBlobGranules( + const KeyRange& range, + Optional summaryVersion, + int rangeLimit) { + + if (checkUsedDuringCommit()) { + return used_during_commit(); + } + + if (resetPromise.isSet()) + return resetPromise.getFuture().getError(); + + KeyRef maxKey = getMaxReadKey(); + if (range.begin > maxKey || range.end > maxKey) + return key_outside_legal_range(); + + return waitOrError(tr.summarizeBlobGranules(range, summaryVersion, rangeLimit), resetPromise.getFuture()); +} + +void 
ReadYourWritesTransaction::addGranuleMaterializeStats(const GranuleMaterializeStats& stats) { + if (checkUsedDuringCommit()) { + throw used_during_commit(); + } + tr.addGranuleMaterializeStats(stats); +} + void ReadYourWritesTransaction::addReadConflictRange(KeyRangeRef const& keys) { if (checkUsedDuringCommit()) { throw used_during_commit(); @@ -1998,7 +2026,7 @@ void ReadYourWritesTransaction::setToken(uint64_t token) { } RangeResult ReadYourWritesTransaction::getReadConflictRangeIntersecting(KeyRangeRef kr) { - TEST(true); // Special keys read conflict range + CODE_PROBE(true, "Special keys read conflict range"); ASSERT(readConflictRangeKeysRange.contains(kr)); ASSERT(!tr.trState->options.checkWritesEnabled); RangeResult result; @@ -2012,22 +2040,20 @@ RangeResult ReadYourWritesTransaction::getReadConflictRangeIntersecting(KeyRange if (kr.begin <= iter->begin() && iter->begin() < kr.end) { result.push_back(result.arena(), KeyValueRef(iter->begin().withPrefix(readConflictRangeKeysRange.begin, result.arena()), - iter->value() ? LiteralStringRef("1") : LiteralStringRef("0"))); + iter->value() ? "1"_sr : "0"_sr)); } } } else { - CoalescedKeyRefRangeMap readConflicts{ LiteralStringRef("0"), specialKeys.end }; + CoalescedKeyRefRangeMap readConflicts{ "0"_sr, specialKeys.end }; for (const auto& range : tr.readConflictRanges()) - readConflicts.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + readConflicts.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()), "1"_sr); for (const auto& range : nativeReadRanges) - readConflicts.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + readConflicts.insert(range.withPrefix(readConflictRangeKeysRange.begin, result.arena()), "1"_sr); for (const auto& f : tr.getExtraReadConflictRanges()) { if (f.isReady() && f.get().first < f.get().second) readConflicts.insert(KeyRangeRef(f.get().first, f.get().second) .withPrefix(readConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + "1"_sr); } auto beginIter = readConflicts.rangeContaining(kr.begin); if (beginIter->begin() != kr.begin) @@ -2040,12 +2066,12 @@ RangeResult ReadYourWritesTransaction::getReadConflictRangeIntersecting(KeyRange } RangeResult ReadYourWritesTransaction::getWriteConflictRangeIntersecting(KeyRangeRef kr) { - TEST(true); // Special keys write conflict range + CODE_PROBE(true, "Special keys write conflict range"); ASSERT(writeConflictRangeKeysRange.contains(kr)); RangeResult result; // Memory owned by result - CoalescedKeyRefRangeMap writeConflicts{ LiteralStringRef("0"), specialKeys.end }; + CoalescedKeyRefRangeMap writeConflicts{ "0"_sr, specialKeys.end }; if (!options.readYourWritesDisabled) { KeyRangeRef strippedWriteRangePrefix = kr.removePrefix(writeConflictRangeKeysRange.begin); @@ -2058,15 +2084,13 @@ RangeResult ReadYourWritesTransaction::getWriteConflictRangeIntersecting(KeyRang writeConflicts.insert( KeyRangeRef(it.beginKey().toArena(result.arena()), it.endKey().toArena(result.arena())) .withPrefix(writeConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + "1"_sr); } } else { for (const auto& range : tr.writeConflictRanges()) - writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()), "1"_sr); for (const auto& range : nativeWriteRanges) - 
writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()), "1"_sr); } for (const auto& k : versionStampKeys) { @@ -2086,8 +2110,7 @@ RangeResult ReadYourWritesTransaction::getWriteConflictRangeIntersecting(KeyRang } else { range = getVersionstampKeyRange(result.arena(), k, tr.getCachedReadVersion().orDefault(0), getMaxReadKey()); } - writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()), - LiteralStringRef("1")); + writeConflicts.insert(range.withPrefix(writeConflictRangeKeysRange.begin, result.arena()), "1"_sr); } auto beginIter = writeConflicts.rangeContaining(kr.begin); @@ -2133,19 +2156,19 @@ void ReadYourWritesTransaction::atomicOp(const KeyRef& key, const ValueRef& oper KeyRef k; if (!tr.apiVersionAtLeast(520) && operationType == MutationRef::SetVersionstampedKey) { - k = key.withSuffix(LiteralStringRef("\x00\x00"), arena); + k = key.withSuffix("\x00\x00"_sr, arena); } else { k = KeyRef(arena, key); } ValueRef v; if (!tr.apiVersionAtLeast(520) && operationType == MutationRef::SetVersionstampedValue) { - v = operand.withSuffix(LiteralStringRef("\x00\x00\x00\x00"), arena); + v = operand.withSuffix("\x00\x00\x00\x00"_sr, arena); } else { v = ValueRef(arena, operand); } if (operationType == MutationRef::SetVersionstampedKey) { - TEST(options.readYourWritesDisabled); // SetVersionstampedKey without ryw enabled + CODE_PROBE(options.readYourWritesDisabled, "SetVersionstampedKey without ryw enabled"); // this does validation of the key and needs to be performed before the readYourWritesDisabled path KeyRangeRef range = getVersionstampKeyRange(arena, k, tr.getCachedReadVersion().orDefault(0), getMaxReadKey()); versionStampKeys.push_back(arena, k); @@ -2191,17 +2214,17 @@ void ReadYourWritesTransaction::set(const KeyRef& key, const ValueRef& value) { } else { // These three special keys are deprecated in 7.0 and an alternative C API is added // TODO : Rewrite related code using C api - if (key == LiteralStringRef("\xff\xff/reboot_worker")) { + if (key == "\xff\xff/reboot_worker"_sr) { BinaryReader::fromStringRef(value, IncludeVersion()) .reboot.send(RebootRequest()); return; } - if (key == LiteralStringRef("\xff\xff/suspend_worker")) { + if (key == "\xff\xff/suspend_worker"_sr) { BinaryReader::fromStringRef(value, IncludeVersion()) .reboot.send(RebootRequest(false, false, options.timeoutInSeconds)); return; } - if (key == LiteralStringRef("\xff\xff/reboot_and_check_worker")) { + if (key == "\xff\xff/reboot_and_check_worker"_sr) { BinaryReader::fromStringRef(value, IncludeVersion()) .reboot.send(RebootRequest(false, true)); return; diff --git a/fdbclient/S3BlobStore.actor.cpp b/fdbclient/S3BlobStore.actor.cpp index fc45e0a0fd..b32e349631 100644 --- a/fdbclient/S3BlobStore.actor.cpp +++ b/fdbclient/S3BlobStore.actor.cpp @@ -20,8 +20,8 @@ #include "fdbclient/S3BlobStore.h" -#include "fdbclient/md5/md5.h" -#include "fdbclient/libb64/encode.h" +#include "md5/md5.h" +#include "libb64/encode.h" #include "fdbclient/sha1/SHA1.h" #include #include @@ -98,7 +98,7 @@ S3BlobStoreEndpoint::BlobKnobs::BlobKnobs() { bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) { #define TRY_PARAM(n, sn) \ - if (name == LiteralStringRef(#n) || name == LiteralStringRef(#sn)) { \ + if (name == #n || name == #sn) { \ n = value; \ return true; \ } @@ -109,7 +109,7 @@ bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, 
int value) { TRY_PARAM(request_tries, rt); TRY_PARAM(request_timeout_min, rtom); // TODO: For backward compatibility because request_timeout was renamed to request_timeout_min - if (name == LiteralStringRef("request_timeout") || name == LiteralStringRef("rto")) { + if (name == "request_timeout"_sr || name == "rto"_sr) { request_timeout_min = value; return true; } @@ -187,7 +187,7 @@ std::string guessRegionFromDomain(std::string domain) { StringRef h(domain.c_str() + p); - if (!h.startsWith(LiteralStringRef("oss-"))) { + if (!h.startsWith("oss-"_sr)) { h.eat(service); // ignore s3 service } @@ -208,7 +208,7 @@ Reference S3BlobStoreEndpoint::fromString(const std::string try { StringRef t(url); StringRef prefix = t.eat("://"); - if (prefix != LiteralStringRef("blobstore")) + if (prefix != "blobstore"_sr) throw format("Invalid blobstore URL prefix '%s'", prefix.toString().c_str()); Optional proxyHost, proxyPort; @@ -261,7 +261,7 @@ Reference S3BlobStoreEndpoint::fromString(const std::string StringRef value = t.eat("&"); // Special case for header - if (name == LiteralStringRef("header")) { + if (name == "header"_sr) { StringRef originalValue = value; StringRef headerFieldName = value.eat(":"); StringRef headerFieldValue = value; @@ -282,7 +282,7 @@ Reference S3BlobStoreEndpoint::fromString(const std::string } // overwrite s3 region from parameter - if (name == LiteralStringRef("region")) { + if (name == "region"_sr) { region = value.toString(); continue; } @@ -476,7 +476,7 @@ ACTOR Future deleteRecursively_impl(Reference b, state Future done = b->listObjectsStream(bucket, resultStream, prefix, '/', std::numeric_limits::max()); // Wrap done in an actor which will send end_of_stream since listObjectsStream() does not (so that many calls can // write to the same stream) - done = map(done, [=](Void) { + done = map(done, [=](Void) mutable { resultStream.sendError(end_of_stream()); return Void(); }); @@ -737,16 +737,21 @@ ACTOR Future connect_impl(Referenceknobs.secure_connection ? "https" : "http"; } bool isTLS = b->knobs.secure_connection == 1; + state Reference conn; if (b->useProxy) { - // TODO(renxuan): Support http proxy + TLS - if (isTLS || b->service == "443") { - fprintf(stderr, "ERROR: TLS is not supported yet when using HTTP proxy.\n"); - throw connection_failed(); + if (isTLS) { + Reference _conn = + wait(HTTP::proxyConnect(host, service, b->proxyHost.get(), b->proxyPort.get())); + conn = _conn; + } else { + host = b->proxyHost.get(); + service = b->proxyPort.get(); + Reference _conn = wait(INetworkConnections::net()->connect(host, service, false)); + conn = _conn; } - host = b->proxyHost.get(); - service = b->proxyPort.get(); + } else { + wait(store(conn, INetworkConnections::net()->connect(host, service, isTLS))); } - state Reference conn = wait(INetworkConnections::net()->connect(host, service, isTLS)); wait(conn->connectHandshake()); TraceEvent("S3BlobStoreEndpointNewConnection") @@ -894,7 +899,7 @@ ACTOR Future> doRequest_impl(ReferenceuseProxy) { + if (bstore->useProxy && bstore->knobs.secure_connection == 0) { // Has to be in absolute-form. 
canonicalURI = "http://" + bstore->host + ":" + bstore->service + canonicalURI; } @@ -1190,7 +1195,7 @@ ACTOR Future listObjects_impl(ReferencelistObjectsStream(bucket, resultStream, prefix, delimiter, maxDepth, recurseFilter); // Wrap done in an actor which sends end_of_stream because list does not so that many lists can write to the same // stream - done = map(done, [=](Void) { + done = map(done, [=](Void) mutable { resultStream.sendError(end_of_stream()); return Void(); }); @@ -1427,7 +1432,7 @@ void S3BlobStoreEndpoint::setV4AuthHeaders(std::string const& verb, if (headers.find("Content-MD5") != headers.end()) headersList.push_back({ "content-md5", trim_copy(headers["Content-MD5"]) + "\n" }); for (auto h : headers) { - if (StringRef(h.first).startsWith(LiteralStringRef("x-amz"))) + if (StringRef(h.first).startsWith("x-amz"_sr)) headersList.push_back({ to_lower_copy(h.first), trim_copy(h.second) + "\n" }); } std::sort(headersList.begin(), headersList.end()); @@ -1488,7 +1493,7 @@ void S3BlobStoreEndpoint::setAuthHeaders(std::string const& verb, std::string co msg.append("\n"); for (auto h : headers) { StringRef name = h.first; - if (name.startsWith(LiteralStringRef("x-amz")) || name.startsWith(LiteralStringRef("x-icloud"))) { + if (name.startsWith("x-amz"_sr) || name.startsWith("x-icloud"_sr)) { msg.append(h.first); msg.append(":"); msg.append(h.second); diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 7cc2079e27..cea30ca663 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -21,7 +21,7 @@ #include "fdbclient/Schemas.h" // NOTE: also change mr-status-json-schemas.rst.inc -const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( +const KeyRef JSONSchemas::statusSchema = R"statusSchema( { "cluster":{ "storage_wiggler": { @@ -137,6 +137,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "blob_manager", "blob_worker", "encrypt_key_proxy", + "consistency_scan", "storage_cache", "router", "coordinator" @@ -427,7 +428,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "log_server_min_free_space", "log_server_min_free_space_ratio", "storage_server_durability_lag", - "storage_server_list_fetch_failed" + "storage_server_list_fetch_failed", + "blob_worker_lag", + "blob_worker_missing" ] }, "description":"The database is not being saturated by the workload." @@ -448,7 +451,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "log_server_min_free_space", "log_server_min_free_space_ratio", "storage_server_durability_lag", - "storage_server_list_fetch_failed" + "storage_server_list_fetch_failed", + "blob_worker_lag", + "blob_worker_missing" ] }, "description":"The database is not being saturated by the workload." 
@@ -557,6 +562,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "unreachable_ratekeeper_worker", "unreachable_blobManager_worker", "unreachable_encryptKeyProxy_worker", + "unreachable_consistencyScan_worker", "unreadable_configuration", "full_replication_timeout", "client_issues", @@ -596,7 +602,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( } ], )statusSchema" - R"statusSchema( + R"statusSchema( "recovery_state":{ "seconds_since_last_recovered":1, "required_resolvers":1, @@ -844,8 +850,26 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "disabled", "optional_experimental", "required_experimental" + ]}, + "encryption_at_rest_mode": { + "$enum":[ + "disabled", + "aes_256_ctr" ]} }, + "consistency_scan_info":{ + "consistency_scan_enabled":false, + "restart":false, + "max_rate":0, + "target_interval":0, + "bytes_read_prev_round":0, + "last_round_start_datetime":"2022-04-20 00:05:05.123 +0000", + "last_round_finish_datetime":"1970-01-01 00:00:00.000 +0000", + "last_round_start_timestamp":1648857905.123, + "last_round_finish_timestamp":0, + "smoothed_round_seconds":1, + "finished_rounds":1 + }, "data":{ "least_operating_space_bytes_log_server":0, "average_partition_size_bytes":0, @@ -981,9 +1005,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "up_to_date":true } } -})statusSchema"); +})statusSchema"_sr; -const KeyRef JSONSchemas::clusterConfigurationSchema = LiteralStringRef(R"configSchema( +const KeyRef JSONSchemas::clusterConfigurationSchema = R"configSchema( { "create":{ "$enum":[ @@ -1053,9 +1077,9 @@ const KeyRef JSONSchemas::clusterConfigurationSchema = LiteralStringRef(R"config "auto_logs":3, "commit_proxies":5, "grv_proxies":1 -})configSchema"); +})configSchema"_sr; -const KeyRef JSONSchemas::latencyBandConfigurationSchema = LiteralStringRef(R"configSchema( +const KeyRef JSONSchemas::latencyBandConfigurationSchema = R"configSchema( { "get_read_version":{ "bands":[ @@ -1075,30 +1099,30 @@ const KeyRef JSONSchemas::latencyBandConfigurationSchema = LiteralStringRef(R"co ], "max_commit_bytes":0 } -})configSchema"); +})configSchema"_sr; -const KeyRef JSONSchemas::dataDistributionStatsSchema = LiteralStringRef(R"""( +const KeyRef JSONSchemas::dataDistributionStatsSchema = R"""( { "shard_bytes": 1947000 } -)"""); +)"""_sr; -const KeyRef JSONSchemas::logHealthSchema = LiteralStringRef(R"""( +const KeyRef JSONSchemas::logHealthSchema = R"""( { "log_queue": 156 } -)"""); +)"""_sr; -const KeyRef JSONSchemas::storageHealthSchema = LiteralStringRef(R"""( +const KeyRef JSONSchemas::storageHealthSchema = R"""( { "cpu_usage": 3.28629447047675, "disk_usage": 0.19997897369207954, "storage_durability_lag": 5050809, "storage_queue": 2030 } -)"""); +)"""_sr; -const KeyRef JSONSchemas::aggregateHealthSchema = LiteralStringRef(R"""( +const KeyRef JSONSchemas::aggregateHealthSchema = R"""( { "batch_limited": false, "limiting_storage_durability_lag": 5050809, @@ -1108,12 +1132,12 @@ const KeyRef JSONSchemas::aggregateHealthSchema = LiteralStringRef(R"""( "worst_storage_queue": 2030, "worst_log_queue": 156 } -)"""); +)"""_sr; -const KeyRef JSONSchemas::managementApiErrorSchema = LiteralStringRef(R"""( +const KeyRef JSONSchemas::managementApiErrorSchema = R"""( { "retriable": false, "command": "exclude", "message": "The reason of the error" } -)"""); +)"""_sr; diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 9f00bdf594..6304db86ce 100644 --- a/fdbclient/ServerKnobs.cpp 
+++ b/fdbclient/ServerKnobs.cpp @@ -19,6 +19,7 @@ */ #include "fdbclient/ServerKnobs.h" +#include "flow/CompressionUtils.h" #include "flow/IRandom.h" #include "flow/flow.h" @@ -50,7 +51,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // TLogs init( TLOG_TIMEOUT, 0.4 ); //cannot buggify because of availability init( TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS, 60 ); if( randomize && BUGGIFY ) TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS = deterministicRandom()->randomInt(5,10); - init( RECOVERY_TLOG_SMART_QUORUM_DELAY, 0.25 ); if( randomize && BUGGIFY ) RECOVERY_TLOG_SMART_QUORUM_DELAY = 0.0; // smaller might be better for bug amplification init( TLOG_STORAGE_MIN_UPDATE_INTERVAL, 0.5 ); init( BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL, 30 ); init( DESIRED_TOTAL_BYTES, 150000 ); if( randomize && BUGGIFY ) DESIRED_TOTAL_BYTES = 10000; @@ -58,10 +58,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( UPDATE_DELAY, 0.001 ); init( MAXIMUM_PEEK_BYTES, 10e6 ); init( APPLY_MUTATION_BYTES, 1e6 ); - init( RECOVERY_DATA_BYTE_LIMIT, 100000 ); - init( BUGGIFY_RECOVERY_DATA_LIMIT, 1000 ); - init( LONG_TLOG_COMMIT_TIME, 0.25 ); //cannot buggify because of recovery time - init( LARGE_TLOG_COMMIT_BYTES, 4<<20 ); init( BUGGIFY_RECOVER_MEMORY_LIMIT, 1e6 ); init( BUGGIFY_WORKER_REMOVED_MAX_LAG, 30 ); init( UPDATE_STORAGE_BYTE_LIMIT, 1e6 ); @@ -94,7 +90,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MAX_CACHE_VERSIONS, 10e6 ); init( TLOG_IGNORE_POP_AUTO_ENABLE_DELAY, 300.0 ); init( TXS_POPPED_MAX_DELAY, 1.0 ); if ( randomize && BUGGIFY ) TXS_POPPED_MAX_DELAY = deterministicRandom()->random01(); - init( TLOG_MAX_CREATE_DURATION, 10.0 ); + // In some rare simulation tests, particularly with log_spill:=1 configured, the 10 second limit is exceeded, causing SevError trace events + // and simulation test failure. Increasing the knob value to 15.0 in simulation is a workaround to avoid these failures. + init( TLOG_MAX_CREATE_DURATION, 10.0 ); if (isSimulated) TLOG_MAX_CREATE_DURATION = 15.0; init( PEEK_LOGGING_AMOUNT, 5 ); init( PEEK_LOGGING_DELAY, 5.0 ); init( PEEK_RESET_INTERVAL, 300.0 ); if ( randomize && BUGGIFY ) PEEK_RESET_INTERVAL = 20.0; @@ -133,16 +131,15 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BG_REBALANCE_POLLING_INTERVAL, 10.0 ); init( BG_REBALANCE_SWITCH_CHECK_INTERVAL, 5.0 ); if (randomize && BUGGIFY) BG_REBALANCE_SWITCH_CHECK_INTERVAL = 1.0; init( DD_QUEUE_LOGGING_INTERVAL, 5.0 ); + init( DD_QUEUE_COUNTER_REFRESH_INTERVAL, 60.0 ); + // 100 / 60 < 2 trace/sec ~ 2 * 200 = 400b/sec + init( DD_QUEUE_COUNTER_MAX_LOG, 100 ); if( randomize && BUGGIFY ) DD_QUEUE_COUNTER_MAX_LOG = 1; + init( DD_QUEUE_COUNTER_SUMMARIZE, true ); init( RELOCATION_PARALLELISM_PER_SOURCE_SERVER, 2 ); if( randomize && BUGGIFY ) RELOCATION_PARALLELISM_PER_SOURCE_SERVER = 1; init( RELOCATION_PARALLELISM_PER_DEST_SERVER, 10 ); if( randomize && BUGGIFY ) RELOCATION_PARALLELISM_PER_DEST_SERVER = 1; // Note: if this is smaller than FETCH_KEYS_PARALLELISM, this will artificially reduce performance. The current default of 10 is probably too high but is set conservatively for now. 
init( DD_QUEUE_MAX_KEY_SERVERS, 100 ); if( randomize && BUGGIFY ) DD_QUEUE_MAX_KEY_SERVERS = 1; init( DD_REBALANCE_PARALLELISM, 50 ); init( DD_REBALANCE_RESET_AMOUNT, 30 ); - init( BG_DD_MAX_WAIT, 120.0 ); - init( BG_DD_MIN_WAIT, 0.1 ); - init( BG_DD_INCREASE_RATE, 1.10 ); - init( BG_DD_DECREASE_RATE, 1.02 ); - init( BG_DD_SATURATION_DELAY, 1.0 ); init( INFLIGHT_PENALTY_HEALTHY, 1.0 ); init( INFLIGHT_PENALTY_UNHEALTHY, 500.0 ); init( INFLIGHT_PENALTY_ONE_LEFT, 1000.0 ); @@ -165,8 +162,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( PRIORITY_TEAM_FAILED, 805 ); init( PRIORITY_TEAM_0_LEFT, 809 ); init( PRIORITY_SPLIT_SHARD, 950 ); if( randomize && BUGGIFY ) PRIORITY_SPLIT_SHARD = 350; + init( PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD, 960 ); if( randomize && BUGGIFY ) PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD = 360; // Set as the lowest priority // Data distribution + init( SHARD_ENCODE_LOCATION_METADATA, false ); if( randomize && BUGGIFY ) SHARD_ENCODE_LOCATION_METADATA = true; + init( ENABLE_DD_PHYSICAL_SHARD, false ); // EXPERIMENTAL; If true, SHARD_ENCODE_LOCATION_METADATA must be true; When true, optimization of data move between DCs is disabled + init( MAX_PHYSICAL_SHARD_BYTES, 500000000 ); // 500 MB; for ENABLE_DD_PHYSICAL_SHARD; smaller leads to larger number of physicalShard per storage server + init( PHYSICAL_SHARD_METRICS_DELAY, 300.0 ); // 300 seconds; for ENABLE_DD_PHYSICAL_SHARD + init( ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME, 600.0 ); if( randomize && BUGGIFY ) ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME = 0.0; // 600 seconds; for ENABLE_DD_PHYSICAL_SHARD init( READ_REBALANCE_CPU_THRESHOLD, 15.0 ); init( READ_REBALANCE_SRC_PARALLELISM, 20 ); init( READ_REBALANCE_SHARD_TOPK, READ_REBALANCE_SRC_PARALLELISM * 2 ); @@ -249,7 +252,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( SERVER_LIST_DELAY, 1.0 ); init( RECRUITMENT_IDLE_DELAY, 1.0 ); init( STORAGE_RECRUITMENT_DELAY, 10.0 ); - init( BLOB_WORKER_RECRUITMENT_DELAY, 10.0 ); init( TSS_HACK_IDENTITY_MAPPING, false ); // THIS SHOULD NEVER BE SET IN PROD. Only for performance testing init( TSS_RECRUITMENT_TIMEOUT, 3*STORAGE_RECRUITMENT_DELAY ); if (randomize && BUGGIFY ) TSS_RECRUITMENT_TIMEOUT = 1.0; // Super low timeout should cause tss recruitments to fail init( TSS_DD_CHECK_INTERVAL, 60.0 ); if (randomize && BUGGIFY ) TSS_DD_CHECK_INTERVAL = 1.0; // May kill all TSS quickly @@ -275,7 +277,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) DD_FAILURE_TIME = 10.0; init( DD_ZERO_HEALTHY_TEAM_DELAY, 1.0 ); init( REMOTE_KV_STORE, false ); - init( REMOTE_KV_STORE_INIT_DELAY, 0.1 ); + init( REBOOT_KV_STORE_DELAY, 0.1 ); init( REMOTE_KV_STORE_MAX_INIT_DURATION, 10.0 ); init( REBALANCE_MAX_RETRIES, 100 ); init( DD_OVERLAP_PENALTY, 10000 ); @@ -291,8 +293,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY, 120 ); if( randomize && BUGGIFY ) DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY = 5; init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 10 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 1000; init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 20 ); + init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 
0: 120; init( DD_TENANT_AWARENESS_ENABLED, false ); - init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2.0 ); + init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); + // TeamRemover init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true @@ -370,19 +374,26 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REPLACE_CONTENTS_BYTES, 1e5 ); // KeyValueStoreRocksDB + init( ROCKSDB_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES, true ); if( randomize && BUGGIFY ) ROCKSDB_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES = false; + init( ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE, true ); if( randomize && BUGGIFY ) ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE = false; + init( ROCKSDB_READ_RANGE_ROW_LIMIT, 65535 ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_ROW_LIMIT = deterministicRandom()->randomInt(2, 10); + init( ROCKSDB_BACKGROUND_PARALLELISM, 4 ); init( ROCKSDB_READ_PARALLELISM, 4 ); // Use a smaller memtable in simulation to avoid OOMs. int64_t memtableBytes = isSimulated ? 32 * 1024 : 512 * 1024 * 1024; init( ROCKSDB_MEMTABLE_BYTES, memtableBytes ); + init( ROCKSDB_LEVEL_STYLE_COMPACTION, true ); init( ROCKSDB_UNSAFE_AUTO_FSYNC, false ); init( ROCKSDB_PERIODIC_COMPACTION_SECONDS, 0 ); init( ROCKSDB_PREFIX_LEN, 0 ); - init( ROCKSDB_BLOCK_CACHE_SIZE, 0 ); + // If rocksdb block cache size is 0, the default 8MB is used. + int64_t blockCacheSize = isSimulated ? 0 : 1024 * 1024 * 1024 /* 1GB */; + init( ROCKSDB_BLOCK_CACHE_SIZE, blockCacheSize ); init( ROCKSDB_METRICS_DELAY, 60.0 ); - init( ROCKSDB_READ_VALUE_TIMEOUT, 5.0 ); - init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, 5.0 ); - init( ROCKSDB_READ_RANGE_TIMEOUT, 5.0 ); + init( ROCKSDB_READ_VALUE_TIMEOUT, isSimulated ? 5.0 : 200.0 ); + init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, isSimulated ? 5.0 : 200.0 ); + init( ROCKSDB_READ_RANGE_TIMEOUT, isSimulated ? 5.0 : 200.0 ); init( ROCKSDB_READ_QUEUE_WAIT, 1.0 ); init( ROCKSDB_READ_QUEUE_HARD_MAX, 1000 ); init( ROCKSDB_READ_QUEUE_SOFT_MAX, 500 ); @@ -396,9 +407,11 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // If true, enables dynamic adjustment of ROCKSDB_WRITE_RATE_LIMITER_BYTES according to the recent demand of background IO. init( ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE, true ); init( DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY, "fdb"); + init( ROCKSDB_DISABLE_AUTO_COMPACTIONS, false ); // RocksDB default init( ROCKSDB_PERFCONTEXT_ENABLE, false ); if( randomize && BUGGIFY ) ROCKSDB_PERFCONTEXT_ENABLE = deterministicRandom()->coinflip() ? false : true; init( ROCKSDB_PERFCONTEXT_SAMPLE_RATE, 0.0001 ); + init( ROCKSDB_METRICS_SAMPLE_INTERVAL, 0.0); init( ROCKSDB_MAX_SUBCOMPACTIONS, 2 ); init( ROCKSDB_SOFT_PENDING_COMPACT_BYTES_LIMIT, 64000000000 ); // 64GB, Rocksdb option, Writes will slow down. init( ROCKSDB_HARD_PENDING_COMPACT_BYTES_LIMIT, 100000000000 ); // 100GB, Rocksdb option, Writes will stall. @@ -411,6 +424,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ROCKSDB_COMPACTION_READAHEAD_SIZE, 32768 ); // 32 KB, performs bigger reads when doing compaction. init( ROCKSDB_BLOCK_SIZE, 32768 ); // 32 KB, size of the block in rocksdb cache. 
init( ENABLE_SHARDED_ROCKSDB, false ); + init( ROCKSDB_WRITE_BUFFER_SIZE, 1 << 30 ); // 1G + init( ROCKSDB_CF_WRITE_BUFFER_SIZE, 64 << 20 ); // 64M, RocksDB default. + init( ROCKSDB_MAX_TOTAL_WAL_SIZE, 0 ); // RocksDB default. + init( ROCKSDB_MAX_BACKGROUND_JOBS, 2 ); // RocksDB default. + init( ROCKSDB_DELETE_OBSOLETE_FILE_PERIOD, 21600 ); // 6h, RocksDB default. + init( ROCKSDB_PHYSICAL_SHARD_CLEAN_UP_DELAY, isSimulated ? 10.0 : 300.0 ); // Delays shard clean up, must be larger than ROCKSDB_READ_VALUE_TIMEOUT to prevent reading deleted shard. // Leader election bool longLeaderElection = randomize && BUGGIFY; @@ -474,7 +493,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REPORT_TRANSACTION_COST_ESTIMATION_DELAY, 0.1 ); init( PROXY_REJECT_BATCH_QUEUED_TOO_LONG, true ); - bool buggfyUseResolverPrivateMutations = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR_TLOG_UNICAST; + bool buggfyUseResolverPrivateMutations = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR_TLOG_UNICAST; init( PROXY_USE_RESOLVER_PRIVATE_MUTATIONS, false ); if( buggfyUseResolverPrivateMutations ) PROXY_USE_RESOLVER_PRIVATE_MUTATIONS = deterministicRandom()->coinflip(); init( RESET_MASTER_BATCHES, 200 ); @@ -482,6 +501,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( RESET_MASTER_DELAY, 300.0 ); init( RESET_RESOLVER_DELAY, 300.0 ); + init( GLOBAL_CONFIG_MIGRATE_TIMEOUT, 5.0 ); + init( GLOBAL_CONFIG_REFRESH_INTERVAL, 1.0 ); if ( randomize && BUGGIFY ) GLOBAL_CONFIG_REFRESH_INTERVAL = 0.1; + init( GLOBAL_CONFIG_REFRESH_TIMEOUT, 10.0 ); if ( randomize && BUGGIFY ) GLOBAL_CONFIG_REFRESH_TIMEOUT = 1.0; + // Master Server // masterCommitter() in the master server will allow lower priority tasks (e.g. DataDistibution) // by delay()ing for this amount of time between accepted batches of TransactionRequests. 
@@ -530,6 +553,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ATTEMPT_RECRUITMENT_DELAY, 0.035 ); init( WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 1.0 ); init( WAIT_FOR_RATEKEEPER_JOIN_DELAY, 1.0 ); + init( WAIT_FOR_CONSISTENCYSCAN_JOIN_DELAY, 1.0 ); init( WAIT_FOR_BLOB_MANAGER_JOIN_DELAY, 1.0 ); init( WAIT_FOR_ENCRYPT_KEY_PROXY_JOIN_DELAY, 1.0 ); init( WORKER_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) WORKER_FAILURE_TIME = 10.0; @@ -540,6 +564,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( CHECK_REMOTE_HEALTH_INTERVAL, 60 ); init( FORCE_RECOVERY_CHECK_DELAY, 5.0 ); init( RATEKEEPER_FAILURE_TIME, 1.0 ); + init( CONSISTENCYSCAN_FAILURE_TIME, 1.0 ); init( BLOB_MANAGER_FAILURE_TIME, 1.0 ); init( REPLACE_INTERFACE_DELAY, 60.0 ); init( REPLACE_INTERFACE_CHECK_DELAY, 5.0 ); @@ -605,9 +630,13 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( SLOW_SMOOTHING_AMOUNT, 10.0 ); if( slowRatekeeper ) SLOW_SMOOTHING_AMOUNT = 50.0; init( METRIC_UPDATE_RATE, .1 ); if( slowRatekeeper ) METRIC_UPDATE_RATE = 0.5; init( DETAILED_METRIC_UPDATE_RATE, 5.0 ); - init (RATEKEEPER_DEFAULT_LIMIT, 1e6 ); if( randomize && BUGGIFY ) RATEKEEPER_DEFAULT_LIMIT = 0; + init( RATEKEEPER_DEFAULT_LIMIT, 1e6 ); if( randomize && BUGGIFY ) RATEKEEPER_DEFAULT_LIMIT = 0; init( RATEKEEPER_LIMIT_REASON_SAMPLE_RATE, 0.1 ); init( RATEKEEPER_PRINT_LIMIT_REASON, false ); if( randomize && BUGGIFY ) RATEKEEPER_PRINT_LIMIT_REASON = true; + init( RATEKEEPER_MIN_RATE, 0.0 ); + init( RATEKEEPER_MAX_RATE, 1e9 ); + init( RATEKEEPER_BATCH_MIN_RATE, 0.0 ); + init( RATEKEEPER_BATCH_MAX_RATE, 1e9 ); bool smallStorageTarget = randomize && BUGGIFY; init( TARGET_BYTES_PER_STORAGE_SERVER, 1000e6 ); if( smallStorageTarget ) TARGET_BYTES_PER_STORAGE_SERVER = 3000e3; @@ -617,9 +646,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( SPRING_BYTES_STORAGE_SERVER_BATCH, 100e6 ); if( smallStorageTarget ) SPRING_BYTES_STORAGE_SERVER_BATCH = 150e3; init( STORAGE_HARD_LIMIT_BYTES, 1500e6 ); if( smallStorageTarget ) STORAGE_HARD_LIMIT_BYTES = 4500e3; init( STORAGE_HARD_LIMIT_BYTES_OVERAGE, 5000e3 ); if( smallStorageTarget ) STORAGE_HARD_LIMIT_BYTES_OVERAGE = 100e3; // byte+version overage ensures storage server makes enough progress on freeing up storage queue memory at hard limit by ensuring it advances desiredOldestVersion enough per commit cycle. 
+ init( STORAGE_HARD_LIMIT_BYTES_SPEED_UP_SIM, STORAGE_HARD_LIMIT_BYTES ); if( smallStorageTarget ) STORAGE_HARD_LIMIT_BYTES_SPEED_UP_SIM *= 10; + init( STORAGE_HARD_LIMIT_BYTES_OVERAGE_SPEED_UP_SIM, STORAGE_HARD_LIMIT_BYTES_OVERAGE ); if( smallStorageTarget ) STORAGE_HARD_LIMIT_BYTES_OVERAGE_SPEED_UP_SIM *= 10; init( STORAGE_HARD_LIMIT_VERSION_OVERAGE, VERSIONS_PER_SECOND / 4.0 ); init( STORAGE_DURABILITY_LAG_HARD_MAX, 2000e6 ); if( smallStorageTarget ) STORAGE_DURABILITY_LAG_HARD_MAX = 100e6; init( STORAGE_DURABILITY_LAG_SOFT_MAX, 250e6 ); if( smallStorageTarget ) STORAGE_DURABILITY_LAG_SOFT_MAX = 10e6; + init( STORAGE_INCLUDE_FEED_STORAGE_QUEUE, true ); if ( randomize && BUGGIFY ) STORAGE_INCLUDE_FEED_STORAGE_QUEUE = false; //FIXME: Low priority reads are disabled by assigning very high knob values, reduce knobs for 7.0 init( LOW_PRIORITY_STORAGE_QUEUE_BYTES, 775e8 ); if( smallStorageTarget ) LOW_PRIORITY_STORAGE_QUEUE_BYTES = 1750e3; @@ -657,6 +689,20 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DURABILITY_LAG_REDUCTION_RATE, 0.9999 ); init( DURABILITY_LAG_INCREASE_RATE, 1.001 ); init( STORAGE_SERVER_LIST_FETCH_TIMEOUT, 20.0 ); + init( BW_THROTTLING_ENABLED, true ); + + bool buggifySmallBWLag = randomize && BUGGIFY; + init( TARGET_BW_LAG, 240.0 ); if(buggifySmallBWLag) TARGET_BW_LAG = 10.0; + init( TARGET_BW_LAG_BATCH, 200.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_BATCH = 4.0; + init( TARGET_BW_LAG_UPDATE, 9.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_UPDATE = 1.0; + init( MIN_BW_HISTORY, 10 ); + init( BW_ESTIMATION_INTERVAL, 10.0 ); if(buggifySmallBWLag) BW_ESTIMATION_INTERVAL = 2.0; + init( BW_LAG_INCREASE_AMOUNT, 1.1 ); + init( BW_LAG_DECREASE_AMOUNT, 0.9 ); + init( BW_FETCH_WORKERS_INTERVAL, 5.0 ); + init( BW_RW_LOGGING_INTERVAL, 5.0 ); + init( BW_MAX_BLOCKED_INTERVAL, 10.0 ); if(buggifySmallBWLag) BW_MAX_BLOCKED_INTERVAL = 2.0; + init( BW_RK_SIM_QUIESCE_DELAY, 150.0 ); init( MAX_AUTO_THROTTLED_TRANSACTION_TAGS, 5 ); if(randomize && BUGGIFY) MAX_AUTO_THROTTLED_TRANSACTION_TAGS = 1; init( MAX_MANUAL_THROTTLED_TRANSACTION_TAGS, 40 ); if(randomize && BUGGIFY) MAX_MANUAL_THROTTLED_TRANSACTION_TAGS = 1; @@ -671,9 +717,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( AUTO_TAG_THROTTLING_ENABLED, true ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLING_ENABLED = false; init( SS_THROTTLE_TAGS_TRACKED, 1 ); if(randomize && BUGGIFY) SS_THROTTLE_TAGS_TRACKED = deterministicRandom()->randomInt(1, 10); init( GLOBAL_TAG_THROTTLING, false ); + init( ENFORCE_TAG_THROTTLING_ON_PROXIES, false ); init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 ); init( GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 ); - init( GLOBAL_TAG_THROTTLING_TRACE_INTERVAL, 5.0 ); + init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO, 5.0 ); //Storage Metrics init( STORAGE_METRICS_AVERAGE_INTERVAL, 120.0 ); @@ -694,12 +741,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( STORAGE_LIMIT_BYTES, 500000 ); init( BUGGIFY_LIMIT_BYTES, 1000 ); init( FETCH_USING_STREAMING, false ); if( randomize && isSimulated && BUGGIFY ) FETCH_USING_STREAMING = true; //Determines if fetch keys uses streaming reads + init( FETCH_USING_BLOB, false ); init( FETCH_BLOCK_BYTES, 2e6 ); init( FETCH_KEYS_PARALLELISM_BYTES, 4e6 ); if( randomize && BUGGIFY ) FETCH_KEYS_PARALLELISM_BYTES = 3e6; init( FETCH_KEYS_PARALLELISM, 2 ); + init( FETCH_KEYS_PARALLELISM_FULL, 6 ); init( FETCH_KEYS_LOWER_PRIORITY, 0 ); - init( FETCH_CHANGEFEED_PARALLELISM, 2 ); 
init( SERVE_FETCH_CHECKPOINT_PARALLELISM, 4 ); + init( CHANGE_FEED_DISK_READS_PARALLELISM, 1000 ); if( randomize && BUGGIFY ) CHANGE_FEED_DISK_READS_PARALLELISM = 20; init( BUGGIFY_BLOCK_BYTES, 10000 ); init( STORAGE_RECOVERY_VERSION_LAG_LIMIT, 2 * MAX_READ_TRANSACTION_LIFE_VERSIONS ); init( STORAGE_COMMIT_BYTES, 10000000 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_BYTES = 2000000; @@ -707,7 +756,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( STORAGE_DURABILITY_LAG_REJECT_THRESHOLD, 0.25 ); init( STORAGE_DURABILITY_LAG_MIN_RATE, 0.1 ); init( STORAGE_COMMIT_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_INTERVAL = 2.0; - init( UPDATE_SHARD_VERSION_INTERVAL, 0.25 ); if( randomize && BUGGIFY ) UPDATE_SHARD_VERSION_INTERVAL = 1.0; init( BYTE_SAMPLING_FACTOR, 250 ); //cannot buggify because of differences in restarting tests init( BYTE_SAMPLING_OVERHEAD, 100 ); init( MAX_STORAGE_SERVER_WATCH_BYTES, 100e6 ); if( randomize && BUGGIFY ) MAX_STORAGE_SERVER_WATCH_BYTES = 10e3; @@ -716,7 +764,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BYTE_SAMPLE_LOAD_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_LOAD_PARALLELISM = 1; init( BYTE_SAMPLE_LOAD_DELAY, 0.0 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_LOAD_DELAY = 0.1; init( BYTE_SAMPLE_START_DELAY, 1.0 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_START_DELAY = 0.0; - init( UPDATE_STORAGE_PROCESS_STATS_INTERVAL, 5.0 ); init( BEHIND_CHECK_DELAY, 2.0 ); init( BEHIND_CHECK_COUNT, 2 ); init( BEHIND_CHECK_VERSIONS, 5 * VERSIONS_PER_SECOND ); @@ -724,7 +771,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MIN_TAG_READ_PAGES_RATE, 1.0e4 ); if( randomize && BUGGIFY ) MIN_TAG_READ_PAGES_RATE = 0; init( MIN_TAG_WRITE_PAGES_RATE, 3200 ); if( randomize && BUGGIFY ) MIN_TAG_WRITE_PAGES_RATE = 0; init( TAG_MEASUREMENT_INTERVAL, 30.0 ); if( randomize && BUGGIFY ) TAG_MEASUREMENT_INTERVAL = 1.0; - init( READ_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096; init( PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS, true ); if( randomize && BUGGIFY ) PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS = false; init( REPORT_DD_METRICS, true ); init( DD_METRICS_REPORT_INTERVAL, 30.0 ); @@ -740,6 +786,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MAX_PARALLEL_QUICK_GET_VALUE, 50 ); if ( randomize && BUGGIFY ) MAX_PARALLEL_QUICK_GET_VALUE = deterministicRandom()->randomInt(1, 100); init( QUICK_GET_KEY_VALUES_LIMIT, 2000 ); init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 ); + init( STORAGE_FEED_QUERY_HARD_LIMIT, 100000 ); //Wait Failure init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2; @@ -760,13 +807,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ENABLE_WORKER_HEALTH_MONITOR, false ); init( WORKER_HEALTH_MONITOR_INTERVAL, 60.0 ); init( PEER_LATENCY_CHECK_MIN_POPULATION, 30 ); - init( PEER_LATENCY_DEGRADATION_PERCENTILE, 0.90 ); + init( PEER_LATENCY_DEGRADATION_PERCENTILE, 0.50 ); init( PEER_LATENCY_DEGRADATION_THRESHOLD, 0.05 ); - init( PEER_LATENCY_DEGRADATION_PERCENTILE_SATELLITE, 0.90 ); + init( PEER_LATENCY_DEGRADATION_PERCENTILE_SATELLITE, 0.50 ); init( PEER_LATENCY_DEGRADATION_THRESHOLD_SATELLITE, 0.1 ); init( PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD, 0.1 ); - init( PEER_DEGRADATION_CONNECTION_FAILURE_COUNT, 1 ); + init( 
PEER_DEGRADATION_CONNECTION_FAILURE_COUNT, 5 ); init( WORKER_HEALTH_REPORT_RECENT_DESTROYED_PEER, true ); + init( STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT, false ); if ( randomize && BUGGIFY ) STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT = true; // Test harness init( WORKER_POLL_DELAY, 1.0 ); @@ -779,7 +827,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // Dynamic Knobs (implementation) init( COMPACTION_INTERVAL, isSimulated ? 5.0 : 300.0 ); - init( UPDATE_NODE_TIMEOUT, 3.0 ); + init( BROADCASTER_SELF_UPDATE_DELAY, 1.0 ); init( GET_COMMITTED_VERSION_TIMEOUT, 3.0 ); init( GET_SNAPSHOT_AND_CHANGES_TIMEOUT, 3.0 ); init( FETCH_CHANGES_TIMEOUT, 3.0 ); @@ -795,14 +843,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DISABLE_DUPLICATE_LOG_WARNING, false ); init( HISTOGRAM_REPORT_INTERVAL, 300.0 ); - // IPager - init( PAGER_RESERVED_PAGES, 1 ); - - // IndirectShadowPager - init( FREE_PAGE_VACUUM_THRESHOLD, 1 ); - init( VACUUM_QUEUE_SIZE, 100000 ); - init( VACUUM_BYTES_PER_SECOND, 1e6 ); - // Timekeeper init( TIME_KEEPER_DELAY, 10 ); init( TIME_KEEPER_MAX_ENTRIES, 3600 * 24 * 30 * 6 ); if( randomize && BUGGIFY ) { TIME_KEEPER_MAX_ENTRIES = 2; } @@ -821,11 +861,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( FASTRESTORE_ROLE_LOGGING_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_ROLE_LOGGING_DELAY = deterministicRandom()->random01() * 60 + 1; } init( FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL = deterministicRandom()->random01() * 60 + 1; } init( FASTRESTORE_ATOMICOP_WEIGHT, 1 ); if( randomize && BUGGIFY ) { FASTRESTORE_ATOMICOP_WEIGHT = deterministicRandom()->random01() * 200 + 1; } - init( FASTRESTORE_APPLYING_PARALLELISM, 10000 ); if( randomize && BUGGIFY ) { FASTRESTORE_APPLYING_PARALLELISM = deterministicRandom()->random01() * 10 + 1; } init( FASTRESTORE_MONITOR_LEADER_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_MONITOR_LEADER_DELAY = deterministicRandom()->random01() * 100; } init( FASTRESTORE_STRAGGLER_THRESHOLD_SECONDS, 60 ); if( randomize && BUGGIFY ) { FASTRESTORE_STRAGGLER_THRESHOLD_SECONDS = deterministicRandom()->random01() * 240 + 10; } init( FASTRESTORE_TRACK_REQUEST_LATENCY, false ); if( randomize && BUGGIFY ) { FASTRESTORE_TRACK_REQUEST_LATENCY = false; } - init( FASTRESTORE_TRACK_LOADER_SEND_REQUESTS, false ); if( randomize && BUGGIFY ) { FASTRESTORE_TRACK_LOADER_SEND_REQUESTS = true; } init( FASTRESTORE_MEMORY_THRESHOLD_MB_SOFT, 6144 ); if( randomize && BUGGIFY ) { FASTRESTORE_MEMORY_THRESHOLD_MB_SOFT = 1; } init( FASTRESTORE_WAIT_FOR_MEMORY_LATENCY, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_WAIT_FOR_MEMORY_LATENCY = 60; } init( FASTRESTORE_HEARTBEAT_DELAY, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_HEARTBEAT_DELAY = deterministicRandom()->random01() * 120 + 2; } @@ -882,52 +920,60 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init ( CLUSTER_RECOVERY_EVENT_NAME_PREFIX, "Master" ); // Encryption - init( ENABLE_ENCRYPTION, false ); if ( randomize && BUGGIFY ) { ENABLE_ENCRYPTION = deterministicRandom()->coinflip(); } + init( ENABLE_ENCRYPTION, false ); if ( randomize && BUGGIFY ) ENABLE_ENCRYPTION = !ENABLE_ENCRYPTION; init( ENCRYPTION_MODE, "AES-256-CTR" ); init( SIM_KMS_MAX_KEYS, 4096 ); init( ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH, 100000 ); - init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY ) { 
ENABLE_TLOG_ENCRYPTION = (ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS && deterministicRandom()->coinflip()); } - init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_ENCRYPTION = (ENABLE_ENCRYPTION && deterministicRandom()->coinflip()); } + init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ) ENABLE_TLOG_ENCRYPTION = true; + init( ENABLE_STORAGE_SERVER_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_STORAGE_SERVER_ENCRYPTION = !ENABLE_STORAGE_SERVER_ENCRYPTION; + init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_BLOB_GRANULE_ENCRYPTION = !ENABLE_BLOB_GRANULE_ENCRYPTION; // encrypt key proxy init( ENABLE_BLOB_GRANULE_COMPRESSION, false ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_COMPRESSION = deterministicRandom()->coinflip(); } - init( BLOB_GRANULE_COMPRESSION_FILTER, "GZIP" ); if ( randomize && BUGGIFY ) { BLOB_GRANULE_COMPRESSION_FILTER = "NONE"; } + init( BLOB_GRANULE_COMPRESSION_FILTER, "NONE" ); if ( randomize && BUGGIFY ) { BLOB_GRANULE_COMPRESSION_FILTER = CompressionUtils::toString(CompressionUtils::getRandomFilter()); } - - // KMS connector type + // KMS connector type init( KMS_CONNECTOR_TYPE, "RESTKmsConnector" ); // Blob granlues - init( BG_URL, isSimulated ? "file://fdbblob/" : "" ); // TODO: store in system key space or something, eventually - // BlobGranuleVerify* simulation tests use "blobRangeKeys", BlobGranuleCorrectness* use "tenant", default in real clusters is "tenant" - init( BG_RANGE_SOURCE, "tenant" ); + init( BG_URL, isSimulated ? "file://fdbblob/" : "" ); // TODO: store in system key space or something, eventually + bool buggifyMediumGranules = simulationMediumShards || (randomize && BUGGIFY); // BlobGranuleVerify* simulation tests use "knobs", BlobGranuleCorrectness* use "tenant", default in real clusters is "knobs" init( BG_METADATA_SOURCE, "knobs" ); - init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 100000; else if (simulationMediumShards || (randomize && BUGGIFY) ) BG_SNAPSHOT_FILE_TARGET_BYTES = 1000000; - init( BG_SNAPSHOT_FILE_TARGET_CHUNKS, 100 ); if ( randomize && BUGGIFY ) BG_SNAPSHOT_FILE_TARGET_CHUNKS = 1 << deterministicRandom()->randomInt(0, 8); + init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 100000; else if (buggifyMediumGranules) BG_SNAPSHOT_FILE_TARGET_BYTES = 1000000; + init( BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES, 64*1024 ); if ( randomize && BUGGIFY ) BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES = BG_SNAPSHOT_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 8)); init( BG_DELTA_BYTES_BEFORE_COMPACT, BG_SNAPSHOT_FILE_TARGET_BYTES/2 ); init( BG_DELTA_FILE_TARGET_BYTES, BG_DELTA_BYTES_BEFORE_COMPACT/10 ); + init( BG_DELTA_FILE_TARGET_CHUNK_BYTES, 32*1024 ); if ( randomize && BUGGIFY ) BG_DELTA_FILE_TARGET_CHUNK_BYTES = BG_DELTA_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 7)); init( BG_MAX_SPLIT_FANOUT, 10 ); if( randomize && BUGGIFY ) BG_MAX_SPLIT_FANOUT = deterministicRandom()->randomInt(5, 15); init( BG_MAX_MERGE_FANIN, 10 ); if( randomize && BUGGIFY ) BG_MAX_MERGE_FANIN = deterministicRandom()->randomInt(2, 15); init( BG_HOT_SNAPSHOT_VERSIONS, 5000000 ); init( BG_CONSISTENCY_CHECK_ENABLED, true ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_ENABLED = false; init( 
BG_CONSISTENCY_CHECK_TARGET_SPEED_KB, 1000 ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_TARGET_SPEED_KB *= (deterministicRandom()->randomInt(2, 50) / 10); + init( BG_KEY_TUPLE_TRUNCATE_OFFSET, 0 ); init( BG_ENABLE_MERGING, true ); if (randomize && BUGGIFY) BG_ENABLE_MERGING = false; init( BG_MERGE_CANDIDATE_THRESHOLD_SECONDS, isSimulated ? 20.0 : 30 * 60 ); if (randomize && BUGGIFY) BG_MERGE_CANDIDATE_THRESHOLD_SECONDS = 5.0; - + init( BG_MERGE_CANDIDATE_DELAY_SECONDS, BG_MERGE_CANDIDATE_THRESHOLD_SECONDS / 10.0 ); init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1; + init( BLOB_WORKER_RESNAPSHOT_PARALLELISM, 40 ); if( randomize && BUGGIFY ) BLOB_WORKER_RESNAPSHOT_PARALLELISM = deterministicRandom()->randomInt(1, 10); + init( BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM, 2000 ); if( randomize && BUGGIFY ) BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM = deterministicRandom()->randomInt(10, 100); init( BLOB_WORKER_TIMEOUT, 10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0; init( BLOB_WORKER_REQUEST_TIMEOUT, 5.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_REQUEST_TIMEOUT = 1.0; init( BLOB_WORKERLIST_FETCH_INTERVAL, 1.0 ); init( BLOB_WORKER_BATCH_GRV_INTERVAL, 0.1 ); - + init( BLOB_WORKER_DO_REJECT_WHEN_FULL, true ); if ( randomize && BUGGIFY ) BLOB_WORKER_DO_REJECT_WHEN_FULL = false; + init( BLOB_WORKER_REJECT_WHEN_FULL_THRESHOLD, 0.9 ); + init( BLOB_WORKER_FORCE_FLUSH_CLEANUP_DELAY, 30.0 ); if ( randomize && BUGGIFY ) BLOB_WORKER_FORCE_FLUSH_CLEANUP_DELAY = deterministicRandom()->randomInt(0, 10) - 1; init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN, 0.1 ); init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX, 5.0 ); init( BLOB_MANAGER_STATUS_EXP_BACKOFF_EXPONENT, 1.5 ); + init( BLOB_MANAGER_CONCURRENT_MERGE_CHECKS, 64 ); if( randomize && BUGGIFY ) BLOB_MANAGER_CONCURRENT_MERGE_CHECKS = 1 << deterministicRandom()->randomInt(0, 7); + init( BLOB_MANIFEST_BACKUP, false ); + init( BLOB_FULL_RESTORE_MODE, false ); init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 ); init( BGCC_MIN_INTERVAL, isSimulated ? 
1.0 : 10.0 ); diff --git a/fdbclient/SimpleConfigTransaction.actor.cpp b/fdbclient/SimpleConfigTransaction.actor.cpp index cea49019bc..dba5d327b7 100644 --- a/fdbclient/SimpleConfigTransaction.actor.cpp +++ b/fdbclient/SimpleConfigTransaction.actor.cpp @@ -43,11 +43,13 @@ class SimpleConfigTransactionImpl { state ConfigTransactionGetGenerationReply reply; if (self->cti.hostname.present()) { wait(store(reply, - retryGetReplyFromHostname(ConfigTransactionGetGenerationRequest{}, + retryGetReplyFromHostname(ConfigTransactionGetGenerationRequest{ 0, Optional() }, self->cti.hostname.get(), WLTOKEN_CONFIGTXN_GETGENERATION))); } else { - wait(store(reply, retryBrokenPromise(self->cti.getGeneration, ConfigTransactionGetGenerationRequest{}))); + wait(store(reply, + retryBrokenPromise(self->cti.getGeneration, + ConfigTransactionGetGenerationRequest{ 0, Optional() }))); } if (self->dID.present()) { TraceEvent("SimpleConfigTransactionGotReadVersion", self->dID.get()) @@ -70,11 +72,12 @@ class SimpleConfigTransactionImpl { state ConfigTransactionGetReply reply; if (self->cti.hostname.present()) { wait(store(reply, - retryGetReplyFromHostname(ConfigTransactionGetRequest{ generation, configKey }, + retryGetReplyFromHostname(ConfigTransactionGetRequest{ 0, generation, configKey }, self->cti.hostname.get(), WLTOKEN_CONFIGTXN_GET))); } else { - wait(store(reply, retryBrokenPromise(self->cti.get, ConfigTransactionGetRequest{ generation, configKey }))); + wait(store(reply, + retryBrokenPromise(self->cti.get, ConfigTransactionGetRequest{ 0, generation, configKey }))); } if (self->dID.present()) { TraceEvent("SimpleConfigTransactionGotValue", self->dID.get()) @@ -95,13 +98,13 @@ class SimpleConfigTransactionImpl { state ConfigTransactionGetConfigClassesReply reply; if (self->cti.hostname.present()) { wait(store(reply, - retryGetReplyFromHostname(ConfigTransactionGetConfigClassesRequest{ generation }, + retryGetReplyFromHostname(ConfigTransactionGetConfigClassesRequest{ 0, generation }, self->cti.hostname.get(), WLTOKEN_CONFIGTXN_GETCLASSES))); } else { wait(store( reply, - retryBrokenPromise(self->cti.getClasses, ConfigTransactionGetConfigClassesRequest{ generation }))); + retryBrokenPromise(self->cti.getClasses, ConfigTransactionGetConfigClassesRequest{ 0, generation }))); } RangeResult result; for (const auto& configClass : reply.configClasses) { @@ -118,13 +121,13 @@ class SimpleConfigTransactionImpl { state ConfigTransactionGetKnobsReply reply; if (self->cti.hostname.present()) { wait(store(reply, - retryGetReplyFromHostname(ConfigTransactionGetKnobsRequest{ generation, configClass }, + retryGetReplyFromHostname(ConfigTransactionGetKnobsRequest{ 0, generation, configClass }, self->cti.hostname.get(), WLTOKEN_CONFIGTXN_GETKNOBS))); } else { - wait(store( - reply, - retryBrokenPromise(self->cti.getKnobs, ConfigTransactionGetKnobsRequest{ generation, configClass }))); + wait(store(reply, + retryBrokenPromise(self->cti.getKnobs, + ConfigTransactionGetKnobsRequest{ 0, generation, configClass }))); } RangeResult result; for (const auto& knobName : reply.knobNames) { @@ -137,6 +140,7 @@ class SimpleConfigTransactionImpl { if (!self->getGenerationFuture.isValid()) { self->getGenerationFuture = getGeneration(self); } + self->toCommit.coordinatorsHash = 0; wait(store(self->toCommit.generation, self->getGenerationFuture)); self->toCommit.annotation.timestamp = now(); if (self->cti.hostname.present()) { diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index 85f237cb6d..698d15b38e 
100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -21,10 +21,12 @@ #include "boost/lexical_cast.hpp" #include "boost/algorithm/string.hpp" +#include #include #include #include +#include #include "fdbclient/ActorLineageProfiler.h" #include "fdbclient/ClusterConnectionMemoryRecord.h" @@ -56,65 +58,46 @@ static bool isAlphaNumeric(const std::string& key) { } // namespace std::unordered_map SpecialKeySpace::moduleToBoundary = { - { SpecialKeySpace::MODULE::TRANSACTION, - KeyRangeRef(LiteralStringRef("\xff\xff/transaction/"), LiteralStringRef("\xff\xff/transaction0")) }, + { SpecialKeySpace::MODULE::TRANSACTION, KeyRangeRef("\xff\xff/transaction/"_sr, "\xff\xff/transaction0"_sr) }, { SpecialKeySpace::MODULE::WORKERINTERFACE, - KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0")) }, - { SpecialKeySpace::MODULE::STATUSJSON, singleKeyRange(LiteralStringRef("\xff\xff/status/json")) }, - { SpecialKeySpace::MODULE::CONNECTIONSTRING, singleKeyRange(LiteralStringRef("\xff\xff/connection_string")) }, - { SpecialKeySpace::MODULE::CLUSTERFILEPATH, singleKeyRange(LiteralStringRef("\xff\xff/cluster_file_path")) }, - { SpecialKeySpace::MODULE::METRICS, - KeyRangeRef(LiteralStringRef("\xff\xff/metrics/"), LiteralStringRef("\xff\xff/metrics0")) }, - { SpecialKeySpace::MODULE::MANAGEMENT, - KeyRangeRef(LiteralStringRef("\xff\xff/management/"), LiteralStringRef("\xff\xff/management0")) }, - { SpecialKeySpace::MODULE::ERRORMSG, singleKeyRange(LiteralStringRef("\xff\xff/error_message")) }, - { SpecialKeySpace::MODULE::CONFIGURATION, - KeyRangeRef(LiteralStringRef("\xff\xff/configuration/"), LiteralStringRef("\xff\xff/configuration0")) }, - { SpecialKeySpace::MODULE::GLOBALCONFIG, - KeyRangeRef(LiteralStringRef("\xff\xff/global_config/"), LiteralStringRef("\xff\xff/global_config0")) }, - { SpecialKeySpace::MODULE::TRACING, - KeyRangeRef(LiteralStringRef("\xff\xff/tracing/"), LiteralStringRef("\xff\xff/tracing0")) }, - { SpecialKeySpace::MODULE::ACTORLINEAGE, - KeyRangeRef(LiteralStringRef("\xff\xff/actor_lineage/"), LiteralStringRef("\xff\xff/actor_lineage0")) }, + KeyRangeRef("\xff\xff/worker_interfaces/"_sr, "\xff\xff/worker_interfaces0"_sr) }, + { SpecialKeySpace::MODULE::STATUSJSON, singleKeyRange("\xff\xff/status/json"_sr) }, + { SpecialKeySpace::MODULE::CONNECTIONSTRING, singleKeyRange("\xff\xff/connection_string"_sr) }, + { SpecialKeySpace::MODULE::CLUSTERFILEPATH, singleKeyRange("\xff\xff/cluster_file_path"_sr) }, + { SpecialKeySpace::MODULE::METRICS, KeyRangeRef("\xff\xff/metrics/"_sr, "\xff\xff/metrics0"_sr) }, + { SpecialKeySpace::MODULE::MANAGEMENT, KeyRangeRef("\xff\xff/management/"_sr, "\xff\xff/management0"_sr) }, + { SpecialKeySpace::MODULE::ERRORMSG, singleKeyRange("\xff\xff/error_message"_sr) }, + { SpecialKeySpace::MODULE::CONFIGURATION, KeyRangeRef("\xff\xff/configuration/"_sr, "\xff\xff/configuration0"_sr) }, + { SpecialKeySpace::MODULE::GLOBALCONFIG, KeyRangeRef("\xff\xff/global_config/"_sr, "\xff\xff/global_config0"_sr) }, + { SpecialKeySpace::MODULE::TRACING, KeyRangeRef("\xff\xff/tracing/"_sr, "\xff\xff/tracing0"_sr) }, + { SpecialKeySpace::MODULE::ACTORLINEAGE, KeyRangeRef("\xff\xff/actor_lineage/"_sr, "\xff\xff/actor_lineage0"_sr) }, { SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF, - KeyRangeRef(LiteralStringRef("\xff\xff/actor_profiler_conf/"), - LiteralStringRef("\xff\xff/actor_profiler_conf0")) }, - { SpecialKeySpace::MODULE::CLUSTERID, 
singleKeyRange(LiteralStringRef("\xff\xff/cluster_id")) }, + KeyRangeRef("\xff\xff/actor_profiler_conf/"_sr, "\xff\xff/actor_profiler_conf0"_sr) }, + { SpecialKeySpace::MODULE::CLUSTERID, singleKeyRange("\xff\xff/cluster_id"_sr) }, }; std::unordered_map SpecialKeySpace::managementApiCommandToRange = { - { "exclude", - KeyRangeRef(LiteralStringRef("excluded/"), LiteralStringRef("excluded0")) - .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, - { "failed", - KeyRangeRef(LiteralStringRef("failed/"), LiteralStringRef("failed0")) - .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + { "exclude", KeyRangeRef("excluded/"_sr, "excluded0"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + { "failed", KeyRangeRef("failed/"_sr, "failed0"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "excludedlocality", - KeyRangeRef(LiteralStringRef("excluded_locality/"), LiteralStringRef("excluded_locality0")) + KeyRangeRef("excluded_locality/"_sr, "excluded_locality0"_sr) .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "failedlocality", - KeyRangeRef(LiteralStringRef("failed_locality/"), LiteralStringRef("failed_locality0")) + KeyRangeRef("failed_locality/"_sr, "failed_locality0"_sr) .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, - { "lock", singleKeyRange(LiteralStringRef("db_locked")).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + { "lock", singleKeyRange("db_locked"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "consistencycheck", - singleKeyRange(LiteralStringRef("consistency_check_suspended")) - .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + singleKeyRange("consistency_check_suspended"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "coordinators", - KeyRangeRef(LiteralStringRef("coordinators/"), LiteralStringRef("coordinators0")) - .withPrefix(moduleToBoundary[MODULE::CONFIGURATION].begin) }, + KeyRangeRef("coordinators/"_sr, "coordinators0"_sr).withPrefix(moduleToBoundary[MODULE::CONFIGURATION].begin) }, { "advanceversion", - singleKeyRange(LiteralStringRef("min_required_commit_version")) - .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, - { "versionepoch", - singleKeyRange(LiteralStringRef("version_epoch")).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, - { "profile", - KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0")) - .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + singleKeyRange("min_required_commit_version"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + { "versionepoch", singleKeyRange("version_epoch"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + { "profile", KeyRangeRef("profiling/"_sr, "profiling0"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "maintenance", - KeyRangeRef(LiteralStringRef("maintenance/"), LiteralStringRef("maintenance0")) - .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + KeyRangeRef("maintenance/"_sr, "maintenance0"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "datadistribution", - KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0")) + KeyRangeRef("data_distribution/"_sr, "data_distribution0"_sr) .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "tenant", KeyRangeRef("tenant/"_sr, "tenant0"_sr).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { "tenantmap", @@ -122,18 +105,15 @@ std::unordered_map SpecialKeySpace::managementApiCommandT }; 
std::unordered_map SpecialKeySpace::actorLineageApiCommandToRange = { - { "state", - KeyRangeRef(LiteralStringRef("state/"), LiteralStringRef("state0")) - .withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) }, - { "time", - KeyRangeRef(LiteralStringRef("time/"), LiteralStringRef("time0")) - .withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) } + { "state", KeyRangeRef("state/"_sr, "state0"_sr).withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) }, + { "time", KeyRangeRef("time/"_sr, "time0"_sr).withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) } }; std::set SpecialKeySpace::options = { "excluded/force", "failed/force", "excluded_locality/force", - "failed_locality/force" }; + "failed_locality/force", + "worker_interfaces/verify" }; std::set SpecialKeySpace::tracingOptions = { kTracingTransactionIdKey, kTracingTokenKey }; @@ -148,13 +128,18 @@ RangeResult rywGetRange(ReadYourWritesTransaction* ryw, const KeyRangeRef& kr, c ACTOR Future moveKeySelectorOverRangeActor(const SpecialKeyRangeReadImpl* skrImpl, ReadYourWritesTransaction* ryw, KeySelector* ks, - Optional* cache) { + KeyRangeMap>* cache) { // should be removed before calling ASSERT(!ks->orEqual); // never being called if KeySelector is already normalized ASSERT(ks->offset != 1); + // Throw error if module doesn't support tenants and we have a tenant + if (ryw->getTenant().present() && !skrImpl->supportsTenants()) { + throw illegal_tenant_access(); + } + state Key startKey(skrImpl->getKeyRange().begin); state Key endKey(skrImpl->getKeyRange().end); state RangeResult result; @@ -234,7 +219,7 @@ ACTOR Future normalizeKeySelectorActor(SpecialKeySpace* sks, KeyRangeRef boundary, int* actualOffset, RangeResult* result, - Optional* cache) { + KeyRangeMap>* cache) { // If offset < 1, where we need to move left, iter points to the range containing at least one smaller key // (It's a wasting of time to walk through the range whose begin key is same as ks->key) // (rangeContainingKeyBefore itself handles the case where ks->key == Key()) @@ -319,7 +304,9 @@ ACTOR Future SpecialKeySpace::checkRYWValid(SpecialKeySpace* sks, wait(SpecialKeySpace::getRangeAggregationActor(sks, ryw, begin, end, limits, reverse))) { return result; } - when(wait(ryw->resetFuture())) { throw internal_error(); } + when(wait(ryw->resetFuture())) { + throw internal_error(); + } } } @@ -337,8 +324,9 @@ ACTOR Future SpecialKeySpace::getRangeAggregationActor(SpecialKeySp state int actualBeginOffset; state int actualEndOffset; state KeyRangeRef moduleBoundary; - // used to cache result from potential first read - state Optional cache; + // used to cache results from potential first async read + // the current implementation will read the whole range result to save in the cache + state KeyRangeMap> cache(Optional(), specialKeys.end); if (ryw->specialKeySpaceRelaxed()) { moduleBoundary = sks->range; @@ -364,16 +352,31 @@ ACTOR Future SpecialKeySpace::getRangeAggregationActor(SpecialKeySp // Handle all corner cases like what RYW does // return if range inverted if (actualBeginOffset >= actualEndOffset && begin.getKey() >= end.getKey()) { - TEST(true); // inverted range + CODE_PROBE(true, "inverted range"); return RangeResultRef(false, false); } // If touches begin or end, return with readToBegin and readThroughEnd flags if (begin.getKey() == moduleBoundary.end || end.getKey() == moduleBoundary.begin) { - TEST(true); // query touches begin or end + CODE_PROBE(true, "query touches begin or end"); return result; } state RangeMap::Ranges ranges = 
sks->getReadImpls().intersectingRanges(KeyRangeRef(begin.getKey(), end.getKey())); + + // Check tenant legality separately from below iterations + // because it may be partially completed and returned + // before illegal range is checked due to the limits handler + if (ryw->getTenant().present()) { + for (auto iter : ranges) { + if (iter->value() == nullptr) { + continue; + } + if (!iter->value()->supportsTenants()) { + throw illegal_tenant_access(); + } + } + } + // TODO : workaround to write this two together to make the code compact // The issue here is boost::iterator_range<> doest not provide rbegin(), rend() iter = reverse ? ranges.end() : ranges.begin(); @@ -385,7 +388,7 @@ ACTOR Future SpecialKeySpace::getRangeAggregationActor(SpecialKeySp KeyRangeRef kr = iter->range(); KeyRef keyStart = kr.contains(begin.getKey()) ? begin.getKey() : kr.begin; KeyRef keyEnd = kr.contains(end.getKey()) ? end.getKey() : kr.end; - if (iter->value()->isAsync() && cache.present()) { + if (iter->value()->isAsync() && cache.rangeContaining(keyStart).value().present()) { const SpecialKeyRangeAsyncImpl* ptr = dynamic_cast(iter->value()); RangeResult pairs_ = wait(ptr->getRange(ryw, KeyRangeRef(keyStart, keyEnd), limits, &cache)); pairs = pairs_; @@ -416,7 +419,7 @@ ACTOR Future SpecialKeySpace::getRangeAggregationActor(SpecialKeySp KeyRangeRef kr = iter->range(); KeyRef keyStart = kr.contains(begin.getKey()) ? begin.getKey() : kr.begin; KeyRef keyEnd = kr.contains(end.getKey()) ? end.getKey() : kr.end; - if (iter->value()->isAsync() && cache.present()) { + if (iter->value()->isAsync() && cache.rangeContaining(keyStart).value().present()) { const SpecialKeyRangeAsyncImpl* ptr = dynamic_cast(iter->value()); RangeResult pairs_ = wait(ptr->getRange(ryw, KeyRangeRef(keyStart, keyEnd), limits, &cache)); pairs = pairs_; @@ -453,7 +456,7 @@ Future SpecialKeySpace::getRange(ReadYourWritesTransaction* ryw, if (!limits.isValid()) return range_limits_invalid(); if (limits.isReached()) { - TEST(true); // read limit 0 + CODE_PROBE(true, "Special Key Space range read limit 0"); return RangeResult(); } // make sure orEqual == false @@ -461,7 +464,7 @@ Future SpecialKeySpace::getRange(ReadYourWritesTransaction* ryw, end.removeOrEqual(end.arena()); if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) { - TEST(true); // range inverted + CODE_PROBE(true, "range inverted"); return RangeResult(); } @@ -499,6 +502,9 @@ void SpecialKeySpace::set(ReadYourWritesTransaction* ryw, const KeyRef& key, con .detail("Value", value.toString()); throw special_keys_no_write_module_found(); } + if (!impl->supportsTenants() && ryw->getTenant().present()) { + throw illegal_tenant_access(); + } return impl->set(ryw, key, value); } @@ -516,6 +522,9 @@ void SpecialKeySpace::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& r TraceEvent(SevDebug, "SpecialKeySpaceNoWriteModuleFound").detail("Range", range); throw special_keys_no_write_module_found(); } + if (!begin->supportsTenants() && ryw->getTenant().present()) { + throw illegal_tenant_access(); + } return begin->clear(ryw, range); } @@ -525,6 +534,9 @@ void SpecialKeySpace::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) { auto impl = writeImpls[key]; if (impl == nullptr) throw special_keys_no_write_module_found(); + if (!impl->supportsTenants() && ryw->getTenant().present()) { + throw illegal_tenant_access(); + } return impl->clear(ryw, key); } @@ -536,8 +548,8 @@ bool validateSnakeCaseNaming(const KeyRef& k) { // Suffix can be \xff\xff or \x00 in single key range 
if (key.endsWith(specialKeys.begin)) key = key.removeSuffix(specialKeys.end); - else if (key.endsWith(LiteralStringRef("\x00"))) - key = key.removeSuffix(LiteralStringRef("\x00")); + else if (key.endsWith("\x00"_sr)) + key = key.removeSuffix("\x00"_sr); for (const char& c : key.toString()) { // only small letters, numbers, '/', '_' is allowed ASSERT((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '/' || c == '_'); @@ -612,6 +624,16 @@ ACTOR Future commitActor(SpecialKeySpace* sks, ReadYourWritesTransaction* ++iter; } state std::vector::const_iterator it; + // Check validity of tenant support before iterating through + // module ptrs and potentially getting partial commits + if (ryw->getTenant().present()) { + for (it = writeModulePtrs.begin(); it != writeModulePtrs.end(); ++it) { + if (!(*it)->supportsTenants()) { + throw illegal_tenant_access(); + } + } + } + for (it = writeModulePtrs.begin(); it != writeModulePtrs.end(); ++it) { Optional msg = wait((*it)->commit(ryw)); if (msg.present()) { @@ -629,12 +651,8 @@ Future SpecialKeySpace::commit(ReadYourWritesTransaction* ryw) { return commitActor(this, ryw); } -SKSCTestImpl::SKSCTestImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {} - -Future SKSCTestImpl::getRange(ReadYourWritesTransaction* ryw, - KeyRangeRef kr, - GetRangeLimits limitsHint) const { - ASSERT(range.contains(kr)); +// For SKSCTestRWImpl and SKSCTestAsyncReadImpl +Future SKSCTestGetRangeBase(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) { auto resultFuture = ryw->getRange(kr, CLIENT_KNOBS->TOO_MANY); // all keys are written to RYW, since GRV is set, the read should happen locally ASSERT(resultFuture.isReady()); @@ -644,11 +662,29 @@ Future SKSCTestImpl::getRange(ReadYourWritesTransaction* ryw, return rywGetRange(ryw, kr, kvs); } -Future> SKSCTestImpl::commit(ReadYourWritesTransaction* ryw) { +SKSCTestRWImpl::SKSCTestRWImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {} + +Future SKSCTestRWImpl::getRange(ReadYourWritesTransaction* ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const { + ASSERT(range.contains(kr)); + return SKSCTestGetRangeBase(ryw, kr, limitsHint); +} + +Future> SKSCTestRWImpl::commit(ReadYourWritesTransaction* ryw) { ASSERT(false); return Optional(); } +SKSCTestAsyncReadImpl::SKSCTestAsyncReadImpl(KeyRangeRef kr) : SpecialKeyRangeAsyncImpl(kr) {} + +Future SKSCTestAsyncReadImpl::getRange(ReadYourWritesTransaction* ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const { + ASSERT(range.contains(kr)); + return SKSCTestGetRangeBase(ryw, kr, limitsHint); +} + ReadConflictRangeImpl::ReadConflictRangeImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {} ACTOR static Future getReadConflictRangeImpl(ReadYourWritesTransaction* ryw, KeyRange kr) { @@ -679,13 +715,14 @@ Future ConflictingKeysImpl::getRange(ReadYourWritesTransaction* ryw if (ryw->getTransactionState()->conflictingKeys) { auto krMapPtr = ryw->getTransactionState()->conflictingKeys.get(); auto beginIter = krMapPtr->rangeContaining(kr.begin); - if (beginIter->begin() != kr.begin) - ++beginIter; auto endIter = krMapPtr->rangeContaining(kr.end); + + if (!kr.contains(beginIter->begin()) && beginIter != endIter) + ++beginIter; for (auto it = beginIter; it != endIter; ++it) { result.push_back_deep(result.arena(), KeyValueRef(it->begin(), it->value())); } - if (endIter->begin() != kr.end) + if (kr.contains(endIter->begin())) result.push_back_deep(result.arena(), KeyValueRef(endIter->begin(), endIter->value())); } return result; @@ -695,8 +732,8 @@ ACTOR Future 
ddMetricsGetRangeActor(ReadYourWritesTransaction* ryw, loop { try { auto keys = kr.removePrefix(ddStatsRange.begin); - Standalone> resultWithoutPrefix = wait( - waitDataDistributionMetricsList(ryw->getDatabase(), keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT)); + Standalone> resultWithoutPrefix = + wait(waitDataDistributionMetricsList(ryw->getDatabase(), keys, CLIENT_KNOBS->TOO_MANY)); RangeResult result; for (const auto& ddMetricsRef : resultWithoutPrefix) { // each begin key is the previous end key, thus we only encode the begin key in the result @@ -732,7 +769,7 @@ Future DDStatsRangeImpl::getRange(ReadYourWritesTransaction* ryw, } Key SpecialKeySpace::getManagementApiCommandOptionSpecialKey(const std::string& command, const std::string& option) { - Key prefix = LiteralStringRef("options/").withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin); + Key prefix = "options/"_sr.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin); auto pair = command + "/" + option; ASSERT(options.find(pair) != options.end()); return prefix.withSuffix(pair); @@ -863,11 +900,11 @@ void ExcludeServersRangeImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& Key ExcludeServersRangeImpl::decode(const KeyRef& key) const { return key.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) - .withPrefix(LiteralStringRef("\xff/conf/")); + .withPrefix("\xff/conf/"_sr); } Key ExcludeServersRangeImpl::encode(const KeyRef& key) const { - return key.removePrefix(LiteralStringRef("\xff/conf/")) + return key.removePrefix("\xff/conf/"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); } @@ -957,21 +994,13 @@ ACTOR Future checkExclusion(Database db, state int ssTotalCount = 0; state int ssExcludedCount = 0; - state double worstFreeSpaceRatio = 1.0; + + state std::unordered_set diskLocalities; + state int64_t totalKvStoreFreeBytes = 0; + state int64_t totalKvStoreUsedBytes = 0; + state int64_t totalKvStoreUsedBytesNonExcluded = 0; try { for (auto proc : processesMap.obj()) { - bool storageServer = false; - StatusArray rolesArray = proc.second.get_obj()["roles"].get_array(); - for (StatusObjectReader role : rolesArray) { - if (role["role"].get_str() == "storage") { - storageServer = true; - break; - } - } - // Skip non-storage servers in free space calculation - if (!storageServer) - continue; - StatusObjectReader process(proc.second); std::string addrStr; if (!process.get("address", addrStr)) { @@ -981,33 +1010,49 @@ ACTOR Future checkExclusion(Database db, NetworkAddress addr = NetworkAddress::parse(addrStr); bool excluded = (process.has("excluded") && process.last().get_bool()) || addressExcluded(*exclusions, addr); - ssTotalCount++; - if (excluded) - ssExcludedCount++; - if (!excluded) { - StatusObjectReader disk; - if (!process.get("disk", disk)) { - *msg = - ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", errorString); - return false; + StatusObjectReader localityObj; + std::string disk_id; + if (process.get("locality", localityObj)) { + process.get("disk_id", disk_id); // its ok if we don't have this field + } + + StatusArray rolesArray = proc.second.get_obj()["roles"].get_array(); + for (StatusObjectReader role : rolesArray) { + if (role["role"].get_str() == "storage") { + ssTotalCount++; + + int64_t used_bytes; + if (!role.get("kvstore_used_bytes", used_bytes)) { + *msg = ManagementAPIError::toJsonString( + false, markFailed ? 
"exclude failed" : "exclude", errorString); + return false; + } + + int64_t free_bytes; + if (!role.get("kvstore_free_bytes", free_bytes)) { + *msg = ManagementAPIError::toJsonString( + false, markFailed ? "exclude failed" : "exclude", errorString); + return false; + } + + totalKvStoreUsedBytes += used_bytes; + + if (!excluded) { + totalKvStoreUsedBytesNonExcluded += used_bytes; + + if (disk_id.empty() || diskLocalities.find(disk_id) == diskLocalities.end()) { + totalKvStoreFreeBytes += free_bytes; + if (!disk_id.empty()) { + diskLocalities.insert(disk_id); + } + } + } } - int64_t total_bytes; - if (!disk.get("total_bytes", total_bytes)) { - *msg = - ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", errorString); - return false; + if (excluded) { + ssExcludedCount++; } - - int64_t free_bytes; - if (!disk.get("free_bytes", free_bytes)) { - *msg = - ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", errorString); - return false; - } - - worstFreeSpaceRatio = std::min(worstFreeSpaceRatio, double(free_bytes) / total_bytes); } } } catch (...) // std::exception @@ -1016,14 +1061,15 @@ ACTOR Future checkExclusion(Database db, return false; } - if (ssExcludedCount == ssTotalCount || - (1 - worstFreeSpaceRatio) * ssTotalCount / (ssTotalCount - ssExcludedCount) > 0.9) { + double finalFreeRatio = 1 - (totalKvStoreUsedBytes / (totalKvStoreUsedBytesNonExcluded + totalKvStoreFreeBytes)); + if (ssExcludedCount == ssTotalCount || finalFreeRatio <= 0.1) { std::string temp = "ERROR: This exclude may cause the total free space in the cluster to drop below 10%.\n" "Call set(\"0xff0xff/management/options/exclude/force\", ...) first to exclude without " "checking free space.\n"; *msg = ManagementAPIError::toJsonString(false, markFailed ? 
"exclude failed" : "exclude", temp); return false; } + return true; } @@ -1106,11 +1152,11 @@ void FailedServersRangeImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& k Key FailedServersRangeImpl::decode(const KeyRef& key) const { return key.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) - .withPrefix(LiteralStringRef("\xff/conf/")); + .withPrefix("\xff/conf/"_sr); } Key FailedServersRangeImpl::encode(const KeyRef& key) const { - return key.removePrefix(LiteralStringRef("\xff/conf/")) + return key.removePrefix("\xff/conf/"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); } @@ -1272,8 +1318,7 @@ Future> ProcessClassRangeImpl::commit(ReadYourWritesTransa // validate class type ValueRef processClassType = entry.second.get(); ProcessClass processClass(processClassType.toString(), ProcessClass::DBSource); - if (processClass.classType() == ProcessClass::InvalidClass && - processClassType != LiteralStringRef("default")) { + if (processClass.classType() == ProcessClass::InvalidClass && processClassType != "default"_sr) { std::string error = "ERROR: \'" + processClassType.toString() + "\' is not a valid process class\n"; errorMsg = ManagementAPIError::toJsonString(false, "setclass", error); return errorMsg; @@ -1373,11 +1418,10 @@ ACTOR Future> lockDatabaseCommitActor(ReadYourWritesTransa throw database_locked(); } else if (!val.present()) { // lock database - ryw->getTransaction().atomicOp(databaseLockedKey, - BinaryWriter::toValue(uid, Unversioned()) - .withPrefix(LiteralStringRef("0123456789")) - .withSuffix(LiteralStringRef("\x00\x00\x00\x00")), - MutationRef::SetVersionstampedValue); + ryw->getTransaction().atomicOp( + databaseLockedKey, + BinaryWriter::toValue(uid, Unversioned()).withPrefix("0123456789"_sr).withSuffix("\x00\x00\x00\x00"_sr), + MutationRef::SetVersionstampedValue); ryw->getTransaction().addWriteConflictRange(normalKeys); } @@ -1587,7 +1631,8 @@ Future TracingOptionsImpl::getRange(ReadYourWritesTransaction* ryw, void TracingOptionsImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) { if (ryw->getApproximateSize() > 0) { - ryw->setSpecialKeySpaceErrorMsg("tracing options must be set first"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "configure trace", "tracing options must be set first")); ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional())); return; } @@ -1600,7 +1645,8 @@ void TracingOptionsImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, } else if (value.toString() == "false") { ryw->setToken(0); } else { - ryw->setSpecialKeySpaceErrorMsg("token must be set to true/false"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "configure trace token", "token must be set to true/false")); throw special_keys_api_failure(); } } @@ -1614,12 +1660,12 @@ Future> TracingOptionsImpl::commit(ReadYourWritesTransacti } void TracingOptionsImpl::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) { - ryw->setSpecialKeySpaceErrorMsg("clear range disabled"); + ryw->setSpecialKeySpaceErrorMsg(ManagementAPIError::toJsonString(false, "clear trace", "clear range disabled")); throw special_keys_api_failure(); } void TracingOptionsImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) { - ryw->setSpecialKeySpaceErrorMsg("clear disabled"); + ryw->setSpecialKeySpaceErrorMsg(ManagementAPIError::toJsonString(false, "clear trace", "clear disabled")); throw 
special_keys_api_failure(); } @@ -1629,7 +1675,7 @@ ACTOR Future coordinatorsGetRangeActor(ReadYourWritesTransaction* r state ClusterConnectionString cs = ryw->getDatabase()->getConnectionRecord()->getConnectionString(); state std::vector coordinator_processes = wait(cs.tryResolveHostnames()); RangeResult result; - Key cluster_decription_key = prefix.withSuffix(LiteralStringRef("cluster_description")); + Key cluster_decription_key = prefix.withSuffix("cluster_description"_sr); if (kr.contains(cluster_decription_key)) { result.push_back_deep(result.arena(), KeyValueRef(cluster_decription_key, cs.clusterKeyName())); } @@ -1644,7 +1690,7 @@ ACTOR Future coordinatorsGetRangeActor(ReadYourWritesTransaction* r processes_str += ","; processes_str += w.toString(); } - Key processes_key = prefix.withSuffix(LiteralStringRef("processes")); + Key processes_key = prefix.withSuffix("processes"_sr); if (kr.contains(processes_key)) { result.push_back_deep(result.arena(), KeyValueRef(processes_key, Value(processes_str))); } @@ -1666,7 +1712,7 @@ ACTOR static Future> coordinatorsCommitActor(ReadYourWrite state bool parse_error = false; // check update for coordinators - Key processes_key = LiteralStringRef("processes").withPrefix(kr.begin); + Key processes_key = "processes"_sr.withPrefix(kr.begin); auto processes_entry = ryw->getSpecialKeySpaceWriteMap()[processes_key]; if (processes_entry.first) { ASSERT(processes_entry.second.present()); // no clear should be seen here @@ -1706,7 +1752,7 @@ ACTOR static Future> coordinatorsCommitActor(ReadYourWrite std::string newName; // check update for cluster_description - Key cluster_decription_key = LiteralStringRef("cluster_description").withPrefix(kr.begin); + Key cluster_decription_key = "cluster_description"_sr.withPrefix(kr.begin); auto entry = ryw->getSpecialKeySpaceWriteMap()[cluster_decription_key]; if (entry.first) { // check valid description [a-zA-Z0-9_]+ @@ -1720,11 +1766,15 @@ ACTOR static Future> coordinatorsCommitActor(ReadYourWrite } } + auto configDBEntry = ryw->getSpecialKeySpaceWriteMap()["config_db"_sr.withPrefix(kr.begin)]; + TraceEvent(SevDebug, "SKSChangeCoordinatorsStart") .detail("NewConnectionString", conn.toString()) - .detail("Description", entry.first ? entry.second.get().toString() : ""); + .detail("Description", entry.first ? entry.second.get().toString() : "") + .detail("ConfigDBDisabled", configDBEntry.first); - Optional r = wait(changeQuorumChecker(&ryw->getTransaction(), &conn, newName)); + Optional r = + wait(changeQuorumChecker(&ryw->getTransaction(), &conn, newName, configDBEntry.first)); TraceEvent(SevDebug, "SKSChangeCoordinatorsFinish") .detail("Result", r.present() ? 
static_cast(r.get()) : -1); // -1 means success @@ -1947,7 +1997,7 @@ Future ClientProfilingImpl::getRange(ReadYourWritesTransaction* ryw KeyRef prefix = getKeyRange().begin; RangeResult result = RangeResult(); // client_txn_sample_rate - Key sampleRateKey = LiteralStringRef("client_txn_sample_rate").withPrefix(prefix); + Key sampleRateKey = "client_txn_sample_rate"_sr.withPrefix(prefix); ryw->getTransaction().setOption(FDBTransactionOptions::RAW_ACCESS); @@ -1968,7 +2018,7 @@ Future ClientProfilingImpl::getRange(ReadYourWritesTransaction* ryw } } // client_txn_size_limit - Key txnSizeLimitKey = LiteralStringRef("client_txn_size_limit").withPrefix(prefix); + Key txnSizeLimitKey = "client_txn_size_limit"_sr.withPrefix(prefix); if (kr.contains(txnSizeLimitKey)) { auto entry = ryw->getSpecialKeySpaceWriteMap()[txnSizeLimitKey]; if (!ryw->readYourWritesDisabled() && entry.first) { @@ -1994,7 +2044,7 @@ Future> ClientProfilingImpl::commit(ReadYourWritesTransact Standalone> clears; // client_txn_sample_rate - Key sampleRateKey = LiteralStringRef("client_txn_sample_rate").withPrefix(getKeyRange().begin); + Key sampleRateKey = "client_txn_sample_rate"_sr.withPrefix(getKeyRange().begin); auto rateEntry = ryw->getSpecialKeySpaceWriteMap()[sampleRateKey]; if (rateEntry.first && rateEntry.second.present()) { @@ -2005,7 +2055,7 @@ Future> ClientProfilingImpl::commit(ReadYourWritesTransact } else { try { double sampleRate = boost::lexical_cast(sampleRateStr); - Tuple rate = Tuple().appendDouble(sampleRate); + Tuple rate = Tuple::makeTuple(sampleRate); insertions.push_back_deep(insertions.arena(), KeyValueRef(fdbClientInfoTxnSampleRate, rate.pack())); } catch (boost::bad_lexical_cast& e) { return Optional(ManagementAPIError::toJsonString( @@ -2014,7 +2064,7 @@ Future> ClientProfilingImpl::commit(ReadYourWritesTransact } } // client_txn_size_limit - Key txnSizeLimitKey = LiteralStringRef("client_txn_size_limit").withPrefix(getKeyRange().begin); + Key txnSizeLimitKey = "client_txn_size_limit"_sr.withPrefix(getKeyRange().begin); auto sizeLimitEntry = ryw->getSpecialKeySpaceWriteMap()[txnSizeLimitKey]; if (sizeLimitEntry.first && sizeLimitEntry.second.present()) { std::string sizeLimitStr = sizeLimitEntry.second.get().toString(); @@ -2024,7 +2074,7 @@ Future> ClientProfilingImpl::commit(ReadYourWritesTransact } else { try { int64_t sizeLimit = boost::lexical_cast(sizeLimitStr); - Tuple size = Tuple().append(sizeLimit); + Tuple size = Tuple::makeTuple(sizeLimit); insertions.push_back_deep(insertions.arena(), KeyValueRef(fdbClientInfoTxnSizeLimit, size.pack())); } catch (boost::bad_lexical_cast& e) { return Optional(ManagementAPIError::toJsonString( @@ -2059,11 +2109,11 @@ void parse(StringRef& val, double& d) { } void parse(StringRef& val, WaitState& w) { - if (val == LiteralStringRef("disk") || val == LiteralStringRef("Disk")) { + if (val == "disk"_sr || val == "Disk"_sr) { w = WaitState::Disk; - } else if (val == LiteralStringRef("network") || val == LiteralStringRef("Network")) { + } else if (val == "network"_sr || val == "Network"_sr) { w = WaitState::Network; - } else if (val == LiteralStringRef("running") || val == LiteralStringRef("Running")) { + } else if (val == "running"_sr || val == "Running"_sr) { w = WaitState::Running; } else { throw std::range_error("failed to parse run state"); @@ -2164,7 +2214,8 @@ ACTOR static Future actorLineageGetRangeActor(ReadYourWritesTransac state std::vector endValues = kr.end.removePrefix(prefix).splitAny("/"_sr); // Require index (either "state" or "time") and 
address:port. if (beginValues.size() < 2 || endValues.size() < 2) { - ryw->setSpecialKeySpaceErrorMsg("missing required parameters (index, host)"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "read actor_lineage", "missing required parameters (index, host)")); throw special_keys_api_failure(); } @@ -2183,12 +2234,14 @@ ACTOR static Future actorLineageGetRangeActor(ReadYourWritesTransac parse(endValues.begin() + 1, endValues.end(), endRangeHost, timeEnd, waitStateEnd, seqEnd); } } else { - ryw->setSpecialKeySpaceErrorMsg("invalid index in actor_lineage"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "read actor_lineage", "invalid index in actor_lineage")); throw special_keys_api_failure(); } } catch (Error& e) { if (e.code() != special_keys_api_failure().code()) { - ryw->setSpecialKeySpaceErrorMsg("failed to parse key"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "read actor_lineage", "failed to parse key")); throw special_keys_api_failure(); } else { throw e; @@ -2198,7 +2251,8 @@ ACTOR static Future actorLineageGetRangeActor(ReadYourWritesTransac if (kr.begin != kr.end && host != endRangeHost) { // The client doesn't know about all the hosts, so a get range covering // multiple hosts has no way of knowing which IP:port combos to use. - ryw->setSpecialKeySpaceErrorMsg("the host must remain the same on both ends of the range"); + ryw->setSpecialKeySpaceErrorMsg(ManagementAPIError::toJsonString( + false, "read actor_lineage", "the host must remain the same on both ends of the range")); throw special_keys_api_failure(); } @@ -2459,7 +2513,7 @@ ACTOR static Future DataDistributionGetRangeActor(ReadYourWritesTra KeyRangeRef kr) { state RangeResult result; // dataDistributionModeKey - state Key modeKey = LiteralStringRef("mode").withPrefix(prefix); + state Key modeKey = "mode"_sr.withPrefix(prefix); ryw->getTransaction().setOption(FDBTransactionOptions::RAW_ACCESS); @@ -2475,7 +2529,7 @@ ACTOR static Future DataDistributionGetRangeActor(ReadYourWritesTra } } // rebalanceDDIgnoreKey - state Key rebalanceIgnoredKey = LiteralStringRef("rebalance_ignored").withPrefix(prefix); + state Key rebalanceIgnoredKey = "rebalance_ignored"_sr.withPrefix(prefix); if (kr.contains(rebalanceIgnoredKey)) { auto entry = ryw->getSpecialKeySpaceWriteMap()[rebalanceIgnoredKey]; if (ryw->readYourWritesDisabled() || !entry.first) { @@ -2502,8 +2556,8 @@ Future> DataDistributionImpl::commit(ReadYourWritesTransac Optional msg; KeyRangeRef kr = getKeyRange(); - Key modeKey = LiteralStringRef("mode").withPrefix(kr.begin); - Key rebalanceIgnoredKey = LiteralStringRef("rebalance_ignored").withPrefix(kr.begin); + Key modeKey = "mode"_sr.withPrefix(kr.begin); + Key rebalanceIgnoredKey = "rebalance_ignored"_sr.withPrefix(kr.begin); auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(kr); for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) { if (!iter->value().first) @@ -2690,11 +2744,11 @@ void ExcludedLocalitiesRangeImpl::set(ReadYourWritesTransaction* ryw, const KeyR Key ExcludedLocalitiesRangeImpl::decode(const KeyRef& key) const { return key.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) - .withPrefix(LiteralStringRef("\xff/conf/")); + .withPrefix("\xff/conf/"_sr); } Key ExcludedLocalitiesRangeImpl::encode(const KeyRef& key) const { - return key.removePrefix(LiteralStringRef("\xff/conf/")) + return key.removePrefix("\xff/conf/"_sr) 
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); } @@ -2719,11 +2773,11 @@ void FailedLocalitiesRangeImpl::set(ReadYourWritesTransaction* ryw, const KeyRef Key FailedLocalitiesRangeImpl::decode(const KeyRef& key) const { return key.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) - .withPrefix(LiteralStringRef("\xff/conf/")); + .withPrefix("\xff/conf/"_sr); } Key FailedLocalitiesRangeImpl::encode(const KeyRef& key) const { - return key.removePrefix(LiteralStringRef("\xff/conf/")) + return key.removePrefix("\xff/conf/"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); } @@ -2731,3 +2785,143 @@ Future> FailedLocalitiesRangeImpl::commit(ReadYourWritesTr // exclude locality with failed option as true. return excludeLocalityCommitActor(ryw, true); } + +// Defined in ReadYourWrites.actor.cpp +ACTOR Future<RangeResult> getWorkerInterfaces(Reference<IClusterConnectionRecord> clusterRecord); +// Defined in NativeAPI.actor.cpp +ACTOR Future<bool> verifyInterfaceActor(Reference<FlowLock> connectLock, ClientWorkerInterface workerInterf); + +ACTOR static Future<RangeResult> workerInterfacesImplGetRangeActor(ReadYourWritesTransaction* ryw, + KeyRef prefix, + KeyRangeRef kr) { + if (!ryw->getDatabase().getPtr() || !ryw->getDatabase()->getConnectionRecord()) + return RangeResult(); + + state RangeResult interfs = wait(getWorkerInterfaces(ryw->getDatabase()->getConnectionRecord())); + // for options' special keys, the boolean flag indicates if it's a SET operation + auto [verify, _] = ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandOptionSpecialKey( + "worker_interfaces", "verify")]; + state RangeResult result; + if (verify) { + // if verify option is set, we try to talk to every worker and only returns those we can talk to + Reference<FlowLock> connectLock(new FlowLock(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM)); + state std::vector<Future<bool>> verifyInterfs; + for (const auto& [k_, value] : interfs) { + auto k = k_.withPrefix(prefix); + if (kr.contains(k)) { + ClientWorkerInterface workerInterf = + BinaryReader::fromStringRef<ClientWorkerInterface>(value, IncludeVersion()); + verifyInterfs.push_back(verifyInterfaceActor(connectLock, workerInterf)); + } else { + verifyInterfs.push_back(false); + } + } + wait(waitForAll(verifyInterfs)); + // state int index; + for (int index = 0; index < interfs.size(); index++) { + if (verifyInterfs[index].get()) { + // if we can establish a connection, add the kv pair into the result + result.push_back_deep(result.arena(), + KeyValueRef(interfs[index].key.withPrefix(prefix), interfs[index].value)); + } + } + } else { + for (const auto& [k_, v] : interfs) { + auto k = k_.withPrefix(prefix); + if (kr.contains(k)) + result.push_back_deep(result.arena(), KeyValueRef(k, v)); + } + } + std::sort(result.begin(), result.end(), KeyValueRef::OrderByKey{}); + return result; +} + +WorkerInterfacesSpecialKeyImpl::WorkerInterfacesSpecialKeyImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {} + +Future<RangeResult> WorkerInterfacesSpecialKeyImpl::getRange(ReadYourWritesTransaction* ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const { + return workerInterfacesImplGetRangeActor(ryw, getKeyRange().begin, kr); +} + +ACTOR Future<Void> validateSpecialSubrangeRead(ReadYourWritesTransaction* ryw, + KeySelector begin, + KeySelector end, + GetRangeLimits limits, + Reverse reverse, + RangeResult result) { + if (!result.size()) { + RangeResult testResult = wait(ryw->getRange(begin, end, limits, Snapshot::True, reverse)); + ASSERT(testResult == result); + return Void(); + } +
+ if (reverse) { + ASSERT(std::is_sorted(result.begin(), result.end(), KeyValueRef::OrderByKeyBack{})); + } else { + ASSERT(std::is_sorted(result.begin(), result.end(), KeyValueRef::OrderByKey{})); + } + + // Generate a keyrange where we can determine the expected result based solely on the previous readrange, and whose + // boundaries may or may not be keys in result. + std::vector<Key> candidateKeys; + if (reverse) { + for (int i = result.size() - 1; i >= 0; --i) { + candidateKeys.emplace_back(result[i].key); + if (i - 1 >= 0) { + candidateKeys.emplace_back(keyBetween(KeyRangeRef(result[i].key, result[i - 1].key))); + } + } + } else { + for (int i = 0; i < result.size(); ++i) { + candidateKeys.emplace_back(result[i].key); + if (i + 1 < result.size()) { + candidateKeys.emplace_back(keyBetween(KeyRangeRef(result[i].key, result[i + 1].key))); + } + } + } + std::sort(candidateKeys.begin(), candidateKeys.end()); + int originalSize = candidateKeys.size(); + // Add more candidate keys so that we might read a range between two adjacent result keys. + for (int i = 0; i < originalSize - 1; ++i) { + candidateKeys.emplace_back(keyBetween(KeyRangeRef(candidateKeys[i], candidateKeys[i + 1]))); + } + std::vector<Key> keys; + keys = { deterministicRandom()->randomChoice(candidateKeys), deterministicRandom()->randomChoice(candidateKeys) }; + std::sort(keys.begin(), keys.end()); + state KeySelector testBegin = firstGreaterOrEqual(keys[0]); + state KeySelector testEnd = firstGreaterOrEqual(keys[1]); + + // Generate expected result. Linear time is ok here since we're in simulation, and there's a benefit to keeping this + // simple (as we're using it as a test oracle) + state RangeResult expectedResult; + // The reverse parameter should be the same as for the original read, so if + // reverse is true then the results are _already_ in reverse order.
+ for (const auto& kr : result) { + if (kr.key >= keys[0] && kr.key < keys[1]) { + expectedResult.push_back(expectedResult.arena(), kr); + } + } + + // Test + RangeResult testResult = wait(ryw->getRange(testBegin, testEnd, limits, Snapshot::True, reverse)); + if (testResult != expectedResult) { + fmt::print("Reverse: {}\n", reverse); + fmt::print("Original range: [{}, {})\n", begin.toString(), end.toString()); + fmt::print("Original result:\n"); + for (const auto& kr : result) { + fmt::print(" {} -> {}\n", kr.key.printable(), kr.value.printable()); + } + fmt::print("Test range: [{}, {})\n", testBegin.getKey().printable(), testEnd.getKey().printable()); + fmt::print("Expected:\n"); + for (const auto& kr : expectedResult) { + fmt::print(" {} -> {}\n", kr.key.printable(), kr.value.printable()); + } + fmt::print("Got:\n"); + for (const auto& kr : testResult) { + fmt::print(" {} -> {}\n", kr.key.printable(), kr.value.printable()); + } + ASSERT(testResult == expectedResult); + } + return Void(); +} diff --git a/fdbclient/StorageServerInterface.cpp b/fdbclient/StorageServerInterface.cpp index 8a9b6183a4..535ef8f98f 100644 --- a/fdbclient/StorageServerInterface.cpp +++ b/fdbclient/StorageServerInterface.cpp @@ -129,7 +129,7 @@ const char* TSS_mismatchTraceName(const GetKeyValuesRequest& req) { static void traceKeyValuesSummary(TraceEvent& event, const KeySelectorRef& begin, const KeySelectorRef& end, - Optional tenant, + Optional tenant, Version version, int limit, int limitBytes, @@ -152,7 +152,7 @@ static void traceKeyValuesSummary(TraceEvent& event, static void traceKeyValuesDiff(TraceEvent& event, const KeySelectorRef& begin, const KeySelectorRef& end, - Optional tenant, + Optional tenant, Version version, int limit, int limitBytes, diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index c140104335..a720547b56 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -31,20 +31,20 @@ FDB_DEFINE_BOOLEAN_PARAM(AssignEmptyRange); FDB_DEFINE_BOOLEAN_PARAM(UnassignShard); -const KeyRef systemKeysPrefix = LiteralStringRef("\xff"); +const KeyRef systemKeysPrefix = "\xff"_sr; const KeyRangeRef normalKeys(KeyRef(), systemKeysPrefix); -const KeyRangeRef systemKeys(systemKeysPrefix, LiteralStringRef("\xff\xff")); -const KeyRangeRef nonMetadataSystemKeys(LiteralStringRef("\xff\x02"), LiteralStringRef("\xff\x03")); +const KeyRangeRef systemKeys(systemKeysPrefix, "\xff\xff"_sr); +const KeyRangeRef nonMetadataSystemKeys("\xff\x02"_sr, "\xff\x03"_sr); const KeyRangeRef allKeys = KeyRangeRef(normalKeys.begin, systemKeys.end); -const KeyRef afterAllKeys = LiteralStringRef("\xff\xff\x00"); -const KeyRangeRef specialKeys = KeyRangeRef(LiteralStringRef("\xff\xff"), LiteralStringRef("\xff\xff\xff")); +const KeyRef afterAllKeys = "\xff\xff\x00"_sr; +const KeyRangeRef specialKeys = KeyRangeRef("\xff\xff"_sr, "\xff\xff\xff"_sr); // keyServersKeys.contains(k) iff k.startsWith(keyServersPrefix) -const KeyRangeRef keyServersKeys(LiteralStringRef("\xff/keyServers/"), LiteralStringRef("\xff/keyServers0")); +const KeyRangeRef keyServersKeys("\xff/keyServers/"_sr, "\xff/keyServers0"_sr); const KeyRef keyServersPrefix = keyServersKeys.begin; const KeyRef keyServersEnd = keyServersKeys.end; -const KeyRangeRef keyServersKeyServersKeys(LiteralStringRef("\xff/keyServers/\xff/keyServers/"), - LiteralStringRef("\xff/keyServers/\xff/keyServers0")); +const KeyRangeRef keyServersKeyServersKeys("\xff/keyServers/\xff/keyServers/"_sr, + "\xff/keyServers/\xff/keyServers0"_sr); const KeyRef 
keyServersKeyServersKey = keyServersKeyServersKeys.begin; // These constants are selected to be easily recognized during debugging. @@ -274,20 +274,17 @@ void decodeKeyServersValue(std::map const& tag_uid, } const KeyRangeRef conflictingKeysRange = - KeyRangeRef(LiteralStringRef("\xff\xff/transaction/conflicting_keys/"), - LiteralStringRef("\xff\xff/transaction/conflicting_keys/\xff\xff")); -const ValueRef conflictingKeysTrue = LiteralStringRef("1"); -const ValueRef conflictingKeysFalse = LiteralStringRef("0"); + KeyRangeRef("\xff\xff/transaction/conflicting_keys/"_sr, "\xff\xff/transaction/conflicting_keys/\xff\xff"_sr); +const ValueRef conflictingKeysTrue = "1"_sr; +const ValueRef conflictingKeysFalse = "0"_sr; const KeyRangeRef readConflictRangeKeysRange = - KeyRangeRef(LiteralStringRef("\xff\xff/transaction/read_conflict_range/"), - LiteralStringRef("\xff\xff/transaction/read_conflict_range/\xff\xff")); + KeyRangeRef("\xff\xff/transaction/read_conflict_range/"_sr, "\xff\xff/transaction/read_conflict_range/\xff\xff"_sr); -const KeyRangeRef writeConflictRangeKeysRange = - KeyRangeRef(LiteralStringRef("\xff\xff/transaction/write_conflict_range/"), - LiteralStringRef("\xff\xff/transaction/write_conflict_range/\xff\xff")); +const KeyRangeRef writeConflictRangeKeysRange = KeyRangeRef("\xff\xff/transaction/write_conflict_range/"_sr, + "\xff\xff/transaction/write_conflict_range/\xff\xff"_sr); -const KeyRef clusterIdKey = LiteralStringRef("\xff/clusterId"); +const KeyRef clusterIdKey = "\xff/clusterId"_sr; const KeyRef checkpointPrefix = "\xff/checkpoint/"_sr; @@ -344,7 +341,7 @@ DataMoveMetaData decodeDataMoveValue(const ValueRef& value) { } // "\xff/cacheServer/[[UID]] := StorageServerInterface" -const KeyRangeRef storageCacheServerKeys(LiteralStringRef("\xff/cacheServer/"), LiteralStringRef("\xff/cacheServer0")); +const KeyRangeRef storageCacheServerKeys("\xff/cacheServer/"_sr, "\xff/cacheServer0"_sr); const KeyRef storageCacheServersPrefix = storageCacheServerKeys.begin; const KeyRef storageCacheServersEnd = storageCacheServerKeys.end; @@ -356,16 +353,16 @@ const Key storageCacheServerKey(UID id) { } const Value storageCacheServerValue(const StorageServerInterface& ssi) { - auto protocolVersion = currentProtocolVersion; + auto protocolVersion = currentProtocolVersion(); protocolVersion.addObjectSerializerFlag(); return ObjectWriter::toValue(ssi, IncludeVersion(protocolVersion)); } -const KeyRangeRef ddStatsRange = KeyRangeRef(LiteralStringRef("\xff\xff/metrics/data_distribution_stats/"), - LiteralStringRef("\xff\xff/metrics/data_distribution_stats/\xff\xff")); +const KeyRangeRef ddStatsRange = + KeyRangeRef("\xff\xff/metrics/data_distribution_stats/"_sr, "\xff\xff/metrics/data_distribution_stats/\xff\xff"_sr); // "\xff/storageCache/[[begin]]" := "[[vector]]" -const KeyRangeRef storageCacheKeys(LiteralStringRef("\xff/storageCache/"), LiteralStringRef("\xff/storageCache0")); +const KeyRangeRef storageCacheKeys("\xff/storageCache/"_sr, "\xff/storageCache0"_sr); const KeyRef storageCachePrefix = storageCacheKeys.begin; const Key storageCacheKey(const KeyRef& k) { @@ -427,7 +424,7 @@ const Key serverKeysKey(UID serverID, const KeyRef& key) { BinaryWriter wr(Unversioned()); wr.serializeBytes(serverKeysPrefix); wr << serverID; - wr.serializeBytes(LiteralStringRef("/")); + wr.serializeBytes("/"_sr); wr.serializeBytes(key); return wr.toValue(); } @@ -435,7 +432,7 @@ const Key serverKeysPrefixFor(UID serverID) { BinaryWriter wr(Unversioned()); wr.serializeBytes(serverKeysPrefix); wr << serverID; - 
wr.serializeBytes(LiteralStringRef("/")); + wr.serializeBytes("/"_sr); return wr.toValue(); } UID serverKeysDecodeServer(const KeyRef& key) { @@ -475,18 +472,21 @@ const Value serverKeysValue(const UID& id) { void decodeServerKeysValue(const ValueRef& value, bool& assigned, bool& emptyRange, UID& id) { if (value.size() == 0) { - id = UID(); assigned = false; emptyRange = false; + id = UID(); } else if (value == serverKeysTrue) { assigned = true; emptyRange = false; + id = anonymousShardId; } else if (value == serverKeysTrueEmptyRange) { assigned = true; emptyRange = true; + id = anonymousShardId; } else if (value == serverKeysFalse) { assigned = false; emptyRange = false; + id = UID(); } else { BinaryReader rd(value, IncludeVersion()); ASSERT(rd.protocolVersion().hasShardEncodeLocationMetaData()); @@ -496,13 +496,13 @@ void decodeServerKeysValue(const ValueRef& value, bool& assigned, bool& emptyRan } } -const KeyRef cacheKeysPrefix = LiteralStringRef("\xff\x02/cacheKeys/"); +const KeyRef cacheKeysPrefix = "\xff\x02/cacheKeys/"_sr; const Key cacheKeysKey(uint16_t idx, const KeyRef& key) { BinaryWriter wr(Unversioned()); wr.serializeBytes(cacheKeysPrefix); wr << idx; - wr.serializeBytes(LiteralStringRef("/")); + wr.serializeBytes("/"_sr); wr.serializeBytes(key); return wr.toValue(); } @@ -510,7 +510,7 @@ const Key cacheKeysPrefixFor(uint16_t idx) { BinaryWriter wr(Unversioned()); wr.serializeBytes(cacheKeysPrefix); wr << idx; - wr.serializeBytes(LiteralStringRef("/")); + wr.serializeBytes("/"_sr); return wr.toValue(); } uint16_t cacheKeysDecodeIndex(const KeyRef& key) { @@ -523,9 +523,8 @@ KeyRef cacheKeysDecodeKey(const KeyRef& key) { return key.substr(cacheKeysPrefix.size() + sizeof(uint16_t) + 1); } -const KeyRef cacheChangeKey = LiteralStringRef("\xff\x02/cacheChangeKey"); -const KeyRangeRef cacheChangeKeys(LiteralStringRef("\xff\x02/cacheChangeKeys/"), - LiteralStringRef("\xff\x02/cacheChangeKeys0")); +const KeyRef cacheChangeKey = "\xff\x02/cacheChangeKey"_sr; +const KeyRangeRef cacheChangeKeys("\xff\x02/cacheChangeKeys/"_sr, "\xff\x02/cacheChangeKeys0"_sr); const KeyRef cacheChangePrefix = cacheChangeKeys.begin; const Key cacheChangeKeyFor(uint16_t idx) { BinaryWriter wr(Unversioned()); @@ -540,9 +539,9 @@ uint16_t cacheChangeKeyDecodeIndex(const KeyRef& key) { return idx; } -const KeyRangeRef tssMappingKeys(LiteralStringRef("\xff/tss/"), LiteralStringRef("\xff/tss0")); +const KeyRangeRef tssMappingKeys("\xff/tss/"_sr, "\xff/tss0"_sr); -const KeyRangeRef tssQuarantineKeys(LiteralStringRef("\xff/tssQ/"), LiteralStringRef("\xff/tssQ0")); +const KeyRangeRef tssQuarantineKeys("\xff/tssQ/"_sr, "\xff/tssQ0"_sr); const Key tssQuarantineKeyFor(UID serverID) { BinaryWriter wr(Unversioned()); @@ -558,22 +557,19 @@ UID decodeTssQuarantineKey(KeyRef const& key) { return serverID; } -const KeyRangeRef tssMismatchKeys(LiteralStringRef("\xff/tssMismatch/"), LiteralStringRef("\xff/tssMismatch0")); +const KeyRangeRef tssMismatchKeys("\xff/tssMismatch/"_sr, "\xff/tssMismatch0"_sr); -const KeyRangeRef serverMetadataKeys(LiteralStringRef("\xff/serverMetadata/"), - LiteralStringRef("\xff/serverMetadata0")); +const KeyRangeRef serverMetadataKeys("\xff/serverMetadata/"_sr, "\xff/serverMetadata0"_sr); -const KeyRangeRef serverTagKeys(LiteralStringRef("\xff/serverTag/"), LiteralStringRef("\xff/serverTag0")); +const KeyRangeRef serverTagKeys("\xff/serverTag/"_sr, "\xff/serverTag0"_sr); const KeyRef serverTagPrefix = serverTagKeys.begin; -const KeyRangeRef 
serverTagConflictKeys(LiteralStringRef("\xff/serverTagConflict/"), - LiteralStringRef("\xff/serverTagConflict0")); +const KeyRangeRef serverTagConflictKeys("\xff/serverTagConflict/"_sr, "\xff/serverTagConflict0"_sr); const KeyRef serverTagConflictPrefix = serverTagConflictKeys.begin; // serverTagHistoryKeys is the old tag a storage server uses before it is migrated to a different location. // For example, we can copy a SS file to a remote DC and start the SS there; // The new SS will need to consume the last bits of data from the old tag it is responsible for. -const KeyRangeRef serverTagHistoryKeys(LiteralStringRef("\xff/serverTagHistory/"), - LiteralStringRef("\xff/serverTagHistory0")); +const KeyRangeRef serverTagHistoryKeys("\xff/serverTagHistory/"_sr, "\xff/serverTagHistory0"_sr); const KeyRef serverTagHistoryPrefix = serverTagHistoryKeys.begin; const Key serverTagKeyFor(UID serverID) { @@ -658,12 +654,11 @@ const Key serverTagConflictKeyFor(Tag tag) { return wr.toValue(); } -const KeyRangeRef tagLocalityListKeys(LiteralStringRef("\xff/tagLocalityList/"), - LiteralStringRef("\xff/tagLocalityList0")); +const KeyRangeRef tagLocalityListKeys("\xff/tagLocalityList/"_sr, "\xff/tagLocalityList0"_sr); const KeyRef tagLocalityListPrefix = tagLocalityListKeys.begin; const Key tagLocalityListKeyFor(Optional dcID) { - BinaryWriter wr(AssumeVersion(currentProtocolVersion)); + BinaryWriter wr(AssumeVersion(currentProtocolVersion())); wr.serializeBytes(tagLocalityListKeys.begin); wr << dcID; return wr.toValue(); @@ -676,7 +671,7 @@ const Value tagLocalityListValue(int8_t const& tagLocality) { } Optional decodeTagLocalityListKey(KeyRef const& key) { Optional dcID; - BinaryReader rd(key.removePrefix(tagLocalityListKeys.begin), AssumeVersion(currentProtocolVersion)); + BinaryReader rd(key.removePrefix(tagLocalityListKeys.begin), AssumeVersion(currentProtocolVersion())); rd >> dcID; return dcID; } @@ -687,12 +682,11 @@ int8_t decodeTagLocalityListValue(ValueRef const& value) { return s; } -const KeyRangeRef datacenterReplicasKeys(LiteralStringRef("\xff\x02/datacenterReplicas/"), - LiteralStringRef("\xff\x02/datacenterReplicas0")); +const KeyRangeRef datacenterReplicasKeys("\xff\x02/datacenterReplicas/"_sr, "\xff\x02/datacenterReplicas0"_sr); const KeyRef datacenterReplicasPrefix = datacenterReplicasKeys.begin; const Key datacenterReplicasKeyFor(Optional dcID) { - BinaryWriter wr(AssumeVersion(currentProtocolVersion)); + BinaryWriter wr(AssumeVersion(currentProtocolVersion())); wr.serializeBytes(datacenterReplicasKeys.begin); wr << dcID; return wr.toValue(); @@ -705,7 +699,7 @@ const Value datacenterReplicasValue(int const& replicas) { } Optional decodeDatacenterReplicasKey(KeyRef const& key) { Optional dcID; - BinaryReader rd(key.removePrefix(datacenterReplicasKeys.begin), AssumeVersion(currentProtocolVersion)); + BinaryReader rd(key.removePrefix(datacenterReplicasKeys.begin), AssumeVersion(currentProtocolVersion())); rd >> dcID; return dcID; } @@ -721,27 +715,26 @@ extern const KeyRangeRef tLogDatacentersKeys; extern const KeyRef tLogDatacentersPrefix; const Key tLogDatacentersKeyFor(Optional dcID); -const KeyRangeRef tLogDatacentersKeys(LiteralStringRef("\xff\x02/tLogDatacenters/"), - LiteralStringRef("\xff\x02/tLogDatacenters0")); +const KeyRangeRef tLogDatacentersKeys("\xff\x02/tLogDatacenters/"_sr, "\xff\x02/tLogDatacenters0"_sr); const KeyRef tLogDatacentersPrefix = tLogDatacentersKeys.begin; const Key tLogDatacentersKeyFor(Optional dcID) { - BinaryWriter 
wr(AssumeVersion(currentProtocolVersion)); + BinaryWriter wr(AssumeVersion(currentProtocolVersion())); wr.serializeBytes(tLogDatacentersKeys.begin); wr << dcID; return wr.toValue(); } Optional decodeTLogDatacentersKey(KeyRef const& key) { Optional dcID; - BinaryReader rd(key.removePrefix(tLogDatacentersKeys.begin), AssumeVersion(currentProtocolVersion)); + BinaryReader rd(key.removePrefix(tLogDatacentersKeys.begin), AssumeVersion(currentProtocolVersion())); rd >> dcID; return dcID; } -const KeyRef primaryDatacenterKey = LiteralStringRef("\xff/primaryDatacenter"); +const KeyRef primaryDatacenterKey = "\xff/primaryDatacenter"_sr; // serverListKeys.contains(k) iff k.startsWith( serverListKeys.begin ) because '/'+1 == '0' -const KeyRangeRef serverListKeys(LiteralStringRef("\xff/serverList/"), LiteralStringRef("\xff/serverList0")); +const KeyRangeRef serverListKeys("\xff/serverList/"_sr, "\xff/serverList0"_sr); const KeyRef serverListPrefix = serverListKeys.begin; const Key serverListKeyFor(UID serverID) { @@ -752,7 +745,7 @@ const Key serverListKeyFor(UID serverID) { } const Value serverListValue(StorageServerInterface const& server) { - auto protocolVersion = currentProtocolVersion; + auto protocolVersion = currentProtocolVersion(); protocolVersion.addObjectSerializerFlag(); return ObjectWriter::toValue(server, IncludeVersion(protocolVersion)); } @@ -784,7 +777,7 @@ StorageServerInterface decodeServerListValue(ValueRef const& value) { } Value swVersionValue(SWVersion const& swversion) { - auto protocolVersion = currentProtocolVersion; + auto protocolVersion = currentProtocolVersion(); protocolVersion.addObjectSerializerFlag(); return ObjectWriter::toValue(swversion, IncludeVersion(protocolVersion)); } @@ -797,11 +790,11 @@ SWVersion decodeSWVersionValue(ValueRef const& value) { } // processClassKeys.contains(k) iff k.startsWith( processClassKeys.begin ) because '/'+1 == '0' -const KeyRangeRef processClassKeys(LiteralStringRef("\xff/processClass/"), LiteralStringRef("\xff/processClass0")); +const KeyRangeRef processClassKeys("\xff/processClass/"_sr, "\xff/processClass0"_sr); const KeyRef processClassPrefix = processClassKeys.begin; -const KeyRef processClassChangeKey = LiteralStringRef("\xff/processClassChanges"); -const KeyRef processClassVersionKey = LiteralStringRef("\xff/processClassChangesVersion"); -const ValueRef processClassVersionValue = LiteralStringRef("1"); +const KeyRef processClassChangeKey = "\xff/processClassChanges"_sr; +const KeyRef processClassVersionKey = "\xff/processClassChangesVersion"_sr; +const ValueRef processClassVersionValue = "1"_sr; const Key processClassKeyFor(StringRef processID) { BinaryWriter wr(Unversioned()); @@ -837,21 +830,23 @@ ProcessClass decodeProcessClassValue(ValueRef const& value) { return s; } -const KeyRangeRef configKeys(LiteralStringRef("\xff/conf/"), LiteralStringRef("\xff/conf0")); +const KeyRangeRef configKeys("\xff/conf/"_sr, "\xff/conf0"_sr); const KeyRef configKeysPrefix = configKeys.begin; -const KeyRef perpetualStorageWiggleKey(LiteralStringRef("\xff/conf/perpetual_storage_wiggle")); -const KeyRef perpetualStorageWiggleLocalityKey(LiteralStringRef("\xff/conf/perpetual_storage_wiggle_locality")); -const KeyRef perpetualStorageWiggleIDPrefix( - LiteralStringRef("\xff/storageWiggleID/")); // withSuffix /primary or /remote -const KeyRef perpetualStorageWiggleStatsPrefix( - LiteralStringRef("\xff/storageWiggleStats/")); // withSuffix /primary or /remote +const KeyRef perpetualStorageWiggleKey("\xff/conf/perpetual_storage_wiggle"_sr); +const 
KeyRef perpetualStorageWiggleLocalityKey("\xff/conf/perpetual_storage_wiggle_locality"_sr); +const KeyRef perpetualStorageWiggleIDPrefix("\xff/storageWiggleID/"_sr); // withSuffix /primary or /remote +const KeyRef perpetualStorageWiggleStatsPrefix("\xff/storageWiggleStats/"_sr); // withSuffix /primary or /remote -const KeyRef triggerDDTeamInfoPrintKey(LiteralStringRef("\xff/triggerDDTeamInfoPrint")); +const KeyRef triggerDDTeamInfoPrintKey("\xff/triggerDDTeamInfoPrint"_sr); -const KeyRangeRef excludedServersKeys(LiteralStringRef("\xff/conf/excluded/"), LiteralStringRef("\xff/conf/excluded0")); +const KeyRef consistencyScanInfoKey = "\xff/consistencyScanInfo"_sr; + +const KeyRef encryptionAtRestModeConfKey("\xff/conf/encryption_at_rest_mode"_sr); + +const KeyRangeRef excludedServersKeys("\xff/conf/excluded/"_sr, "\xff/conf/excluded0"_sr); const KeyRef excludedServersPrefix = excludedServersKeys.begin; -const KeyRef excludedServersVersionKey = LiteralStringRef("\xff/conf/excluded"); +const KeyRef excludedServersVersionKey = "\xff/conf/excluded"_sr; AddressExclusion decodeExcludedServersKey(KeyRef const& key) { ASSERT(key.startsWith(excludedServersPrefix)); // Returns an invalid NetworkAddress if given an invalid key (within the prefix) @@ -866,10 +861,9 @@ std::string encodeExcludedServersKey(AddressExclusion const& addr) { return excludedServersPrefix.toString() + addr.toString(); } -const KeyRangeRef excludedLocalityKeys(LiteralStringRef("\xff/conf/excluded_locality/"), - LiteralStringRef("\xff/conf/excluded_locality0")); +const KeyRangeRef excludedLocalityKeys("\xff/conf/excluded_locality/"_sr, "\xff/conf/excluded_locality0"_sr); const KeyRef excludedLocalityPrefix = excludedLocalityKeys.begin; -const KeyRef excludedLocalityVersionKey = LiteralStringRef("\xff/conf/excluded_locality"); +const KeyRef excludedLocalityVersionKey = "\xff/conf/excluded_locality"_sr; std::string decodeExcludedLocalityKey(KeyRef const& key) { ASSERT(key.startsWith(excludedLocalityPrefix)); return key.removePrefix(excludedLocalityPrefix).toString(); @@ -878,9 +872,9 @@ std::string encodeExcludedLocalityKey(std::string const& locality) { return excludedLocalityPrefix.toString() + locality; } -const KeyRangeRef failedServersKeys(LiteralStringRef("\xff/conf/failed/"), LiteralStringRef("\xff/conf/failed0")); +const KeyRangeRef failedServersKeys("\xff/conf/failed/"_sr, "\xff/conf/failed0"_sr); const KeyRef failedServersPrefix = failedServersKeys.begin; -const KeyRef failedServersVersionKey = LiteralStringRef("\xff/conf/failed"); +const KeyRef failedServersVersionKey = "\xff/conf/failed"_sr; AddressExclusion decodeFailedServersKey(KeyRef const& key) { ASSERT(key.startsWith(failedServersPrefix)); // Returns an invalid NetworkAddress if given an invalid key (within the prefix) @@ -895,10 +889,9 @@ std::string encodeFailedServersKey(AddressExclusion const& addr) { return failedServersPrefix.toString() + addr.toString(); } -const KeyRangeRef failedLocalityKeys(LiteralStringRef("\xff/conf/failed_locality/"), - LiteralStringRef("\xff/conf/failed_locality0")); +const KeyRangeRef failedLocalityKeys("\xff/conf/failed_locality/"_sr, "\xff/conf/failed_locality0"_sr); const KeyRef failedLocalityPrefix = failedLocalityKeys.begin; -const KeyRef failedLocalityVersionKey = LiteralStringRef("\xff/conf/failed_locality"); +const KeyRef failedLocalityVersionKey = "\xff/conf/failed_locality"_sr; std::string decodeFailedLocalityKey(KeyRef const& key) { ASSERT(key.startsWith(failedLocalityPrefix)); return 
key.removePrefix(failedLocalityPrefix).toString(); @@ -907,20 +900,18 @@ std::string encodeFailedLocalityKey(std::string const& locality) { return failedLocalityPrefix.toString() + locality; } -// const KeyRangeRef globalConfigKeys( LiteralStringRef("\xff/globalConfig/"), LiteralStringRef("\xff/globalConfig0") ); +// const KeyRangeRef globalConfigKeys( "\xff/globalConfig/"_sr, "\xff/globalConfig0"_sr ); // const KeyRef globalConfigPrefix = globalConfigKeys.begin; -const KeyRangeRef globalConfigDataKeys(LiteralStringRef("\xff/globalConfig/k/"), - LiteralStringRef("\xff/globalConfig/k0")); +const KeyRangeRef globalConfigDataKeys("\xff/globalConfig/k/"_sr, "\xff/globalConfig/k0"_sr); const KeyRef globalConfigKeysPrefix = globalConfigDataKeys.begin; -const KeyRangeRef globalConfigHistoryKeys(LiteralStringRef("\xff/globalConfig/h/"), - LiteralStringRef("\xff/globalConfig/h0")); +const KeyRangeRef globalConfigHistoryKeys("\xff/globalConfig/h/"_sr, "\xff/globalConfig/h0"_sr); const KeyRef globalConfigHistoryPrefix = globalConfigHistoryKeys.begin; -const KeyRef globalConfigVersionKey = LiteralStringRef("\xff/globalConfig/v"); +const KeyRef globalConfigVersionKey = "\xff/globalConfig/v"_sr; -const KeyRangeRef workerListKeys(LiteralStringRef("\xff/worker/"), LiteralStringRef("\xff/worker0")); +const KeyRangeRef workerListKeys("\xff/worker/"_sr, "\xff/worker0"_sr); const KeyRef workerListPrefix = workerListKeys.begin; const Key workerListKeyFor(StringRef processID) { @@ -950,11 +941,10 @@ ProcessData decodeWorkerListValue(ValueRef const& value) { return s; } -const KeyRangeRef backupProgressKeys(LiteralStringRef("\xff\x02/backupProgress/"), - LiteralStringRef("\xff\x02/backupProgress0")); +const KeyRangeRef backupProgressKeys("\xff\x02/backupProgress/"_sr, "\xff\x02/backupProgress0"_sr); const KeyRef backupProgressPrefix = backupProgressKeys.begin; -const KeyRef backupStartedKey = LiteralStringRef("\xff\x02/backupStarted"); -extern const KeyRef backupPausedKey = LiteralStringRef("\xff\x02/backupPaused"); +const KeyRef backupStartedKey = "\xff\x02/backupStarted"_sr; +extern const KeyRef backupPausedKey = "\xff\x02/backupPaused"_sr; const Key backupProgressKeyFor(UID workerID) { BinaryWriter wr(Unversioned()); @@ -997,98 +987,91 @@ std::vector> decodeBackupStartedValue(const ValueRef& va return ids; } -const KeyRef coordinatorsKey = LiteralStringRef("\xff/coordinators"); -const KeyRef logsKey = LiteralStringRef("\xff/logs"); -const KeyRef minRequiredCommitVersionKey = LiteralStringRef("\xff/minRequiredCommitVersion"); -const KeyRef versionEpochKey = LiteralStringRef("\xff/versionEpoch"); +const KeyRef previousCoordinatorsKey = "\xff/previousCoordinators"_sr; +const KeyRef coordinatorsKey = "\xff/coordinators"_sr; +const KeyRef logsKey = "\xff/logs"_sr; +const KeyRef minRequiredCommitVersionKey = "\xff/minRequiredCommitVersion"_sr; +const KeyRef versionEpochKey = "\xff/versionEpoch"_sr; -const KeyRef globalKeysPrefix = LiteralStringRef("\xff/globals"); -const KeyRef lastEpochEndKey = LiteralStringRef("\xff/globals/lastEpochEnd"); -const KeyRef lastEpochEndPrivateKey = LiteralStringRef("\xff\xff/globals/lastEpochEnd"); -const KeyRef killStorageKey = LiteralStringRef("\xff/globals/killStorage"); -const KeyRef killStoragePrivateKey = LiteralStringRef("\xff\xff/globals/killStorage"); -const KeyRef rebootWhenDurableKey = LiteralStringRef("\xff/globals/rebootWhenDurable"); -const KeyRef rebootWhenDurablePrivateKey = LiteralStringRef("\xff\xff/globals/rebootWhenDurable"); -const KeyRef primaryLocalityKey = 
LiteralStringRef("\xff/globals/primaryLocality"); -const KeyRef primaryLocalityPrivateKey = LiteralStringRef("\xff\xff/globals/primaryLocality"); -const KeyRef fastLoggingEnabled = LiteralStringRef("\xff/globals/fastLoggingEnabled"); -const KeyRef fastLoggingEnabledPrivateKey = LiteralStringRef("\xff\xff/globals/fastLoggingEnabled"); +const KeyRef globalKeysPrefix = "\xff/globals"_sr; +const KeyRef lastEpochEndKey = "\xff/globals/lastEpochEnd"_sr; +const KeyRef lastEpochEndPrivateKey = "\xff\xff/globals/lastEpochEnd"_sr; +const KeyRef killStorageKey = "\xff/globals/killStorage"_sr; +const KeyRef killStoragePrivateKey = "\xff\xff/globals/killStorage"_sr; +const KeyRef rebootWhenDurableKey = "\xff/globals/rebootWhenDurable"_sr; +const KeyRef rebootWhenDurablePrivateKey = "\xff\xff/globals/rebootWhenDurable"_sr; +const KeyRef primaryLocalityKey = "\xff/globals/primaryLocality"_sr; +const KeyRef primaryLocalityPrivateKey = "\xff\xff/globals/primaryLocality"_sr; +const KeyRef fastLoggingEnabled = "\xff/globals/fastLoggingEnabled"_sr; +const KeyRef fastLoggingEnabledPrivateKey = "\xff\xff/globals/fastLoggingEnabled"_sr; // Whenever configuration changes or DD related system keyspace is changed(e.g.., serverList), // actor must grab the moveKeysLockOwnerKey and update moveKeysLockWriteKey. // This prevents concurrent write to the same system keyspace. // When the owner of the DD related system keyspace changes, DD will reboot -const KeyRef moveKeysLockOwnerKey = LiteralStringRef("\xff/moveKeysLock/Owner"); -const KeyRef moveKeysLockWriteKey = LiteralStringRef("\xff/moveKeysLock/Write"); +const KeyRef moveKeysLockOwnerKey = "\xff/moveKeysLock/Owner"_sr; +const KeyRef moveKeysLockWriteKey = "\xff/moveKeysLock/Write"_sr; -const KeyRef dataDistributionModeKey = LiteralStringRef("\xff/dataDistributionMode"); +const KeyRef dataDistributionModeKey = "\xff/dataDistributionMode"_sr; const UID dataDistributionModeLock = UID(6345, 3425); // Keys to view and control tag throttling -const KeyRangeRef tagThrottleKeys = - KeyRangeRef(LiteralStringRef("\xff\x02/throttledTags/tag/"), LiteralStringRef("\xff\x02/throttledTags/tag0")); +const KeyRangeRef tagThrottleKeys = KeyRangeRef("\xff\x02/throttledTags/tag/"_sr, "\xff\x02/throttledTags/tag0"_sr); const KeyRef tagThrottleKeysPrefix = tagThrottleKeys.begin; -const KeyRef tagThrottleAutoKeysPrefix = LiteralStringRef("\xff\x02/throttledTags/tag/\x01"); -const KeyRef tagThrottleSignalKey = LiteralStringRef("\xff\x02/throttledTags/signal"); -const KeyRef tagThrottleAutoEnabledKey = LiteralStringRef("\xff\x02/throttledTags/autoThrottlingEnabled"); -const KeyRef tagThrottleLimitKey = LiteralStringRef("\xff\x02/throttledTags/manualThrottleLimit"); -const KeyRef tagThrottleCountKey = LiteralStringRef("\xff\x02/throttledTags/manualThrottleCount"); +const KeyRef tagThrottleAutoKeysPrefix = "\xff\x02/throttledTags/tag/\x01"_sr; +const KeyRef tagThrottleSignalKey = "\xff\x02/throttledTags/signal"_sr; +const KeyRef tagThrottleAutoEnabledKey = "\xff\x02/throttledTags/autoThrottlingEnabled"_sr; +const KeyRef tagThrottleLimitKey = "\xff\x02/throttledTags/manualThrottleLimit"_sr; +const KeyRef tagThrottleCountKey = "\xff\x02/throttledTags/manualThrottleCount"_sr; // Client status info prefix -const KeyRangeRef fdbClientInfoPrefixRange(LiteralStringRef("\xff\x02/fdbClientInfo/"), - LiteralStringRef("\xff\x02/fdbClientInfo0")); +const KeyRangeRef fdbClientInfoPrefixRange("\xff\x02/fdbClientInfo/"_sr, "\xff\x02/fdbClientInfo0"_sr); // See remaining fields in GlobalConfig.actor.h // 
ConsistencyCheck settings -const KeyRef fdbShouldConsistencyCheckBeSuspended = LiteralStringRef("\xff\x02/ConsistencyCheck/Suspend"); +const KeyRef fdbShouldConsistencyCheckBeSuspended = "\xff\x02/ConsistencyCheck/Suspend"_sr; // Request latency measurement key -const KeyRef latencyBandConfigKey = LiteralStringRef("\xff\x02/latencyBandConfig"); +const KeyRef latencyBandConfigKey = "\xff\x02/latencyBandConfig"_sr; // Keyspace to maintain wall clock to version map -const KeyRangeRef timeKeeperPrefixRange(LiteralStringRef("\xff\x02/timeKeeper/map/"), - LiteralStringRef("\xff\x02/timeKeeper/map0")); -const KeyRef timeKeeperVersionKey = LiteralStringRef("\xff\x02/timeKeeper/version"); -const KeyRef timeKeeperDisableKey = LiteralStringRef("\xff\x02/timeKeeper/disable"); +const KeyRangeRef timeKeeperPrefixRange("\xff\x02/timeKeeper/map/"_sr, "\xff\x02/timeKeeper/map0"_sr); +const KeyRef timeKeeperVersionKey = "\xff\x02/timeKeeper/version"_sr; +const KeyRef timeKeeperDisableKey = "\xff\x02/timeKeeper/disable"_sr; // Backup Log Mutation constant variables -const KeyRef backupEnabledKey = LiteralStringRef("\xff/backupEnabled"); -const KeyRangeRef backupLogKeys(LiteralStringRef("\xff\x02/blog/"), LiteralStringRef("\xff\x02/blog0")); -const KeyRangeRef applyLogKeys(LiteralStringRef("\xff\x02/alog/"), LiteralStringRef("\xff\x02/alog0")); +const KeyRef backupEnabledKey = "\xff/backupEnabled"_sr; +const KeyRangeRef backupLogKeys("\xff\x02/blog/"_sr, "\xff\x02/blog0"_sr); +const KeyRangeRef applyLogKeys("\xff\x02/alog/"_sr, "\xff\x02/alog0"_sr); // static_assert( backupLogKeys.begin.size() == backupLogPrefixBytes, "backupLogPrefixBytes incorrect" ); -const KeyRef backupVersionKey = LiteralStringRef("\xff/backupDataFormat"); -const ValueRef backupVersionValue = LiteralStringRef("4"); +const KeyRef backupVersionKey = "\xff/backupDataFormat"_sr; +const ValueRef backupVersionValue = "4"_sr; const int backupVersion = 4; // Log Range constant variables // \xff/logRanges/[16-byte UID][begin key] := serialize( make_pair([end key], [destination key prefix]), // IncludeVersion() ) -const KeyRangeRef logRangesRange(LiteralStringRef("\xff/logRanges/"), LiteralStringRef("\xff/logRanges0")); +const KeyRangeRef logRangesRange("\xff/logRanges/"_sr, "\xff/logRanges0"_sr); // Layer status metadata prefix -const KeyRangeRef layerStatusMetaPrefixRange(LiteralStringRef("\xff\x02/status/"), - LiteralStringRef("\xff\x02/status0")); +const KeyRangeRef layerStatusMetaPrefixRange("\xff\x02/status/"_sr, "\xff\x02/status0"_sr); // Backup agent status root -const KeyRangeRef backupStatusPrefixRange(LiteralStringRef("\xff\x02/backupstatus/"), - LiteralStringRef("\xff\x02/backupstatus0")); +const KeyRangeRef backupStatusPrefixRange("\xff\x02/backupstatus/"_sr, "\xff\x02/backupstatus0"_sr); // Restore configuration constant variables -const KeyRangeRef fileRestorePrefixRange(LiteralStringRef("\xff\x02/restore-agent/"), - LiteralStringRef("\xff\x02/restore-agent0")); +const KeyRangeRef fileRestorePrefixRange("\xff\x02/restore-agent/"_sr, "\xff\x02/restore-agent0"_sr); // Backup Agent configuration constant variables -const KeyRangeRef fileBackupPrefixRange(LiteralStringRef("\xff\x02/backup-agent/"), - LiteralStringRef("\xff\x02/backup-agent0")); +const KeyRangeRef fileBackupPrefixRange("\xff\x02/backup-agent/"_sr, "\xff\x02/backup-agent0"_sr); // DR Agent configuration constant variables -const KeyRangeRef databaseBackupPrefixRange(LiteralStringRef("\xff\x02/db-backup-agent/"), - LiteralStringRef("\xff\x02/db-backup-agent0")); +const 
KeyRangeRef databaseBackupPrefixRange("\xff\x02/db-backup-agent/"_sr, "\xff\x02/db-backup-agent0"_sr); // \xff\x02/sharedLogRangesConfig/destUidLookup/[keyRange] -const KeyRef destUidLookupPrefix = LiteralStringRef("\xff\x02/sharedLogRangesConfig/destUidLookup/"); +const KeyRef destUidLookupPrefix = "\xff\x02/sharedLogRangesConfig/destUidLookup/"_sr; // \xff\x02/sharedLogRangesConfig/backuplatestVersions/[destUid]/[logUid] -const KeyRef backupLatestVersionsPrefix = LiteralStringRef("\xff\x02/sharedLogRangesConfig/backupLatestVersions/"); +const KeyRef backupLatestVersionsPrefix = "\xff\x02/sharedLogRangesConfig/backupLatestVersions/"_sr; // Returns the encoded key comprised of begin key and log uid Key logRangesEncodeKey(KeyRef keyBegin, UID logUid) { @@ -1143,31 +1126,27 @@ Key uidPrefixKey(KeyRef keyPrefix, UID logUid) { // Apply mutations constant variables // \xff/applyMutationsEnd/[16-byte UID] := serialize( endVersion, Unversioned() ) // This indicates what is the highest version the mutation log can be applied -const KeyRangeRef applyMutationsEndRange(LiteralStringRef("\xff/applyMutationsEnd/"), - LiteralStringRef("\xff/applyMutationsEnd0")); +const KeyRangeRef applyMutationsEndRange("\xff/applyMutationsEnd/"_sr, "\xff/applyMutationsEnd0"_sr); // \xff/applyMutationsBegin/[16-byte UID] := serialize( beginVersion, Unversioned() ) -const KeyRangeRef applyMutationsBeginRange(LiteralStringRef("\xff/applyMutationsBegin/"), - LiteralStringRef("\xff/applyMutationsBegin0")); +const KeyRangeRef applyMutationsBeginRange("\xff/applyMutationsBegin/"_sr, "\xff/applyMutationsBegin0"_sr); // \xff/applyMutationsAddPrefix/[16-byte UID] := addPrefix -const KeyRangeRef applyMutationsAddPrefixRange(LiteralStringRef("\xff/applyMutationsAddPrefix/"), - LiteralStringRef("\xff/applyMutationsAddPrefix0")); +const KeyRangeRef applyMutationsAddPrefixRange("\xff/applyMutationsAddPrefix/"_sr, "\xff/applyMutationsAddPrefix0"_sr); // \xff/applyMutationsRemovePrefix/[16-byte UID] := removePrefix -const KeyRangeRef applyMutationsRemovePrefixRange(LiteralStringRef("\xff/applyMutationsRemovePrefix/"), - LiteralStringRef("\xff/applyMutationsRemovePrefix0")); +const KeyRangeRef applyMutationsRemovePrefixRange("\xff/applyMutationsRemovePrefix/"_sr, + "\xff/applyMutationsRemovePrefix0"_sr); -const KeyRangeRef applyMutationsKeyVersionMapRange(LiteralStringRef("\xff/applyMutationsKeyVersionMap/"), - LiteralStringRef("\xff/applyMutationsKeyVersionMap0")); -const KeyRangeRef applyMutationsKeyVersionCountRange(LiteralStringRef("\xff\x02/applyMutationsKeyVersionCount/"), - LiteralStringRef("\xff\x02/applyMutationsKeyVersionCount0")); +const KeyRangeRef applyMutationsKeyVersionMapRange("\xff/applyMutationsKeyVersionMap/"_sr, + "\xff/applyMutationsKeyVersionMap0"_sr); +const KeyRangeRef applyMutationsKeyVersionCountRange("\xff\x02/applyMutationsKeyVersionCount/"_sr, + "\xff\x02/applyMutationsKeyVersionCount0"_sr); -const KeyRef systemTuplesPrefix = LiteralStringRef("\xff/a/"); -const KeyRef metricConfChangeKey = LiteralStringRef("\x01TDMetricConfChanges\x00"); +const KeyRef systemTuplesPrefix = "\xff/a/"_sr; +const KeyRef metricConfChangeKey = "\x01TDMetricConfChanges\x00"_sr; -const KeyRangeRef metricConfKeys(LiteralStringRef("\x01TDMetricConf\x00\x01"), - LiteralStringRef("\x01TDMetricConf\x00\x02")); +const KeyRangeRef metricConfKeys("\x01TDMetricConf\x00\x01"_sr, "\x01TDMetricConf\x00\x02"_sr); const KeyRef metricConfPrefix = metricConfKeys.begin; /* @@ -1176,15 +1155,15 @@ const Key metricConfKey( KeyRef const& prefix, 
MetricNameRef const& name, KeyRef wr.serializeBytes( prefix ); wr.serializeBytes( metricConfPrefix ); wr.serializeBytes( name.type ); - wr.serializeBytes( LiteralStringRef("\x00\x01") ); + wr.serializeBytes( "\x00\x01"_sr ); wr.serializeBytes( name.name ); - wr.serializeBytes( LiteralStringRef("\x00\x01") ); + wr.serializeBytes( "\x00\x01"_sr ); wr.serializeBytes( name.address ); - wr.serializeBytes( LiteralStringRef("\x00\x01") ); + wr.serializeBytes( "\x00\x01"_sr ); wr.serializeBytes( name.id ); - wr.serializeBytes( LiteralStringRef("\x00\x01") ); + wr.serializeBytes( "\x00\x01"_sr ); wr.serializeBytes( key ); - wr.serializeBytes( LiteralStringRef("\x00") ); + wr.serializeBytes( "\x00"_sr ); return wr.toValue(); } @@ -1207,23 +1186,22 @@ std::pair decodeMetricConfKey( KeyRef const& prefix, KeyR } */ -const KeyRef maxUIDKey = LiteralStringRef("\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"); +const KeyRef maxUIDKey = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"_sr; -const KeyRef databaseLockedKey = LiteralStringRef("\xff/dbLocked"); -const KeyRef databaseLockedKeyEnd = LiteralStringRef("\xff/dbLocked\x00"); -const KeyRef metadataVersionKey = LiteralStringRef("\xff/metadataVersion"); -const KeyRef metadataVersionKeyEnd = LiteralStringRef("\xff/metadataVersion\x00"); -const KeyRef metadataVersionRequiredValue = - LiteralStringRef("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"); -const KeyRef mustContainSystemMutationsKey = LiteralStringRef("\xff/mustContainSystemMutations"); +const KeyRef databaseLockedKey = "\xff/dbLocked"_sr; +const KeyRef databaseLockedKeyEnd = "\xff/dbLocked\x00"_sr; +const KeyRef metadataVersionKey = "\xff/metadataVersion"_sr; +const KeyRef metadataVersionKeyEnd = "\xff/metadataVersion\x00"_sr; +const KeyRef metadataVersionRequiredValue = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"_sr; +const KeyRef mustContainSystemMutationsKey = "\xff/mustContainSystemMutations"_sr; -const KeyRangeRef monitorConfKeys(LiteralStringRef("\xff\x02/monitorConf/"), LiteralStringRef("\xff\x02/monitorConf0")); +const KeyRangeRef monitorConfKeys("\xff\x02/monitorConf/"_sr, "\xff\x02/monitorConf0"_sr); -const KeyRef restoreRequestDoneKey = LiteralStringRef("\xff\x02/restoreRequestDone"); +const KeyRef restoreRequestDoneKey = "\xff\x02/restoreRequestDone"_sr; -const KeyRef healthyZoneKey = LiteralStringRef("\xff\x02/healthyZone"); -const StringRef ignoreSSFailuresZoneString = LiteralStringRef("IgnoreSSFailures"); -const KeyRef rebalanceDDIgnoreKey = LiteralStringRef("\xff\x02/rebalanceDDIgnored"); +const KeyRef healthyZoneKey = "\xff\x02/healthyZone"_sr; +const StringRef ignoreSSFailuresZoneString = "IgnoreSSFailures"_sr; +const KeyRef rebalanceDDIgnoreKey = "\xff\x02/rebalanceDDIgnored"_sr; const Value healthyZoneValue(StringRef const& zoneId, Version version) { BinaryWriter wr(IncludeVersion(ProtocolVersion::withHealthyZoneValue())); @@ -1240,16 +1218,15 @@ std::pair decodeHealthyZoneValue(ValueRef const& value) { return std::make_pair(zoneId, version); } -const KeyRangeRef testOnlyTxnStateStorePrefixRange(LiteralStringRef("\xff/TESTONLYtxnStateStore/"), - LiteralStringRef("\xff/TESTONLYtxnStateStore0")); +const KeyRangeRef testOnlyTxnStateStorePrefixRange("\xff/TESTONLYtxnStateStore/"_sr, "\xff/TESTONLYtxnStateStore0"_sr); -const KeyRef writeRecoveryKey = LiteralStringRef("\xff/writeRecovery"); -const ValueRef writeRecoveryKeyTrue = LiteralStringRef("1"); -const KeyRef snapshotEndVersionKey = 
LiteralStringRef("\xff/snapshotEndVersion"); +const KeyRef writeRecoveryKey = "\xff/writeRecovery"_sr; +const ValueRef writeRecoveryKeyTrue = "1"_sr; +const KeyRef snapshotEndVersionKey = "\xff/snapshotEndVersion"_sr; -const KeyRangeRef changeFeedKeys(LiteralStringRef("\xff\x02/feed/"), LiteralStringRef("\xff\x02/feed0")); +const KeyRangeRef changeFeedKeys("\xff\x02/feed/"_sr, "\xff\x02/feed0"_sr); const KeyRef changeFeedPrefix = changeFeedKeys.begin; -const KeyRef changeFeedPrivatePrefix = LiteralStringRef("\xff\xff\x02/feed/"); +const KeyRef changeFeedPrivatePrefix = "\xff\xff\x02/feed/"_sr; const Value changeFeedValue(KeyRangeRef const& range, Version popVersion, ChangeFeedStatus status) { BinaryWriter wr(IncludeVersion(ProtocolVersion::withChangeFeed())); @@ -1270,7 +1247,7 @@ std::tuple decodeChangeFeedValue(ValueRef c return std::make_tuple(range, version, status); } -const KeyRangeRef changeFeedDurableKeys(LiteralStringRef("\xff\xff/cf/"), LiteralStringRef("\xff\xff/cf0")); +const KeyRangeRef changeFeedDurableKeys("\xff\xff/cf/"_sr, "\xff\xff/cf0"_sr); const KeyRef changeFeedDurablePrefix = changeFeedDurableKeys.begin; const Value changeFeedDurableKey(Key const& feed, Version version) { @@ -1310,9 +1287,9 @@ const KeyRangeRef configClassKeys("\xff\xff/configClasses/"_sr, "\xff\xff/config // key to watch for changes in active blob ranges + KeyRangeMap of active blob ranges // Blob Manager + Worker stuff is all \xff\x02 to avoid Transaction State Store -const KeyRef blobRangeChangeKey = LiteralStringRef("\xff\x02/blobRangeChange"); -const KeyRangeRef blobRangeKeys(LiteralStringRef("\xff\x02/blobRange/"), LiteralStringRef("\xff\x02/blobRange0")); -const KeyRef blobManagerEpochKey = LiteralStringRef("\xff\x02/blobManagerEpoch"); +const KeyRef blobRangeChangeKey = "\xff\x02/blobRangeChange"_sr; +const KeyRangeRef blobRangeKeys("\xff\x02/blobRange/"_sr, "\xff\x02/blobRange0"_sr); +const KeyRef blobManagerEpochKey = "\xff\x02/blobManagerEpoch"_sr; const Value blobManagerEpochValueFor(int64_t epoch) { BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule())); @@ -1328,15 +1305,19 @@ int64_t decodeBlobManagerEpochValue(ValueRef const& value) { } // blob granule data -const KeyRangeRef blobGranuleFileKeys(LiteralStringRef("\xff\x02/bgf/"), LiteralStringRef("\xff\x02/bgf0")); -const KeyRangeRef blobGranuleMappingKeys(LiteralStringRef("\xff\x02/bgm/"), LiteralStringRef("\xff\x02/bgm0")); -const KeyRangeRef blobGranuleLockKeys(LiteralStringRef("\xff\x02/bgl/"), LiteralStringRef("\xff\x02/bgl0")); -const KeyRangeRef blobGranuleSplitKeys(LiteralStringRef("\xff\x02/bgs/"), LiteralStringRef("\xff\x02/bgs0")); -const KeyRangeRef blobGranuleMergeKeys(LiteralStringRef("\xff\x02/bgmerge/"), LiteralStringRef("\xff\x02/bgmerge0")); -const KeyRangeRef blobGranuleHistoryKeys(LiteralStringRef("\xff\x02/bgh/"), LiteralStringRef("\xff\x02/bgh0")); -const KeyRangeRef blobGranulePurgeKeys(LiteralStringRef("\xff\x02/bgp/"), LiteralStringRef("\xff\x02/bgp0")); -const KeyRangeRef blobGranuleVersionKeys(LiteralStringRef("\xff\x02/bgv/"), LiteralStringRef("\xff\x02/bgv0")); -const KeyRef blobGranulePurgeChangeKey = LiteralStringRef("\xff\x02/bgpChange"); +const KeyRef blobRangeActive = "1"_sr; +const KeyRef blobRangeInactive = StringRef(); + +const KeyRangeRef blobGranuleFileKeys("\xff\x02/bgf/"_sr, "\xff\x02/bgf0"_sr); +const KeyRangeRef blobGranuleMappingKeys("\xff\x02/bgm/"_sr, "\xff\x02/bgm0"_sr); +const KeyRangeRef blobGranuleLockKeys("\xff\x02/bgl/"_sr, "\xff\x02/bgl0"_sr); +const KeyRangeRef 
blobGranuleSplitKeys("\xff\x02/bgs/"_sr, "\xff\x02/bgs0"_sr); +const KeyRangeRef blobGranuleMergeKeys("\xff\x02/bgmerge/"_sr, "\xff\x02/bgmerge0"_sr); +const KeyRangeRef blobGranuleMergeBoundaryKeys("\xff\x02/bgmergebounds/"_sr, "\xff\x02/bgmergebounds0"_sr); +const KeyRangeRef blobGranuleHistoryKeys("\xff\x02/bgh/"_sr, "\xff\x02/bgh0"_sr); +const KeyRangeRef blobGranulePurgeKeys("\xff\x02/bgp/"_sr, "\xff\x02/bgp0"_sr); +const KeyRangeRef blobGranuleForcePurgedKeys("\xff\x02/bgpforce/"_sr, "\xff\x02/bgpforce0"_sr); +const KeyRef blobGranulePurgeChangeKey = "\xff\x02/bgpChange"_sr; const uint8_t BG_FILE_TYPE_DELTA = 'D'; const uint8_t BG_FILE_TYPE_SNAPSHOT = 'S'; @@ -1524,9 +1505,9 @@ std::pair decodeBlobGranuleSplitValue(const Valu const Value blobGranuleMergeValueFor(KeyRange mergeKeyRange, std::vector parentGranuleIDs, - std::vector parentGranuleRanges, + std::vector parentGranuleRanges, std::vector parentGranuleStartVersions) { - ASSERT(parentGranuleIDs.size() == parentGranuleRanges.size()); + ASSERT(parentGranuleIDs.size() == parentGranuleRanges.size() - 1); ASSERT(parentGranuleIDs.size() == parentGranuleStartVersions.size()); BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule())); @@ -1536,12 +1517,12 @@ const Value blobGranuleMergeValueFor(KeyRange mergeKeyRange, wr << parentGranuleStartVersions; return addVersionStampAtEnd(wr.toValue()); } -std::tuple, std::vector, std::vector> -decodeBlobGranuleMergeValue(ValueRef const& value) { +std::tuple, std::vector, std::vector> decodeBlobGranuleMergeValue( + ValueRef const& value) { KeyRange range; Version v; std::vector parentGranuleIDs; - std::vector parentGranuleRanges; + std::vector parentGranuleRanges; std::vector parentGranuleStartVersions; BinaryReader reader(value, IncludeVersion()); @@ -1551,13 +1532,30 @@ decodeBlobGranuleMergeValue(ValueRef const& value) { reader >> parentGranuleStartVersions; reader >> v; - ASSERT(parentGranuleIDs.size() == parentGranuleRanges.size()); + ASSERT(parentGranuleIDs.size() == parentGranuleRanges.size() - 1); ASSERT(parentGranuleIDs.size() == parentGranuleStartVersions.size()); ASSERT(bigEndian64(v) >= 0); return std::tuple(range, bigEndian64(v), parentGranuleIDs, parentGranuleRanges, parentGranuleStartVersions); } +const Key blobGranuleMergeBoundaryKeyFor(const KeyRef& key) { + return key.withPrefix(blobGranuleMergeBoundaryKeys.begin); +} + +const Value blobGranuleMergeBoundaryValueFor(BlobGranuleMergeBoundary const& boundary) { + BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule())); + wr << boundary; + return wr.toValue(); +} + +Standalone decodeBlobGranuleMergeBoundaryValue(const ValueRef& value) { + Standalone boundaryValue; + BinaryReader reader(value, IncludeVersion()); + reader >> boundaryValue; + return boundaryValue; +} + const Key blobGranuleHistoryKeyFor(KeyRangeRef const& range, Version version) { BinaryWriter wr(AssumeVersion(ProtocolVersion::withBlobGranule())); wr.serializeBytes(blobGranuleHistoryKeys.begin); @@ -1581,6 +1579,8 @@ const KeyRange blobGranuleHistoryKeyRangeFor(KeyRangeRef const& range) { } const Value blobGranuleHistoryValueFor(Standalone const& historyValue) { + ASSERT(historyValue.parentVersions.empty() || + historyValue.parentBoundaries.size() - 1 == historyValue.parentVersions.size()); BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule())); wr << historyValue; return wr.toValue(); @@ -1590,10 +1590,12 @@ Standalone decodeBlobGranuleHistoryValue(const ValueRef Standalone historyValue; BinaryReader reader(value, IncludeVersion()); 
reader >> historyValue; + ASSERT(historyValue.parentVersions.empty() || + historyValue.parentBoundaries.size() - 1 == historyValue.parentVersions.size()); return historyValue; } -const KeyRangeRef blobWorkerListKeys(LiteralStringRef("\xff\x02/bwList/"), LiteralStringRef("\xff\x02/bwList0")); +const KeyRangeRef blobWorkerListKeys("\xff\x02/bwList/"_sr, "\xff\x02/bwList0"_sr); const Key blobWorkerListKeyFor(UID workerID) { BinaryWriter wr(AssumeVersion(ProtocolVersion::withBlobGranule())); @@ -1620,11 +1622,12 @@ BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value) { return interf; } -const KeyRangeRef tenantMapKeys("\xff/tenantMap/"_sr, "\xff/tenantMap0"_sr); -const KeyRef tenantMapPrefix = tenantMapKeys.begin; -const KeyRef tenantMapPrivatePrefix = "\xff\xff/tenantMap/"_sr; -const KeyRef tenantLastIdKey = "\xff/tenantLastId/"_sr; -const KeyRef tenantDataPrefixKey = "\xff/tenantDataPrefix"_sr; +const KeyRangeRef storageQuotaKeys("\xff/storageQuota/"_sr, "\xff/storageQuota0"_sr); +const KeyRef storageQuotaPrefix = storageQuotaKeys.begin; + +Key storageQuotaKey(StringRef tenantName) { + return tenantName.withPrefix(storageQuotaPrefix); +} // for tests void testSSISerdes(StorageServerInterface const& ssi) { diff --git a/fdbclient/TagThrottle.actor.cpp b/fdbclient/TagThrottle.actor.cpp index b5205fd153..7a1712c4df 100644 --- a/fdbclient/TagThrottle.actor.cpp +++ b/fdbclient/TagThrottle.actor.cpp @@ -136,17 +136,11 @@ Key ThrottleApi::getTagQuotaKey(TransactionTagRef tag) { } bool ThrottleApi::TagQuotaValue::isValid() const { - return reservedReadQuota <= totalReadQuota && reservedWriteQuota <= totalWriteQuota && reservedReadQuota >= 0 && - reservedWriteQuota >= 0; + return reservedQuota <= totalQuota && reservedQuota >= 0; } Value ThrottleApi::TagQuotaValue::toValue() const { - Tuple tuple; - tuple.appendDouble(reservedReadQuota); - tuple.appendDouble(totalReadQuota); - tuple.appendDouble(reservedWriteQuota); - tuple.appendDouble(totalWriteQuota); - return tuple.pack(); + return Tuple::makeTuple(reservedQuota, totalQuota).pack(); } ThrottleApi::TagQuotaValue ThrottleApi::TagQuotaValue::fromValue(ValueRef value) { @@ -156,20 +150,16 @@ ThrottleApi::TagQuotaValue ThrottleApi::TagQuotaValue::fromValue(ValueRef value) } TagQuotaValue result; try { - result.reservedReadQuota = tuple.getDouble(0); - result.totalReadQuota = tuple.getDouble(1); - result.reservedWriteQuota = tuple.getDouble(2); - result.totalWriteQuota = tuple.getDouble(3); + result.reservedQuota = tuple.getDouble(0); + result.totalQuota = tuple.getDouble(1); } catch (Error& e) { TraceEvent(SevWarnAlways, "TagQuotaValueFailedToDeserialize").error(e); throw invalid_throttle_quota_value(); } if (!result.isValid()) { TraceEvent(SevWarnAlways, "TagQuotaValueInvalidQuotas") - .detail("ReservedReadQuota", result.reservedReadQuota) - .detail("TotalReadQuota", result.totalReadQuota) - .detail("ReservedWriteQuota", result.reservedWriteQuota) - .detail("TotalWriteQuota", result.totalWriteQuota); + .detail("ReservedQuota", result.reservedQuota) + .detail("TotalQuota", result.totalQuota); throw invalid_throttle_quota_value(); } return result; diff --git a/fdbclient/TaskBucket.actor.cpp b/fdbclient/TaskBucket.actor.cpp index aa9a110086..2e72b301c0 100644 --- a/fdbclient/TaskBucket.actor.cpp +++ b/fdbclient/TaskBucket.actor.cpp @@ -66,7 +66,7 @@ struct UnblockFutureTaskFunc : TaskFuncBase { return Void(); } }; -StringRef UnblockFutureTaskFunc::name = LiteralStringRef("UnblockFuture"); +StringRef UnblockFutureTaskFunc::name = 
"UnblockFuture"_sr; REGISTER_TASKFUNC(UnblockFutureTaskFunc); struct AddTaskFunc : TaskFuncBase { @@ -88,7 +88,7 @@ struct AddTaskFunc : TaskFuncBase { return Void(); }; }; -StringRef AddTaskFunc::name = LiteralStringRef("AddTask"); +StringRef AddTaskFunc::name = "AddTask"_sr; REGISTER_TASKFUNC(AddTaskFunc); struct IdleTaskFunc : TaskFuncBase { @@ -109,18 +109,18 @@ struct IdleTaskFunc : TaskFuncBase { return tb->finish(tr, task); }; }; -StringRef IdleTaskFunc::name = LiteralStringRef("idle"); +StringRef IdleTaskFunc::name = "idle"_sr; REGISTER_TASKFUNC(IdleTaskFunc); -Key Task::reservedTaskParamKeyType = LiteralStringRef("type"); -Key Task::reservedTaskParamKeyAddTask = LiteralStringRef("_add_task"); -Key Task::reservedTaskParamKeyDone = LiteralStringRef("done"); -Key Task::reservedTaskParamKeyPriority = LiteralStringRef("priority"); -Key Task::reservedTaskParamKeyFuture = LiteralStringRef("future"); -Key Task::reservedTaskParamKeyBlockID = LiteralStringRef("blockid"); -Key Task::reservedTaskParamKeyVersion = LiteralStringRef("version"); -Key Task::reservedTaskParamValidKey = LiteralStringRef("_validkey"); -Key Task::reservedTaskParamValidValue = LiteralStringRef("_validvalue"); +Key Task::reservedTaskParamKeyType = "type"_sr; +Key Task::reservedTaskParamKeyAddTask = "_add_task"_sr; +Key Task::reservedTaskParamKeyDone = "done"_sr; +Key Task::reservedTaskParamKeyPriority = "priority"_sr; +Key Task::reservedTaskParamKeyFuture = "future"_sr; +Key Task::reservedTaskParamKeyBlockID = "blockid"_sr; +Key Task::reservedTaskParamKeyVersion = "version"_sr; +Key Task::reservedTaskParamValidKey = "_validkey"_sr; +Key Task::reservedTaskParamValidValue = "_validvalue"_sr; // IMPORTANT: Task() must result in an EMPTY parameter set, so params should only // be set for non-default constructor arguments. To change this behavior look at all @@ -199,7 +199,7 @@ public: // many other new tasks get added so that the timed out tasks never get chances to re-run if (deterministicRandom()->random01() < CLIENT_KNOBS->TASKBUCKET_CHECK_TIMEOUT_CHANCE) { bool anyTimeouts = wait(requeueTimedOutTasks(tr, taskBucket)); - TEST(anyTimeouts); // Found a task that timed out + CODE_PROBE(anyTimeouts, "Found a task that timed out"); } state std::vector>> taskKeyFutures(CLIENT_KNOBS->TASKBUCKET_MAX_PRIORITY + 1); @@ -233,7 +233,7 @@ public: bool anyTimeouts = wait(requeueTimedOutTasks(tr, taskBucket)); // If there were timeouts, try to get a task since there should now be one in one of the available spaces. 
if (anyTimeouts) { - TEST(true); // Try to get one task from timeouts subspace + CODE_PROBE(true, "Try to get one task from timeouts subspace"); Reference task = wait(getOne(tr, taskBucket)); return task; } @@ -651,10 +651,7 @@ public: Reference task) { taskBucket->setOptions(tr); - Tuple t; - t.append(task->timeoutVersion); - t.append(task->key); - + Tuple t = Tuple::makeTuple(task->timeoutVersion, task->key); RangeResult values = wait(tr->getRange(taskBucket->timeouts.range(t), 1)); if (values.size() > 0) return false; @@ -707,7 +704,7 @@ public: wait(delay(CLIENT_KNOBS->TASKBUCKET_CHECK_ACTIVE_DELAY)); bool isActiveKey = wait(getActiveKey(tr, taskBucket, startingValue)); if (isActiveKey) { - TEST(true); // checkActive return true + CODE_PROBE(true, "checkActive return true"); return true; } break; @@ -717,7 +714,7 @@ public: } } - TEST(true); // checkActive return false + CODE_PROBE(true, "checkActive return false"); return false; } @@ -725,7 +722,7 @@ public: Reference taskBucket) { taskBucket->setOptions(tr); - Optional val = wait(tr->get(taskBucket->prefix.pack(LiteralStringRef("task_count")))); + Optional val = wait(tr->get(taskBucket->prefix.pack("task_count"_sr))); if (!val.present()) return 0; @@ -742,7 +739,7 @@ public: // Returns True if any tasks were affected. ACTOR static Future requeueTimedOutTasks(Reference tr, Reference taskBucket) { - TEST(true); // Looks for tasks that have timed out and returns them to be available tasks. + CODE_PROBE(true, "Looks for tasks that have timed out and returns them to be available tasks."); Version end = wait(tr->getReadVersion()); state KeyRange range( KeyRangeRef(taskBucket->timeouts.get(0).range().begin, taskBucket->timeouts.get(end).range().end)); @@ -849,12 +846,12 @@ public: // If we're updating the task params the clear the old space and write params to the new space if (updateParams) { - TEST(true); // Extended a task while updating parameters + CODE_PROBE(true, "Extended a task while updating parameters"); for (auto& p : task->params) { tr->set(newTimeoutSpace.pack(p.key), p.value); } } else { - TEST(true); // Extended a task without updating parameters + CODE_PROBE(true, "Extended a task without updating parameters"); // Otherwise, read and transplant the params from the old to new timeout spaces RangeResult params = wait(tr->getRange(oldTimeoutSpace.range(), CLIENT_KNOBS->TOO_MANY)); for (auto& kv : params) { @@ -876,10 +873,10 @@ TaskBucket::TaskBucket(const Subspace& subspace, : cc("TaskBucket"), dispatchSlotChecksStarted("DispatchSlotChecksStarted", cc), dispatchErrors("DispatchErrors", cc), dispatchDoTasks("DispatchDoTasks", cc), dispatchEmptyTasks("DispatchEmptyTasks", cc), dispatchSlotChecksComplete("DispatchSlotChecksComplete", cc), dbgid(deterministicRandom()->randomUniqueID()), - prefix(subspace), active(prefix.get(LiteralStringRef("ac"))), pauseKey(prefix.pack(LiteralStringRef("pause"))), - available(prefix.get(LiteralStringRef("av"))), available_prioritized(prefix.get(LiteralStringRef("avp"))), - timeouts(prefix.get(LiteralStringRef("to"))), timeout(CLIENT_KNOBS->TASKBUCKET_TIMEOUT_VERSIONS), - system_access(sysAccess), priority_batch(priorityBatch), lockAware(lockAware) {} + prefix(subspace), active(prefix.get("ac"_sr)), pauseKey(prefix.pack("pause"_sr)), available(prefix.get("av"_sr)), + available_prioritized(prefix.get("avp"_sr)), timeouts(prefix.get("to"_sr)), + timeout(CLIENT_KNOBS->TASKBUCKET_TIMEOUT_VERSIONS), system_access(sysAccess), priority_batch(priorityBatch), + lockAware(lockAware) {} 
TaskBucket::~TaskBucket() {} @@ -922,9 +919,7 @@ Key TaskBucket::addTask(Reference tr, Reference for (auto& param : task->params) tr->set(taskSpace.pack(param.key), param.value); - tr->atomicOp(prefix.pack(LiteralStringRef("task_count")), - LiteralStringRef("\x01\x00\x00\x00\x00\x00\x00\x00"), - MutationRef::AddValue); + tr->atomicOp(prefix.pack("task_count"_sr), "\x01\x00\x00\x00\x00\x00\x00\x00"_sr, MutationRef::AddValue); return key; } @@ -996,13 +991,9 @@ Future TaskBucket::isEmpty(Reference tr) { Future TaskBucket::finish(Reference tr, Reference task) { setOptions(tr); - Tuple t; - t.append(task->timeoutVersion); - t.append(task->key); + Tuple t = Tuple::makeTuple(task->timeoutVersion, task->key); - tr->atomicOp(prefix.pack(LiteralStringRef("task_count")), - LiteralStringRef("\xff\xff\xff\xff\xff\xff\xff\xff"), - MutationRef::AddValue); + tr->atomicOp(prefix.pack("task_count"_sr), "\xff\xff\xff\xff\xff\xff\xff\xff"_sr, MutationRef::AddValue); tr->clear(timeouts.range(t)); return Void(); @@ -1033,7 +1024,7 @@ Future TaskBucket::getTaskCount(Reference tr } Future TaskBucket::watchTaskCount(Reference tr) { - return tr->watch(prefix.pack(LiteralStringRef("task_count"))); + return tr->watch(prefix.pack("task_count"_sr)); } Future TaskBucket::debugPrintRange(Reference tr, Subspace subspace, Key msg) { @@ -1108,7 +1099,7 @@ public: Key key = StringRef(deterministicRandom()->randomUniqueID().toString()); taskFuture->addBlock(tr, key); auto task = makeReference(); - task->params[Task::reservedTaskParamKeyType] = LiteralStringRef("UnblockFuture"); + task->params[Task::reservedTaskParamKeyType] = "UnblockFuture"_sr; task->params[Task::reservedTaskParamKeyFuture] = taskFuture->key; task->params[Task::reservedTaskParamKeyBlockID] = key; onSetFutures.push_back(vectorFuture[i]->onSet(tr, taskBucket, task)); @@ -1138,10 +1129,10 @@ public: bool is_set = wait(isSet(tr, taskFuture)); if (is_set) { - TEST(true); // is_set == true + CODE_PROBE(true, "is_set == true"); wait(performAction(tr, taskBucket, taskFuture, task)); } else { - TEST(true); // is_set == false + CODE_PROBE(true, "is_set == false"); Subspace callbackSpace = taskFuture->callbacks.get(StringRef(deterministicRandom()->randomUniqueID().toString())); for (auto& v : task->params) { @@ -1222,7 +1213,7 @@ public: taskFuture->futureBucket->setOptions(tr); task->params[Task::reservedTaskParamKeyAddTask] = task->params[Task::reservedTaskParamKeyType]; - task->params[Task::reservedTaskParamKeyType] = LiteralStringRef("AddTask"); + task->params[Task::reservedTaskParamKeyType] = "AddTask"_sr; wait(onSet(tr, taskBucket, taskFuture, task)); return Void(); @@ -1287,14 +1278,14 @@ TaskFuture::TaskFuture(const Reference bucket, Key k) : futureBuck } prefix = futureBucket->prefix.get(key); - blocks = prefix.get(LiteralStringRef("bl")); - callbacks = prefix.get(LiteralStringRef("cb")); + blocks = prefix.get("bl"_sr); + callbacks = prefix.get("cb"_sr); } TaskFuture::~TaskFuture() {} void TaskFuture::addBlock(Reference tr, StringRef block_id) { - tr->set(blocks.pack(block_id), LiteralStringRef("")); + tr->set(blocks.pack(block_id), ""_sr); } Future TaskFuture::set(Reference tr, Reference taskBucket) { diff --git a/fdbclient/Tenant.cpp b/fdbclient/Tenant.cpp index 1071059114..2ad1989fd0 100644 --- a/fdbclient/Tenant.cpp +++ b/fdbclient/Tenant.cpp @@ -18,47 +18,213 @@ * limitations under the License. 
*/ +#include "fdbclient/NativeAPI.actor.h" #include "fdbclient/SystemData.h" #include "fdbclient/Tenant.h" +#include "fdbrpc/TenantInfo.h" +#include "flow/BooleanParam.h" +#include "flow/IRandom.h" +#include "libb64/decode.h" +#include "libb64/encode.h" +#include "flow/ApiVersion.h" #include "flow/UnitTest.h" +FDB_DEFINE_BOOLEAN_PARAM(EnforceValidTenantId); + Key TenantMapEntry::idToPrefix(int64_t id) { int64_t swapped = bigEndian64(id); - return StringRef(reinterpret_cast(&swapped), 8); + return StringRef(reinterpret_cast(&swapped), TENANT_PREFIX_SIZE); } -int64_t TenantMapEntry::prefixToId(KeyRef prefix) { - ASSERT(prefix.size() == 8); +int64_t TenantMapEntry::prefixToId(KeyRef prefix, EnforceValidTenantId enforceValidTenantId) { + ASSERT(prefix.size() == TENANT_PREFIX_SIZE); int64_t id = *reinterpret_cast(prefix.begin()); id = bigEndian64(id); - ASSERT(id >= 0); + if (enforceValidTenantId) { + ASSERT(id >= 0); + } else if (id < 0) { + return TenantInfo::INVALID_TENANT; + } return id; } -void TenantMapEntry::initPrefix(KeyRef subspace) { - ASSERT(id >= 0); - prefix = makeString(8 + subspace.size()); - uint8_t* data = mutateString(prefix); - if (subspace.size() > 0) { - memcpy(data, subspace.begin(), subspace.size()); +std::string TenantMapEntry::tenantStateToString(TenantState tenantState) { + switch (tenantState) { + case TenantState::REGISTERING: + return "registering"; + case TenantState::READY: + return "ready"; + case TenantState::REMOVING: + return "removing"; + case TenantState::UPDATING_CONFIGURATION: + return "updating configuration"; + case TenantState::RENAMING_FROM: + return "renaming from"; + case TenantState::RENAMING_TO: + return "renaming to"; + case TenantState::ERROR: + return "error"; + default: + UNREACHABLE(); } - int64_t swapped = bigEndian64(id); - memcpy(data + subspace.size(), &swapped, 8); } -TenantMapEntry::TenantMapEntry() : id(-1) {} -TenantMapEntry::TenantMapEntry(int64_t id, KeyRef subspace) : id(id) { - initPrefix(subspace); +TenantState TenantMapEntry::stringToTenantState(std::string stateStr) { + if (stateStr == "registering") { + return TenantState::REGISTERING; + } else if (stateStr == "ready") { + return TenantState::READY; + } else if (stateStr == "removing") { + return TenantState::REMOVING; + } else if (stateStr == "updating configuration") { + return TenantState::UPDATING_CONFIGURATION; + } else if (stateStr == "renaming from") { + return TenantState::RENAMING_FROM; + } else if (stateStr == "renaming to") { + return TenantState::RENAMING_TO; + } else if (stateStr == "error") { + return TenantState::ERROR; + } + + UNREACHABLE(); +} + +std::string TenantMapEntry::tenantLockStateToString(TenantLockState tenantState) { + switch (tenantState) { + case TenantLockState::UNLOCKED: + return "unlocked"; + case TenantLockState::READ_ONLY: + return "read only"; + case TenantLockState::LOCKED: + return "locked"; + default: + UNREACHABLE(); + } +} + +TenantLockState TenantMapEntry::stringToTenantLockState(std::string stateStr) { + if (stateStr == "unlocked") { + return TenantLockState::UNLOCKED; + } else if (stateStr == "read only") { + return TenantLockState::READ_ONLY; + } else if (stateStr == "locked") { + return TenantLockState::LOCKED; + } + + UNREACHABLE(); +} + +TenantMapEntry::TenantMapEntry() {} +TenantMapEntry::TenantMapEntry(int64_t id, TenantState tenantState, bool encrypted) + : tenantState(tenantState), encrypted(encrypted) { + setId(id); +} +TenantMapEntry::TenantMapEntry(int64_t id, + TenantState tenantState, + Optional tenantGroup, + bool 
encrypted) + : tenantState(tenantState), tenantGroup(tenantGroup), encrypted(encrypted) { + setId(id); +} + +void TenantMapEntry::setId(int64_t id) { + ASSERT(id >= 0); + this->id = id; + prefix = idToPrefix(id); +} + +std::string TenantMapEntry::toJson() const { + json_spirit::mObject tenantEntry; + tenantEntry["id"] = id; + tenantEntry["encrypted"] = encrypted; + + json_spirit::mObject prefixObject; + std::string encodedPrefix = base64::encoder::from_string(prefix.toString()); + // Remove trailing newline + encodedPrefix.resize(encodedPrefix.size() - 1); + + prefixObject["base64"] = encodedPrefix; + prefixObject["printable"] = printable(prefix); + tenantEntry["prefix"] = prefixObject; + + tenantEntry["tenant_state"] = TenantMapEntry::tenantStateToString(tenantState); + if (assignedCluster.present()) { + tenantEntry["assigned_cluster"] = assignedCluster.get().toString(); + } + if (tenantGroup.present()) { + json_spirit::mObject tenantGroupObject; + std::string encodedTenantGroup = base64::encoder::from_string(tenantGroup.get().toString()); + // Remove trailing newline + encodedTenantGroup.resize(encodedTenantGroup.size() - 1); + + tenantGroupObject["base64"] = encodedTenantGroup; + tenantGroupObject["printable"] = printable(tenantGroup.get()); + tenantEntry["tenant_group"] = tenantGroupObject; + } + + return json_spirit::write_string(json_spirit::mValue(tenantEntry)); +} + +bool TenantMapEntry::matchesConfiguration(TenantMapEntry const& other) const { + return tenantGroup == other.tenantGroup && encrypted == other.encrypted; +} + +void TenantMapEntry::configure(Standalone parameter, Optional value) { + if (parameter == "tenant_group"_sr) { + tenantGroup = value; + } else if (parameter == "assigned_cluster"_sr) { + assignedCluster = value; + } else { + TraceEvent(SevWarnAlways, "UnknownTenantConfigurationParameter").detail("Parameter", parameter); + throw invalid_tenant_configuration(); + } +} + +json_spirit::mObject TenantGroupEntry::toJson() const { + json_spirit::mObject tenantGroupEntry; + if (assignedCluster.present()) { + tenantGroupEntry["assigned_cluster"] = assignedCluster.get().toString(); + } + + return tenantGroupEntry; +} + +TenantMetadataSpecification& TenantMetadata::instance() { + static TenantMetadataSpecification _instance = TenantMetadataSpecification("\xff/"_sr); + return _instance; +} + +Key TenantMetadata::tenantMapPrivatePrefix() { + static Key _prefix = "\xff"_sr.withSuffix(tenantMap().subspace.begin); + return _prefix; +} + +TEST_CASE("/fdbclient/libb64/base64decoder") { + Standalone buf = makeString(100); + for (int i = 0; i < 1000; ++i) { + int length = deterministicRandom()->randomInt(0, 100); + deterministicRandom()->randomBytes(mutateString(buf), length); + + StringRef str = buf.substr(0, length); + std::string encodedStr = base64::encoder::from_string(str.toString()); + // Remove trailing newline + encodedStr.resize(encodedStr.size() - 1); + + std::string decodedStr = base64::decoder::from_string(encodedStr); + ASSERT(decodedStr == str.toString()); + } + + return Void(); } TEST_CASE("/fdbclient/TenantMapEntry/Serialization") { - TenantMapEntry entry1(1, ""_sr); + TenantMapEntry entry1(1, TenantState::READY, false); ASSERT(entry1.prefix == "\x00\x00\x00\x00\x00\x00\x00\x01"_sr); TenantMapEntry entry2 = TenantMapEntry::decode(entry1.encode()); ASSERT(entry1.id == entry2.id && entry1.prefix == entry2.prefix); - TenantMapEntry entry3(std::numeric_limits::max(), "foo"_sr); - ASSERT(entry3.prefix == "foo\x7f\xff\xff\xff\xff\xff\xff\xff"_sr); + TenantMapEntry 
entry3(std::numeric_limits::max(), TenantState::READY, false); + ASSERT(entry3.prefix == "\x7f\xff\xff\xff\xff\xff\xff\xff"_sr); TenantMapEntry entry4 = TenantMapEntry::decode(entry3.encode()); ASSERT(entry3.id == entry4.id && entry3.prefix == entry4.prefix); @@ -68,15 +234,9 @@ TEST_CASE("/fdbclient/TenantMapEntry/Serialization") { int64_t maxPlusOne = std::min(UINT64_C(1) << bits, std::numeric_limits::max()); int64_t id = deterministicRandom()->randomInt64(min, maxPlusOne); - int subspaceLength = deterministicRandom()->randomInt(0, 20); - Standalone subspace = makeString(subspaceLength); - generateRandomData(mutateString(subspace), subspaceLength); - - TenantMapEntry entry(id, subspace); + TenantMapEntry entry(id, TenantState::READY, false); int64_t bigEndianId = bigEndian64(id); - ASSERT(entry.id == id && entry.prefix.startsWith(subspace) && - entry.prefix.endsWith(StringRef(reinterpret_cast(&bigEndianId), 8)) && - entry.prefix.size() == subspaceLength + 8); + ASSERT(entry.id == id && entry.prefix == StringRef(reinterpret_cast(&bigEndianId), 8)); TenantMapEntry decodedEntry = TenantMapEntry::decode(entry.encode()); ASSERT(decodedEntry.id == entry.id && decodedEntry.prefix == entry.prefix); diff --git a/fdbclient/TenantManagement.actor.cpp b/fdbclient/TenantManagement.actor.cpp new file mode 100644 index 0000000000..608da5c690 --- /dev/null +++ b/fdbclient/TenantManagement.actor.cpp @@ -0,0 +1,40 @@ +/* + * TenantManagement.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "fdbclient/SystemData.h" +#include "fdbclient/TenantManagement.actor.h" +#include "fdbclient/Tuple.h" +#include "flow/actorcompiler.h" // has to be last include + +namespace TenantAPI { + +TenantMode tenantModeForClusterType(ClusterType clusterType, TenantMode tenantMode) { + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + return TenantMode::DISABLED; + } else if (clusterType == ClusterType::METACLUSTER_DATA) { + return TenantMode::REQUIRED; + } else { + return tenantMode; + } +} + +} // namespace TenantAPI diff --git a/fdbclient/ThreadSafeTransaction.cpp b/fdbclient/ThreadSafeTransaction.cpp index 1dc5357572..eec04e46e3 100644 --- a/fdbclient/ThreadSafeTransaction.cpp +++ b/fdbclient/ThreadSafeTransaction.cpp @@ -21,6 +21,7 @@ #include "fdbclient/BlobGranuleFiles.h" #include "fdbclient/ClusterConnectionFile.h" #include "fdbclient/ClusterConnectionMemoryRecord.h" +#include "fdbclient/CoordinationInterface.h" #include "fdbclient/ThreadSafeTransaction.h" #include "fdbclient/DatabaseContext.h" #include "fdbclient/versions.h" @@ -54,7 +55,7 @@ Reference ThreadSafeDatabase::openTenant(TenantNameRef tenantName) { } Reference ThreadSafeDatabase::createTransaction() { - auto type = isConfigDB ? ISingleThreadTransaction::Type::SIMPLE_CONFIG : ISingleThreadTransaction::Type::RYW; + auto type = isConfigDB ? 
ISingleThreadTransaction::Type::PAXOS_CONFIG : ISingleThreadTransaction::Type::RYW; return Reference(new ThreadSafeTransaction(db, type, Optional())); } @@ -87,6 +88,7 @@ ThreadFuture ThreadSafeDatabase::rebootWorker(const StringRef& address, DatabaseContext* db = this->db; Key addressKey = address; return onMainThread([db, addressKey, check, duration]() -> Future { + db->checkDeferredError(); return db->rebootWorker(addressKey, check, duration); }); } @@ -94,14 +96,20 @@ ThreadFuture ThreadSafeDatabase::rebootWorker(const StringRef& address, ThreadFuture ThreadSafeDatabase::forceRecoveryWithDataLoss(const StringRef& dcid) { DatabaseContext* db = this->db; Key dcidKey = dcid; - return onMainThread([db, dcidKey]() -> Future { return db->forceRecoveryWithDataLoss(dcidKey); }); + return onMainThread([db, dcidKey]() -> Future { + db->checkDeferredError(); + return db->forceRecoveryWithDataLoss(dcidKey); + }); } ThreadFuture ThreadSafeDatabase::createSnapshot(const StringRef& uid, const StringRef& snapshot_command) { DatabaseContext* db = this->db; Key snapUID = uid; Key cmd = snapshot_command; - return onMainThread([db, snapUID, cmd]() -> Future { return db->createSnapshot(snapUID, cmd); }); + return onMainThread([db, snapUID, cmd]() -> Future { + db->checkDeferredError(); + return db->createSnapshot(snapUID, cmd); + }); } ThreadFuture ThreadSafeDatabase::createSharedState() { @@ -125,14 +133,17 @@ double ThreadSafeDatabase::getMainThreadBusyness() { // Note: this will never return if the server is running a protocol from FDB 5.0 or older ThreadFuture ThreadSafeDatabase::getServerProtocol(Optional expectedVersion) { DatabaseContext* db = this->db; - return onMainThread( - [db, expectedVersion]() -> Future { return db->getClusterProtocol(expectedVersion); }); + return onMainThread([db, expectedVersion]() -> Future { + db->checkDeferredError(); + return db->getClusterProtocol(expectedVersion); + }); } ThreadFuture ThreadSafeDatabase::purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) { DatabaseContext* db = this->db; KeyRange range = keyRange; return onMainThread([db, range, purgeVersion, force]() -> Future { + db->checkDeferredError(); return db->purgeBlobGranules(range, purgeVersion, {}, force); }); } @@ -140,16 +151,64 @@ ThreadFuture ThreadSafeDatabase::purgeBlobGranules(const KeyRangeRef& keyRa ThreadFuture ThreadSafeDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) { DatabaseContext* db = this->db; Key key = purgeKey; - return onMainThread([db, key]() -> Future { return db->waitPurgeGranulesComplete(key); }); + return onMainThread([db, key]() -> Future { + db->checkDeferredError(); + return db->waitPurgeGranulesComplete(key); + }); } -ThreadSafeDatabase::ThreadSafeDatabase(Reference connectionRecord, int apiVersion) { +ThreadFuture ThreadSafeDatabase::blobbifyRange(const KeyRangeRef& keyRange) { + DatabaseContext* db = this->db; + KeyRange range = keyRange; + return onMainThread([=]() -> Future { + db->checkDeferredError(); + return db->blobbifyRange(range); + }); +} + +ThreadFuture ThreadSafeDatabase::unblobbifyRange(const KeyRangeRef& keyRange) { + DatabaseContext* db = this->db; + KeyRange range = keyRange; + return onMainThread([=]() -> Future { + db->checkDeferredError(); + return db->unblobbifyRange(range); + }); +} + +ThreadFuture>> ThreadSafeDatabase::listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) { + DatabaseContext* db = this->db; + KeyRange range = keyRange; + return onMainThread([=]() -> Future>> { + 
db->checkDeferredError(); + return db->listBlobbifiedRanges(range, rangeLimit); + }); +} + +ThreadFuture ThreadSafeDatabase::verifyBlobRange(const KeyRangeRef& keyRange, Optional version) { + DatabaseContext* db = this->db; + KeyRange range = keyRange; + return onMainThread([=]() -> Future { + db->checkDeferredError(); + return db->verifyBlobRange(range, version); + }); +} + +ThreadSafeDatabase::ThreadSafeDatabase(ConnectionRecordType connectionRecordType, + std::string connectionRecordString, + int apiVersion) { // Allocate memory for the Database from this thread (so the pointer is known for subsequent method calls) // but run its constructor on the main thread DatabaseContext* db = this->db = DatabaseContext::allocateOnForeignThread(); - onMainThreadVoid([db, connectionRecord, apiVersion]() { + onMainThreadVoid([db, connectionRecordType, connectionRecordString, apiVersion]() { try { + Reference connectionRecord = + connectionRecordType == ConnectionRecordType::FILE + ? Reference(ClusterConnectionFile::openOrDefault(connectionRecordString)) + : Reference( + new ClusterConnectionMemoryRecord(ClusterConnectionString(connectionRecordString))); + Database::createDatabase(connectionRecord, apiVersion, IsInternal::False, LocalityData(), db).extractPtr(); } catch (Error& e) { new (db) DatabaseContext(e); @@ -165,7 +224,7 @@ ThreadSafeDatabase::~ThreadSafeDatabase() { } Reference ThreadSafeTenant::createTransaction() { - auto type = db->isConfigDB ? ISingleThreadTransaction::Type::SIMPLE_CONFIG : ISingleThreadTransaction::Type::RYW; + auto type = db->isConfigDB ? ISingleThreadTransaction::Type::PAXOS_CONFIG : ISingleThreadTransaction::Type::RYW; return Reference(new ThreadSafeTransaction(db->db, type, name)); } @@ -181,7 +240,10 @@ ThreadFuture ThreadSafeTenant::purgeBlobGranules(const KeyRangeRef& keyRang ThreadFuture ThreadSafeTenant::waitPurgeGranulesComplete(const KeyRef& purgeKey) { DatabaseContext* db = this->db->db; Key key = purgeKey; - return onMainThread([db, key]() -> Future { return db->waitPurgeGranulesComplete(key); }); + return onMainThread([db, key]() -> Future { + db->checkDeferredError(); + return db->waitPurgeGranulesComplete(key); + }); } ThreadSafeTenant::~ThreadSafeTenant() {} @@ -350,13 +412,14 @@ ThreadFuture>> ThreadSafeTransaction::getAddre } ThreadFuture>> ThreadSafeTransaction::getBlobGranuleRanges( - const KeyRangeRef& keyRange) { + const KeyRangeRef& keyRange, + int rangeLimit) { ISingleThreadTransaction* tr = this->tr; KeyRange r = keyRange; - return onMainThread([tr, r]() -> Future>> { + return onMainThread([=]() -> Future>> { tr->checkDeferredError(); - return tr->getBlobGranuleRanges(r); + return tr->getBlobGranuleRanges(r, rangeLimit); }); } @@ -364,34 +427,53 @@ ThreadResult ThreadSafeTransaction::readBlobGranules(const KeyRange Version beginVersion, Optional readVersion, ReadBlobGranuleContext granule_context) { - // FIXME: prevent from calling this from another main thread! 
+ // This should not be called directly, bypassMultiversionApi should not be set + return ThreadResult(unsupported_operation()); +} +ThreadFuture>> ThreadSafeTransaction::readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) { ISingleThreadTransaction* tr = this->tr; KeyRange r = keyRange; - int64_t readVersionOut; - ThreadFuture>> getFilesFuture = onMainThread( - [tr, r, beginVersion, readVersion, &readVersionOut]() -> Future>> { + return onMainThread( + [tr, r, beginVersion, readVersion, readVersionOut]() -> Future>> { tr->checkDeferredError(); - return tr->readBlobGranules(r, beginVersion, readVersion, &readVersionOut); + return tr->readBlobGranules(r, beginVersion, readVersion, readVersionOut); }); +} - // FIXME: can this safely avoid another main thread jump? - getFilesFuture.blockUntilReadyCheckOnMainThread(); - - // propagate error to client - if (getFilesFuture.isError()) { - return ThreadResult(getFilesFuture.getError()); - } - - Standalone> files = getFilesFuture.get(); - +ThreadResult ThreadSafeTransaction::readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) { // do this work off of fdb network threads for performance! - if (granule_context.debugNoMaterialize) { - return ThreadResult(blob_granule_not_materialized()); - } else { - return loadAndMaterializeBlobGranules(files, keyRange, beginVersion, readVersionOut, granule_context); + Standalone> files = startFuture.get(); + GranuleMaterializeStats stats; + auto ret = loadAndMaterializeBlobGranules(files, keyRange, beginVersion, readVersion, granuleContext, stats); + if (!ret.isError()) { + ISingleThreadTransaction* tr = this->tr; + onMainThreadVoid([tr, stats]() { tr->addGranuleMaterializeStats(stats); }); } + return ret; +} + +ThreadFuture>> ThreadSafeTransaction::summarizeBlobGranules( + const KeyRangeRef& keyRange, + Optional summaryVersion, + int rangeLimit) { + ISingleThreadTransaction* tr = this->tr; + KeyRange r = keyRange; + + return onMainThread([=]() -> Future>> { + tr->checkDeferredError(); + return tr->summarizeBlobGranules(r, summaryVersion, rangeLimit); + }); } void ThreadSafeTransaction::addReadConflictRange(const KeyRangeRef& keys) { @@ -489,22 +571,34 @@ Version ThreadSafeTransaction::getCommittedVersion() { ThreadFuture ThreadSafeTransaction::getVersionVector() { ISingleThreadTransaction* tr = this->tr; - return onMainThread([tr]() -> Future { return tr->getVersionVector(); }); + return onMainThread([tr]() -> Future { + tr->checkDeferredError(); + return tr->getVersionVector(); + }); } ThreadFuture ThreadSafeTransaction::getSpanContext() { ISingleThreadTransaction* tr = this->tr; - return onMainThread([tr]() -> Future { return tr->getSpanContext(); }); + return onMainThread([tr]() -> Future { + tr->checkDeferredError(); + return tr->getSpanContext(); + }); } ThreadFuture ThreadSafeTransaction::getApproximateSize() { ISingleThreadTransaction* tr = this->tr; - return onMainThread([tr]() -> Future { return tr->getApproximateSize(); }); + return onMainThread([tr]() -> Future { + tr->checkDeferredError(); + return tr->getApproximateSize(); + }); } ThreadFuture> ThreadSafeTransaction::getVersionstamp() { ISingleThreadTransaction* tr = this->tr; - return onMainThread([tr]() -> Future> { return tr->getVersionstamp(); }); + return onMainThread([tr]() -> Future> { + tr->checkDeferredError(); + return tr->getVersionstamp(); + }); 
} void ThreadSafeTransaction::setOption(FDBTransactionOptions::Option option, Optional value) { @@ -563,19 +657,25 @@ void ThreadSafeTransaction::reset() { extern const char* getSourceVersion(); -ThreadSafeApi::ThreadSafeApi() - : apiVersion(-1), clientVersion(format("%s,%s,%llx", FDB_VT_VERSION, getSourceVersion(), currentProtocolVersion)), - transportId(0) {} +ThreadSafeApi::ThreadSafeApi() : apiVersion(-1), transportId(0) {} void ThreadSafeApi::selectApiVersion(int apiVersion) { - this->apiVersion = apiVersion; + this->apiVersion = ApiVersion(apiVersion); } const char* ThreadSafeApi::getClientVersion() { - // There is only one copy of the ThreadSafeAPI, and it never gets deleted. Also, clientVersion is never modified. + // There is only one copy of the ThreadSafeAPI, and it never gets deleted. + // Also, clientVersion is initialized on demand and never modified afterwards. + if (clientVersion.empty()) { + clientVersion = format("%s,%s,%llx", FDB_VT_VERSION, getSourceVersion(), currentProtocolVersion()); + } return clientVersion.c_str(); } +void ThreadSafeApi::useFutureProtocolVersion() { + ::useFutureProtocolVersion(); +} + void ThreadSafeApi::setNetworkOption(FDBNetworkOptions::Option option, Optional value) { if (option == FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID) { if (value.present()) { @@ -632,12 +732,12 @@ void ThreadSafeApi::stopNetwork() { Reference ThreadSafeApi::createDatabase(const char* clusterFilePath) { return Reference( - new ThreadSafeDatabase(ClusterConnectionFile::openOrDefault(clusterFilePath), apiVersion)); + new ThreadSafeDatabase(ThreadSafeDatabase::ConnectionRecordType::FILE, clusterFilePath, apiVersion.version())); } Reference ThreadSafeApi::createDatabaseFromConnectionString(const char* connectionString) { return Reference(new ThreadSafeDatabase( - makeReference(ClusterConnectionString(connectionString)), apiVersion)); + ThreadSafeDatabase::ConnectionRecordType::CONNECTION_STRING, connectionString, apiVersion.version())); } void ThreadSafeApi::addNetworkThreadCompletionHook(void (*hook)(void*), void* hookParameter) { diff --git a/fdbclient/Tracing.actor.cpp b/fdbclient/Tracing.actor.cpp index 815e568acb..3100b208fc 100644 --- a/fdbclient/Tracing.actor.cpp +++ b/fdbclient/Tracing.actor.cpp @@ -497,7 +497,7 @@ TEST_CASE("/flow/Tracing/AddEvents") { auto arena = span1.arena; SmallVectorRef attrs; attrs.push_back(arena, KeyValueRef("foo"_sr, "bar"_sr)); - span1.addEvent(LiteralStringRef("read_version"), 1.0, attrs); + span1.addEvent("read_version"_sr, 1.0, attrs); ASSERT(span1.events[0].name.toString() == "read_version"); ASSERT(span1.events[0].time == 1.0); ASSERT(span1.events[0].attributes.begin()->key.toString() == "foo"); @@ -505,7 +505,7 @@ TEST_CASE("/flow/Tracing/AddEvents") { // Use helper method to add an OTELEventRef with no attributes to an OTELSpan Span span2("span_with_event"_loc); - span2.addEvent(StringRef(span2.arena, LiteralStringRef("commit_succeed")), 1234567.100); + span2.addEvent(StringRef(span2.arena, "commit_succeed"_sr), 1234567.100); ASSERT(span2.events[0].name.toString() == "commit_succeed"); ASSERT(span2.events[0].time == 1234567.100); ASSERT(span2.events[0].attributes.size() == 0); @@ -537,8 +537,8 @@ TEST_CASE("/flow/Tracing/AddAttributes") { IKnobCollection::getMutableGlobalKnobCollection().setKnob("tracing_span_attributes_enabled", KnobValueRef::create(bool{ true })); auto arena = span1.arena; - span1.addAttribute(StringRef(arena, LiteralStringRef("foo")), StringRef(arena, LiteralStringRef("bar"))); - 
span1.addAttribute(StringRef(arena, LiteralStringRef("operation")), StringRef(arena, LiteralStringRef("grv"))); + span1.addAttribute(StringRef(arena, "foo"_sr), StringRef(arena, "bar"_sr)); + span1.addAttribute(StringRef(arena, "operation"_sr), StringRef(arena, "grv"_sr)); ASSERT_EQ(span1.attributes.size(), 3); // Includes default attribute of "address" ASSERT(span1.attributes[1] == KeyValueRef("foo"_sr, "bar"_sr)); ASSERT(span1.attributes[2] == KeyValueRef("operation"_sr, "grv"_sr)); @@ -548,9 +548,9 @@ TEST_CASE("/flow/Tracing/AddAttributes") { deterministicRandom()->randomUInt64(), TraceFlags::sampled)); auto s2Arena = span2.arena; - span2.addAttribute(StringRef(s2Arena, LiteralStringRef("a")), StringRef(s2Arena, LiteralStringRef("1"))) - .addAttribute(StringRef(s2Arena, LiteralStringRef("b")), LiteralStringRef("2")) - .addAttribute(StringRef(s2Arena, LiteralStringRef("c")), LiteralStringRef("3")); + span2.addAttribute(StringRef(s2Arena, "a"_sr), StringRef(s2Arena, "1"_sr)) + .addAttribute(StringRef(s2Arena, "b"_sr), "2"_sr) + .addAttribute(StringRef(s2Arena, "c"_sr), "3"_sr); ASSERT_EQ(span2.attributes.size(), 4); // Includes default attribute of "address" ASSERT(span2.attributes[1] == KeyValueRef("a"_sr, "1"_sr)); @@ -718,7 +718,7 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") { attrs.push_back(s3Arena, KeyValueRef("foo"_sr, "bar"_sr)); span3.addAttribute("operation"_sr, "grv"_sr) .addLink(SpanContext(UID(300, 301), 400, TraceFlags::sampled)) - .addEvent(StringRef(s3Arena, LiteralStringRef("event1")), 100.101, attrs); + .addEvent(StringRef(s3Arena, "event1"_sr), 100.101, attrs); tracer.serialize_span(span3, request); data = request.buffer.get(); ASSERT(data[0] == 0b10011100); // 12 element array. diff --git a/fdbclient/Tuple.cpp b/fdbclient/Tuple.cpp index 729c910252..212223abd2 100644 --- a/fdbclient/Tuple.cpp +++ b/fdbclient/Tuple.cpp @@ -19,8 +19,11 @@ */ #include "fdbclient/Tuple.h" +#include "flow/UnitTest.h" const uint8_t VERSIONSTAMP_96_CODE = 0x33; +const uint8_t USER_TYPE_START = 0x40; +const uint8_t USER_TYPE_END = 0x4f; // TODO: Many functions copied from bindings/flow/Tuple.cpp. Merge at some point. static float bigEndianFloat(float orig) { @@ -58,7 +61,7 @@ static void adjustFloatingPoint(uint8_t* bytes, size_t size, bool encode) { } } -Tuple::Tuple(StringRef const& str, bool exclude_incomplete) { +Tuple::Tuple(StringRef const& str, bool exclude_incomplete, bool include_user_type) { data.append(data.arena(), str.begin(), str.size()); size_t i = 0; @@ -79,6 +82,9 @@ Tuple::Tuple(StringRef const& str, bool exclude_incomplete) { i += 1; } else if (data[i] == VERSIONSTAMP_96_CODE) { i += VERSIONSTAMP_TUPLE_SIZE + 1; + } else if (include_user_type && isUserType(data[i])) { + // User defined codes must come at the end of a Tuple and are not delimited. 
+ i = data.size(); } else { throw invalid_tuple_data_type(); } @@ -93,6 +99,14 @@ Tuple Tuple::unpack(StringRef const& str, bool exclude_incomplete) { return Tuple(str, exclude_incomplete); } +Tuple Tuple::unpackUserType(StringRef const& str, bool exclude_incomplete) { + return Tuple(str, exclude_incomplete, true); +} + +bool Tuple::isUserType(uint8_t code) const { + return code >= USER_TYPE_START && code <= USER_TYPE_END; +} + Tuple& Tuple::append(Tuple const& tuple) { for (size_t offset : tuple.offsets) { offsets.push_back(offset + data.size()); @@ -103,7 +117,7 @@ Tuple& Tuple::append(Tuple const& tuple) { return *this; } -Tuple& Tuple::appendVersionstamp(Versionstamp const& vs) { +Tuple& Tuple::append(Versionstamp const& vs) { offsets.push_back(data.size()); data.push_back(data.arena(), VERSIONSTAMP_96_CODE); @@ -134,6 +148,10 @@ Tuple& Tuple::append(StringRef const& str, bool utf8) { return *this; } +Tuple& Tuple::append(UnicodeStr const& str) { + return append(str.str, true); +} + Tuple& Tuple::appendRaw(StringRef const& str) { offsets.push_back(data.size()); @@ -166,7 +184,11 @@ Tuple& Tuple::append(int64_t value) { return *this; } -Tuple& Tuple::appendBool(bool value) { +Tuple& Tuple::append(int32_t value) { + return append((int64_t)value); +} + +Tuple& Tuple::append(bool value) { offsets.push_back(data.size()); if (value) { data.push_back(data.arena(), 0x27); @@ -176,7 +198,7 @@ Tuple& Tuple::appendBool(bool value) { return *this; } -Tuple& Tuple::appendFloat(float value) { +Tuple& Tuple::append(float value) { offsets.push_back(data.size()); float swap = bigEndianFloat(value); uint8_t* bytes = (uint8_t*)&swap; @@ -187,7 +209,7 @@ Tuple& Tuple::appendFloat(float value) { return *this; } -Tuple& Tuple::appendDouble(double value) { +Tuple& Tuple::append(double value) { offsets.push_back(data.size()); double swap = value; swap = bigEndianDouble(swap); @@ -199,12 +221,25 @@ Tuple& Tuple::appendDouble(double value) { return *this; } -Tuple& Tuple::appendNull() { +Tuple& Tuple::append(std::nullptr_t) { offsets.push_back(data.size()); data.push_back(data.arena(), (uint8_t)'\x00'); return *this; } +Tuple& Tuple::appendNull() { + return append(nullptr); +} + +Tuple& Tuple::append(Tuple::UserTypeStr const& udt) { + offsets.push_back(data.size()); + ASSERT(isUserType(udt.code)); + data.push_back(data.arena(), udt.code); + data.append(data.arena(), udt.str.begin(), udt.str.size()); + + return *this; +} + Tuple::ElementType Tuple::getType(size_t index) const { if (index >= offsets.size()) { throw invalid_tuple_index(); @@ -228,6 +263,8 @@ Tuple::ElementType Tuple::getType(size_t index) const { return ElementType::BOOL; } else if (code == VERSIONSTAMP_96_CODE) { return ElementType::VERSIONSTAMP; + } else if (isUserType(code)) { + return ElementType::USER_TYPE; } else { throw invalid_tuple_data_type(); } @@ -388,6 +425,29 @@ Versionstamp Tuple::getVersionstamp(size_t index) const { return Versionstamp(StringRef(data.begin() + offsets[index] + 1, VERSIONSTAMP_TUPLE_SIZE)); } +Tuple::UserTypeStr Tuple::getUserType(size_t index) const { + // Valid index. + if (index >= offsets.size()) { + throw invalid_tuple_index(); + } + + // Valid user type code. 
+ ASSERT_LT(offsets[index], data.size()); + uint8_t code = data[offsets[index]]; + if (!isUserType(code)) { + throw invalid_tuple_data_type(); + } + + size_t start = offsets[index] + 1; + + Standalone str; + VectorRef staging; + staging.append(str.arena(), data.begin() + start, data.size() - start); + str.StringRef::operator=(StringRef(staging.begin(), staging.size())); + + return Tuple::UserTypeStr(code, str); +} + KeyRange Tuple::range(Tuple const& tuple) const { VectorRef begin; VectorRef end; @@ -426,3 +486,76 @@ StringRef Tuple::subTupleRawString(size_t index) const { size_t endPos = end < offsets.size() ? offsets[end] : data.size(); return StringRef(data.begin() + offsets[index], endPos - offsets[index]); } + +TEST_CASE("/fdbclient/Tuple/makeTuple") { + Tuple t1 = Tuple::makeTuple(1, + 1.0f, + 1.0, + false, + "byteStr"_sr, + Tuple::UnicodeStr("str"_sr), + nullptr, + Versionstamp("000000000000"_sr), + Tuple::UserTypeStr(0x41, "12345678"_sr)); + Tuple t2 = Tuple() + .append(1) + .append(1.0f) + .append(1.0) + .append(false) + .append("byteStr"_sr) + .append(Tuple::UnicodeStr("str"_sr)) + .append(nullptr) + .append(Versionstamp("000000000000"_sr)) + .append(Tuple::UserTypeStr(0x41, "12345678"_sr)); + + ASSERT(t1.pack() == t2.pack()); + ASSERT(t1.getType(0) == Tuple::INT); + ASSERT(t1.getType(1) == Tuple::FLOAT); + ASSERT(t1.getType(2) == Tuple::DOUBLE); + ASSERT(t1.getType(3) == Tuple::BOOL); + ASSERT(t1.getType(4) == Tuple::BYTES); + ASSERT(t1.getType(5) == Tuple::UTF8); + ASSERT(t1.getType(6) == Tuple::NULL_TYPE); + ASSERT(t1.getType(7) == Tuple::VERSIONSTAMP); + ASSERT(t1.getType(8) == Tuple::USER_TYPE); + ASSERT(t1.size() == 9); + + return Void(); +} + +TEST_CASE("/fdbclient/Tuple/unpack") { + Tuple t1 = Tuple::makeTuple(1, + 1.0f, + 1.0, + false, + "byteStr"_sr, + Tuple::UnicodeStr("str"_sr), + nullptr, + Versionstamp("000000000000"_sr), + Tuple::UserTypeStr(0x41, "12345678"_sr)); + + Standalone packed = t1.pack(); + Tuple t2 = Tuple::unpackUserType(packed); + ASSERT(t2.pack() == t1.pack()); + ASSERT(t2.getInt(0) == t1.getInt(0)); + ASSERT(t2.getFloat(1) == t1.getFloat(1)); + ASSERT(t2.getDouble(2) == t1.getDouble(2)); + ASSERT(t2.getBool(3) == t1.getBool(3)); + ASSERT(t2.getString(4) == t1.getString(4)); + ASSERT(t2.getString(5) == t1.getString(5)); + ASSERT(t2.getType(6) == Tuple::NULL_TYPE); + ASSERT(t2.getVersionstamp(7) == t1.getVersionstamp(7)); + ASSERT(t2.getUserType(8) == t1.getUserType(8)); + ASSERT(t2.size() == 9); + + try { + Tuple t3 = Tuple::unpack(packed); + ASSERT(false); + } catch (Error& e) { + if (e.code() != error_code_invalid_tuple_data_type) { + throw e; + } + } + + return Void(); +} diff --git a/fdbclient/WriteMap.cpp b/fdbclient/WriteMap.cpp index dcb6ee9581..56aedd8e8c 100644 --- a/fdbclient/WriteMap.cpp +++ b/fdbclient/WriteMap.cpp @@ -567,7 +567,7 @@ void WriteMap::clearNoConflict(KeyRangeRef keys) { bool end_conflict = it.is_conflict_range(); bool end_unreadable = it.is_unreadable(); - TEST(it.is_conflict_range() != lastConflicted); // not last conflicted + CODE_PROBE(it.is_conflict_range() != lastConflicted, "not last conflicted"); it.tree.clear(); diff --git a/fdbclient/azure_backup/BackupContainerAzureBlobStore.actor.cpp b/fdbclient/azure_backup/BackupContainerAzureBlobStore.actor.cpp index 0c3ac8ba02..8bd8f94872 100644 --- a/fdbclient/azure_backup/BackupContainerAzureBlobStore.actor.cpp +++ b/fdbclient/azure_backup/BackupContainerAzureBlobStore.actor.cpp @@ -29,7 +29,7 @@ namespace { std::string const notFoundErrorCode = "404"; void 
printAzureError(std::string const& operationName, azure::storage_lite::storage_error const& err) { - printf("(%s) : Error from Azure SDK : %s (%s) : %s", + printf("(%s) : Error from Azure SDK : %s (%s) : %s\n", operationName.c_str(), err.code_name.c_str(), err.code.c_str(), @@ -109,9 +109,9 @@ public: class WriteFile final : public IAsyncFile, ReferenceCounted { AsyncTaskThread* asyncTaskThread; - std::shared_ptr client; std::string containerName; std::string blobName; + std::shared_ptr client; int64_t m_cursor{ 0 }; // Ideally this buffer should not be a string, but // the Azure SDK only supports/tests uploading to append @@ -318,7 +318,7 @@ BackupContainerAzureBlobStore::BackupContainerAzureBlobStore(const std::string& std::string accountKey = _accountKey; auto credential = std::make_shared(accountName, accountKey); auto storageAccount = std::make_shared( - accountName, credential, true, format("https://%s", endpoint.c_str())); + accountName, credential, true, fmt::format("https://{}", endpoint)); client = std::make_unique(storageAccount, 1); } @@ -333,12 +333,16 @@ Future BackupContainerAzureBlobStore::create() { TraceEvent(SevDebug, "BCAzureBlobStoreCreateContainer").detail("ContainerName", containerName); Future createContainerFuture = asyncTaskThread.execAsync([containerName = this->containerName, client = this->client] { - waitAzureFuture(client->create_container(containerName), "create_container"); + auto outcome = client->get_container_properties(containerName).get(); + if (!outcome.success()) { + waitAzureFuture(client->create_container(containerName), "create_container"); + } return Void(); }); Future encryptionSetupFuture = usesEncryption() ? encryptionSetupComplete() : Void(); return createContainerFuture && encryptionSetupFuture; } + Future BackupContainerAzureBlobStore::exists() { TraceEvent(SevDebug, "BCAzureBlobStoreCheckContainerExists").detail("ContainerName", containerName); return asyncTaskThread.execAsync([containerName = this->containerName, client = this->client] { diff --git a/fdbclient/azure_backup/README.md b/fdbclient/azure_backup/README.md new file mode 100644 index 0000000000..4a34683674 --- /dev/null +++ b/fdbclient/azure_backup/README.md @@ -0,0 +1,33 @@ +# Set up the Azure Backup Testing Environment + +Make sure FDB is built with `-DBUILD_AZURE_BACKUP=ON`. + +# Test + +If you run the _BackupToBlob_ and _RestoreFromBlob_ workloads with a _backupURL_ parameter that starts with `azure://`, +the workload will back up to and restore from Azure blob storage. +For example, see _BackupAzureBlobCorrectness.toml_. + +## URL format + +The code currently supports the following URL styles: + +- `azure://<account_name>.blob.core.windows.net/<container_name>` (The formal URL format for the blob service provided by the Azure storage account) +- `azure://<ip>:<port>/<account_name>/<container_name>` (Directly providing the endpoint address for the blob service, usually for local testing) + +## Local test environment + +We need to use _Azurite_ to simulate an Azure blob service locally. +Please follow the [tutorial](https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=docker-hub) to start your service locally.
+ +For example, +``` +docker run -p 10000:10000 -v `pwd`: -w mcr.microsoft.com/azure-storage/azurite azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --oauth basic --cert ./<...>.pem --key ./<...>.key.pem --debug ./ +``` + +### Notice + +- To use _https_, we need to provide the certificates via `--cert` and `--key`. + See the detailed [tutorial](https://github.com/Azure/Azurite/blob/main/README.md#https-setup) on setting up HTTPS. (We tested with the `mkcert` method.) +- To use Azure SDKs, we need to pass the `--oauth basic` option. +- Please take a look at the [differences](https://github.com/Azure/Azurite/blob/main/README.md#differences-between-azurite-and-azure-storage) between Azurite and Azure Storage. diff --git a/fdbclient/azurestorage.cmake b/fdbclient/azurestorage.cmake index 36f8e24f6e..b967824948 100644 --- a/fdbclient/azurestorage.cmake +++ b/fdbclient/azurestorage.cmake @@ -1,3 +1,5 @@ +cmake_minimum_required(VERSION 3.13) + project(azurestorage-download) include(ExternalProject) diff --git a/fdbclient/include/fdbclient/AsyncFileS3BlobStore.actor.h b/fdbclient/include/fdbclient/AsyncFileS3BlobStore.actor.h index 4244992666..3a17528358 100644 --- a/fdbclient/include/fdbclient/AsyncFileS3BlobStore.actor.h +++ b/fdbclient/include/fdbclient/AsyncFileS3BlobStore.actor.h @@ -36,8 +36,8 @@ #include "flow/Net2Packet.h" #include "flow/IRateControl.h" #include "fdbclient/S3BlobStore.h" -#include "fdbclient/md5/md5.h" -#include "fdbclient/libb64/encode.h" +#include "md5/md5.h" +#include "libb64/encode.h" #include "flow/actorcompiler.h" // This must be the last #include. ACTOR template diff --git a/fdbclient/include/fdbclient/Atomic.h b/fdbclient/include/fdbclient/Atomic.h index 6643bcdafc..61f948e38f 100644 --- a/fdbclient/include/fdbclient/Atomic.h +++ b/fdbclient/include/fdbclient/Atomic.h @@ -120,7 +120,7 @@ inline ValueRef doAppendIfFits(const Optional& existingValueOptional, if (!otherOperand.size()) return existingValue; if (existingValue.size() + otherOperand.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT) { - TEST(true) // AppendIfFIts resulted in truncation + CODE_PROBE(true, "AppendIfFits resulted in truncation"); return existingValue; } diff --git a/fdbclient/include/fdbclient/BackupAgent.actor.h b/fdbclient/include/fdbclient/BackupAgent.actor.h index 573c9c2c19..314f151fd0 100644 --- a/fdbclient/include/fdbclient/BackupAgent.actor.h +++ b/fdbclient/include/fdbclient/BackupAgent.actor.h @@ -143,7 +143,7 @@ public: futureBucket = std::move(r.futureBucket); } - KeyBackedProperty lastBackupTimestamp() { return config.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty lastBackupTimestamp() { return config.pack(__FUNCTION__sr); } Future run(Database cx, double pollDelay, int maxConcurrentTasks) { return taskBucket->run(cx, futureBucket, std::make_shared(pollDelay), maxConcurrentTasks); @@ -208,33 +208,14 @@ public: WaitForComplete waitForComplete = WaitForComplete::True, Version targetVersion = ::invalidVersion, Verbose verbose = Verbose::True, - KeyRange range = normalKeys, + KeyRange range = KeyRange(), Key addPrefix = Key(), Key removePrefix = Key(), LockDB lockDB = LockDB::True, OnlyApplyMutationLogs onlyApplyMutationLogs = OnlyApplyMutationLogs::False, InconsistentSnapshotOnly inconsistentSnapshotOnly = InconsistentSnapshotOnly::False, Version beginVersion = ::invalidVersion, - Optional const& encryptionKeyFileName = {}) { - Standalone> rangeRef; - rangeRef.push_back_deep(rangeRef.arena(), range); - return restore(cx, - cxOrig, - tagName, - url, - proxy, - rangeRef, - waitForComplete,
targetVersion, - verbose, - addPrefix, - removePrefix, - lockDB, - onlyApplyMutationLogs, - inconsistentSnapshotOnly, - beginVersion, - encryptionKeyFileName); - } + Optional const& encryptionKeyFileName = {}); Future atomicRestore(Database cx, Key tagName, Standalone> ranges, @@ -242,13 +223,10 @@ public: Key removePrefix = Key()); Future atomicRestore(Database cx, Key tagName, - KeyRange range = normalKeys, + KeyRange range = KeyRange(), Key addPrefix = Key(), - Key removePrefix = Key()) { - Standalone> rangeRef; - rangeRef.push_back_deep(rangeRef.arena(), range); - return atomicRestore(cx, tagName, rangeRef, addPrefix, removePrefix); - } + Key removePrefix = Key()); + // Tries to abort the restore for a tag. Returns the final (stable) state of the tag. Future abortRestore(Reference tr, Key tagName); Future abortRestore(Database cx, Key tagName); @@ -272,6 +250,7 @@ public: int snapshotIntervalSeconds, std::string const& tagName, Standalone> backupRanges, + bool encryptionEnabled, StopWhenDone = StopWhenDone::True, UsePartitionedLog = UsePartitionedLog::False, IncrementalBackupOnly = IncrementalBackupOnly::False, @@ -283,6 +262,7 @@ public: int snapshotIntervalSeconds, std::string const& tagName, Standalone> backupRanges, + bool encryptionEnabled, StopWhenDone stopWhenDone = StopWhenDone::True, UsePartitionedLog partitionedLog = UsePartitionedLog::False, IncrementalBackupOnly incrementalBackupOnly = IncrementalBackupOnly::False, @@ -295,6 +275,7 @@ public: snapshotIntervalSeconds, tagName, backupRanges, + encryptionEnabled, stopWhenDone, partitionedLog, incrementalBackupOnly, @@ -361,7 +342,7 @@ public: template <> inline Standalone TupleCodec::pack( FileBackupAgent::ERestoreState const& val) { - return Tuple().append(val).pack(); + return Tuple::makeTuple(static_cast(val)).pack(); } template <> inline FileBackupAgent::ERestoreState TupleCodec::unpack( @@ -578,7 +559,7 @@ ACTOR Future cleanupBackup(Database cx, DeleteData deleteData); using EBackupState = BackupAgentBase::EnumState; template <> inline Standalone TupleCodec::pack(EBackupState const& val) { - return Tuple().append(static_cast(val)).pack(); + return Tuple::makeTuple(static_cast(val)).pack(); } template <> inline EBackupState TupleCodec::unpack(Standalone const& val) { @@ -621,7 +602,7 @@ class TagUidMap : public KeyBackedMap { Snapshot snapshot); public: - TagUidMap(const StringRef& prefix) : TagMap(LiteralStringRef("tag->uid/").withPrefix(prefix)), prefix(prefix) {} + TagUidMap(const StringRef& prefix) : TagMap("tag->uid/"_sr.withPrefix(prefix)), prefix(prefix) {} Future> getAll(Reference tr, Snapshot snapshot = Snapshot::False) { @@ -652,11 +633,11 @@ static inline Future> getAllBackupTags(Reference uid() { return LiteralStringRef(__FUNCTION__); } + static TaskParam uid() { return __FUNCTION__sr; } } TaskParams; KeyBackedConfig(StringRef prefix, UID uid = UID()) - : uid(uid), prefix(prefix), configSpace(uidPrefixKey(LiteralStringRef("uid->config/").withPrefix(prefix), uid)) {} + : uid(uid), prefix(prefix), configSpace(uidPrefixKey("uid->config/"_sr.withPrefix(prefix), uid)) {} KeyBackedConfig(StringRef prefix, Reference task) : KeyBackedConfig(prefix, TaskParams.uid().get(task)) {} @@ -685,7 +666,7 @@ public: }); } - KeyBackedProperty tag() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty tag() { return configSpace.pack(__FUNCTION__sr); } UID getUid() { return uid; } @@ -694,12 +675,10 @@ public: void clear(Reference tr) { tr->clear(configSpace.range()); } // lastError is a pair of error 
message and timestamp expressed as an int64_t - KeyBackedProperty> lastError() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty> lastError() { return configSpace.pack(__FUNCTION__sr); } KeyBackedMap> lastErrorPerType() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); + return configSpace.pack(__FUNCTION__sr); } // Updates the error per type map and the last error property @@ -727,8 +706,7 @@ protected: template <> inline Standalone TupleCodec>::pack(Reference const& bc) { - Tuple tuple; - tuple.append(StringRef(bc->getURL())); + Tuple tuple = Tuple::makeTuple(bc->getURL()); if (bc->getEncryptionKeyFileName().present()) { tuple.append(bc->getEncryptionKeyFileName().get()); @@ -775,9 +753,7 @@ public: Version version; std::string fileName; int64_t fileSize; - Tuple pack() const { - return Tuple().append(begin).append(version).append(StringRef(fileName)).append(fileSize); - } + Tuple pack() const { return Tuple::makeTuple(begin, version, fileName, fileSize); } static RangeSlice unpack(Tuple const& t) { RangeSlice r; int i = 0; @@ -791,47 +767,41 @@ public: // Map of range end boundaries to info about the backup file written for that range. typedef KeyBackedMap RangeFileMapT; - RangeFileMapT snapshotRangeFileMap() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + RangeFileMapT snapshotRangeFileMap() { return configSpace.pack(__FUNCTION__sr); } // Number of kv range files that were both committed to persistent storage AND inserted into // the snapshotRangeFileMap. Note that since insertions could replace 1 or more existing // map entries this is not necessarily the number of entries currently in the map. // This value exists to help with sizing of kv range folders for BackupContainers that // require it. - KeyBackedBinaryValue snapshotRangeFileCount() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue snapshotRangeFileCount() { return configSpace.pack(__FUNCTION__sr); } // Coalesced set of ranges already dispatched for writing. typedef KeyBackedMap RangeDispatchMapT; - RangeDispatchMapT snapshotRangeDispatchMap() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + RangeDispatchMapT snapshotRangeDispatchMap() { return configSpace.pack(__FUNCTION__sr); } // Interval to use for the first (initial) snapshot. - KeyBackedProperty initialSnapshotIntervalSeconds() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty initialSnapshotIntervalSeconds() { return configSpace.pack(__FUNCTION__sr); } // Interval to use for determining the target end version for new snapshots - KeyBackedProperty snapshotIntervalSeconds() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty snapshotIntervalSeconds() { return configSpace.pack(__FUNCTION__sr); } // When the current snapshot began - KeyBackedProperty snapshotBeginVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty snapshotBeginVersion() { return configSpace.pack(__FUNCTION__sr); } // When the current snapshot is desired to end. 
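Several `TupleCodec` specializations and `pack()` helpers above switch from chaining `Tuple().append(...)` to a single `Tuple::makeTuple(...)` call. The toy builder below shows how a variadic `makeTuple` can be layered over `append()`; `ToyTuple` and its string-based packing are stand-ins for FDB's real `Tuple` encoding, not a reimplementation of it.

```
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Toy tuple builder: makeTuple() packs all elements in one call instead of
// chaining append().
struct ToyTuple {
    std::vector<std::string> parts;

    ToyTuple& append(const std::string& s) {
        parts.push_back(s);
        return *this;
    }
    ToyTuple& append(int64_t v) {
        parts.push_back(std::to_string(v));
        return *this;
    }

    template <class... Args>
    static ToyTuple makeTuple(Args&&... args) {
        ToyTuple t;
        (t.append(std::forward<Args>(args)), ...); // C++17 fold over append()
        return t;
    }

    std::string pack() const {
        std::string out;
        for (const auto& p : parts) {
            out += p;
            out.push_back('\x00');
        }
        return out;
    }
};

int main() {
    // The chained and the variadic forms produce the same packed value.
    std::string a = ToyTuple().append(int64_t{ 42 }).append("file.range").pack();
    std::string b = ToyTuple::makeTuple(int64_t{ 42 }, std::string("file.range")).pack();
    std::cout << (a == b ? "equal" : "different") << "\n";
}
```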
// This can be changed at runtime to speed up or slow down a snapshot - KeyBackedProperty snapshotTargetEndVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty snapshotTargetEndVersion() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty snapshotBatchSize() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty snapshotBatchSize() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty snapshotBatchFuture() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty snapshotBatchFuture() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty snapshotBatchDispatchDoneKey() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty snapshotBatchDispatchDoneKey() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty snapshotDispatchLastShardsBehind() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty snapshotDispatchLastShardsBehind() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty snapshotDispatchLastVersion() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty snapshotDispatchLastVersion() { return configSpace.pack(__FUNCTION__sr); } Future initNewSnapshot(Reference tr, int64_t intervalSeconds = -1) { BackupConfig& copy = *this; // Capture this by value instead of this ptr @@ -865,51 +835,50 @@ public: }); } - KeyBackedBinaryValue rangeBytesWritten() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue rangeBytesWritten() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedBinaryValue logBytesWritten() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedBinaryValue logBytesWritten() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty stateEnum() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty stateEnum() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty> backupContainer() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty> backupContainer() { return configSpace.pack(__FUNCTION__sr); } // Set to true when all backup workers for saving mutation logs have been started. - KeyBackedProperty allWorkerStarted() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty allWorkerStarted() { return configSpace.pack(__FUNCTION__sr); } // Each backup worker adds its (epoch, tag.id) to this property. KeyBackedProperty>> startedBackupWorkers() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); + return configSpace.pack(__FUNCTION__sr); } // Set to true if backup worker is enabled. - KeyBackedProperty backupWorkerEnabled() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty backupWorkerEnabled() { return configSpace.pack(__FUNCTION__sr); } // Set to true if partitioned log is enabled (only useful if backup worker is also enabled). - KeyBackedProperty partitionedLogEnabled() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty partitionedLogEnabled() { return configSpace.pack(__FUNCTION__sr); } // Set to true if only requesting incremental backup without base snapshot. - KeyBackedProperty incrementalBackupOnly() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty incrementalBackupOnly() { return configSpace.pack(__FUNCTION__sr); } // Latest version for which all prior versions have saved by backup workers. 
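The accessors above all follow the same idiom: a property's key suffix is derived from the enclosing accessor's name via `configSpace.pack(__FUNCTION__sr)`, so the accessor name and the stored key cannot drift apart. Below is a minimal stand-in that uses a `std::map` as the key space and plain `__FUNCTION__`; it is meant only to show the naming idiom, not FDB's actual key encoding or transaction machinery.

```
#include <cstdint>
#include <iostream>
#include <map>
#include <string>

std::map<std::string, std::string> kv; // pretend database subspace

template <class T>
struct Property {
    std::string key;
    void set(const T& v) { kv[key] = std::to_string(v); }
    T get() const { return static_cast<T>(std::stoll(kv.at(key))); }
};

struct ToyBackupConfig {
    std::string prefix = "uid->config/1234/";

    // Same idea as configSpace.pack(__FUNCTION__sr): the key suffix is the
    // accessor's own name.
    Property<int64_t> snapshotBeginVersion() { return { prefix + __FUNCTION__ }; }
    Property<int64_t> snapshotIntervalSeconds() { return { prefix + __FUNCTION__ }; }
};

int main() {
    ToyBackupConfig cfg;
    cfg.snapshotIntervalSeconds().set(86400);
    std::cout << cfg.snapshotIntervalSeconds().get() << "\n"; // 86400
    for (const auto& [k, v] : kv)
        std::cout << k << " => " << v << "\n";
}
```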
- KeyBackedProperty latestBackupWorkerSavedVersion() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty latestBackupWorkerSavedVersion() { return configSpace.pack(__FUNCTION__sr); } // Stop differntial logging if already started or don't start after completing KV ranges - KeyBackedProperty stopWhenDone() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty stopWhenDone() { return configSpace.pack(__FUNCTION__sr); } + + // Enable snapshot backup file encryption + KeyBackedProperty enableSnapshotBackupEncryption() { return configSpace.pack(__FUNCTION__sr); } // Latest version for which all prior versions have had their log copy tasks completed - KeyBackedProperty latestLogEndVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty latestLogEndVersion() { return configSpace.pack(__FUNCTION__sr); } // The end version of the last complete snapshot - KeyBackedProperty latestSnapshotEndVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty latestSnapshotEndVersion() { return configSpace.pack(__FUNCTION__sr); } // The end version of the first complete snapshot - KeyBackedProperty firstSnapshotEndVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty firstSnapshotEndVersion() { return configSpace.pack(__FUNCTION__sr); } - KeyBackedProperty destUidValue() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty destUidValue() { return configSpace.pack(__FUNCTION__sr); } Future> getLatestRestorableVersion(Reference tr) { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); @@ -940,7 +909,7 @@ public: }); } - KeyBackedProperty> backupRanges() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty> backupRanges() { return configSpace.pack(__FUNCTION__sr); } void startMutationLogs(Reference tr, KeyRangeRef backupRange, Key destUidValue) { Key mutationLogsDestKey = destUidValue.withPrefix(backupLogKeys.begin); @@ -1008,7 +977,8 @@ struct StringRefReader { namespace fileBackup { ACTOR Future>> decodeRangeFileBlock(Reference file, int64_t offset, - int len); + int len, + Optional cx); // Reads a mutation log block from file and parses into batch mutation blocks for further parsing. ACTOR Future>> decodeMutationLogFileBlock(Reference file, @@ -1030,5 +1000,41 @@ ACTOR Future transformRestoredDatabase(Database cx, void simulateBlobFailure(); +// Add the set of ranges that are backed up in a default backup to the given vector. This consists of all normal keys +// and the system backup ranges. +void addDefaultBackupRanges(Standalone>& backupKeys); + +// Return a vector containing the key ranges in system key-space that should be backed up in a default backup. +VectorRef const& getSystemBackupRanges(); + +// Return a key-range map that can be used to check whether a system key is a candidate backup key (i.e. whether it is +// part of any system backup ranges). +KeyRangeMap const& systemBackupMutationMask(); + +// Returns true if the given set of ranges exactly matches the set of ranges included in a default backup. 
+template +bool isDefaultBackup(Container ranges) { + std::unordered_set uniqueRanges(ranges.begin(), ranges.end()); + auto& systemBackupRanges = getSystemBackupRanges(); + + if (uniqueRanges.size() != systemBackupRanges.size() + 1) { + return false; + } + + if (!uniqueRanges.count(normalKeys)) { + return false; + } + for (auto range : getSystemBackupRanges()) { + if (!uniqueRanges.count(range)) { + return false; + } + } + + return true; +} + +// Returns a key-range used to denote that a shared mutation stream belongs to the default backup set. +KeyRangeRef const& getDefaultBackupSharedRange(); + #include "flow/unactorcompiler.h" #endif diff --git a/fdbclient/include/fdbclient/BackupContainer.h b/fdbclient/include/fdbclient/BackupContainer.h index 11d5c2ba27..95a1072a05 100644 --- a/fdbclient/include/fdbclient/BackupContainer.h +++ b/fdbclient/include/fdbclient/BackupContainer.h @@ -67,6 +67,9 @@ static const uint32_t PARTITIONED_MLOG_VERSION = 4110; // Snapshot file version written by FileBackupAgent static const uint32_t BACKUP_AGENT_SNAPSHOT_FILE_VERSION = 1001; +// Encrypted Snapshot file version written by FileBackupAgent +static const uint32_t BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION = 1002; + struct LogFile { Version beginVersion; Version endVersion; @@ -250,7 +253,7 @@ public: // Returns the key ranges in the snapshot file. This is an expensive function // and should only be used in simulation for sanity check. - virtual Future getSnapshotFileKeyRange(const RangeFile& file) = 0; + virtual Future getSnapshotFileKeyRange(const RangeFile& file, Optional cx) = 0; struct ExpireProgress { std::string step; @@ -289,6 +292,7 @@ public: // If logsOnly is set, only use log files in [beginVersion, targetVervions) in restore set. // Returns non-present if restoring to the given version is not possible. 
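`isDefaultBackup()` above reduces the requested ranges to a set and compares it against `normalKeys` plus the system backup ranges. The sketch below mirrors that logic with strings standing in for `KeyRangeRef` and made-up placeholder names for the system ranges; note that, because the comparison runs on the de-duplicated set, repeated ranges do not change the verdict.

```
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

// Conceptual analogue of isDefaultBackup(): the request must be exactly
// {normalKeys} plus every system backup range, order and duplicates ignored.
static const std::string normalKeys = "normalKeys";

static const std::vector<std::string>& getSystemBackupRanges() {
    static const std::vector<std::string> ranges = { "systemRangeA", "systemRangeB" };
    return ranges;
}

template <class Container>
bool isDefaultBackup(const Container& ranges) {
    std::unordered_set<std::string> unique(ranges.begin(), ranges.end());
    const auto& sys = getSystemBackupRanges();
    if (unique.size() != sys.size() + 1 || !unique.count(normalKeys))
        return false;
    for (const auto& r : sys)
        if (!unique.count(r))
            return false;
    return true;
}

int main() {
    std::vector<std::string> ranges = { "normalKeys", "systemRangeA", "systemRangeB", "systemRangeA" };
    std::cout << std::boolalpha << isDefaultBackup(ranges) << "\n"; // true: duplicates collapse away
    std::cout << isDefaultBackup(std::vector<std::string>{ "normalKeys" }) << "\n"; // false
}
```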
virtual Future> getRestoreSet(Version targetVersion, + Optional cx, VectorRef keyRangesFilter = {}, bool logsOnly = false, Version beginVersion = -1) = 0; diff --git a/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h b/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h index 77285ced16..ed79a56078 100644 --- a/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h +++ b/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h @@ -25,8 +25,6 @@ #include "fdbclient/AsyncTaskThread.h" #include "fdbclient/BackupContainerFileSystem.h" -#include "storage_credential.h" -#include "storage_account.h" #include "blob/blob_client.h" class BackupContainerAzureBlobStore final : public BackupContainerFileSystem, diff --git a/fdbclient/include/fdbclient/BackupContainerFileSystem.h b/fdbclient/include/fdbclient/BackupContainerFileSystem.h index 17245c9c39..c6819a864e 100644 --- a/fdbclient/include/fdbclient/BackupContainerFileSystem.h +++ b/fdbclient/include/fdbclient/BackupContainerFileSystem.h @@ -152,9 +152,10 @@ public: ExpireProgress* progress, Version restorableBeginVersion) final; - Future getSnapshotFileKeyRange(const RangeFile& file) final; + Future getSnapshotFileKeyRange(const RangeFile& file, Optional cx) final; Future> getRestoreSet(Version targetVersion, + Optional cx, VectorRef keyRangesFilter, bool logsOnly, Version beginVersion) final; diff --git a/flow/include/flow/BlobCipher.h b/fdbclient/include/fdbclient/BlobCipher.h similarity index 77% rename from flow/include/flow/BlobCipher.h rename to fdbclient/include/fdbclient/BlobCipher.h index 466ebba1b0..be2eae72b8 100644 --- a/flow/include/flow/BlobCipher.h +++ b/fdbclient/include/fdbclient/BlobCipher.h @@ -17,23 +17,28 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef FLOW_BLOB_CIPHER_H -#define FLOW_BLOB_CIPHER_H -#include "flow/ProtocolVersion.h" -#include "flow/serialize.h" +#ifndef FDBCLIENT_BLOB_CIPHER_H +#define FDBCLIENT_BLOB_CIPHER_H #pragma once +#include "fdbrpc/Stats.h" #include "flow/Arena.h" #include "flow/EncryptUtils.h" #include "flow/FastRef.h" #include "flow/flow.h" #include "flow/genericactors.actor.h" +#include "flow/Knobs.h" #include "flow/network.h" +#include "flow/Platform.h" +#include "flow/ProtocolVersion.h" +#include "flow/serialize.h" #include #include +#include #include #include +#include #include #include #include @@ -48,6 +53,59 @@ #define AES_256_KEY_LENGTH 32 #define AES_256_IV_LENGTH 16 +class BlobCipherMetrics : public NonCopyable { +public: + static BlobCipherMetrics* getInstance() { + static BlobCipherMetrics* instance = nullptr; + if (instance == nullptr) { + instance = new BlobCipherMetrics; + } + return instance; + } + + // Order of this enum has to match initializer of counterSets. 
+ enum UsageType : int { + TLOG = 0, + KV_MEMORY, + KV_REDWOOD, + BLOB_GRANULE, + BACKUP, + TEST, + MAX, + }; + + struct CounterSet { + Counter encryptCPUTimeNS; + Counter decryptCPUTimeNS; + LatencySample getCipherKeysLatency; + LatencySample getLatestCipherKeysLatency; + + CounterSet(CounterCollection& cc, std::string name); + }; + + static CounterSet& counters(UsageType t) { + ASSERT(t < UsageType::MAX); + return getInstance()->counterSets[int(t)]; + } + +private: + BlobCipherMetrics(); + + CounterCollection cc; + Future traceFuture; + +public: + Counter cipherKeyCacheHit; + Counter cipherKeyCacheMiss; + Counter cipherKeyCacheExpired; + Counter latestCipherKeyCacheHit; + Counter latestCipherKeyCacheMiss; + Counter latestCipherKeyCacheNeedsRefresh; + LatencySample getCipherKeysLatency; + LatencySample getLatestCipherKeysLatency; + std::array counterSets; +}; + // Encryption operations buffer management // Approach limits number of copies needed during encryption or decryption operations. // For encryption EncryptBuf is allocated using client supplied Arena and provided to AES library to capture @@ -59,7 +117,7 @@ class EncryptBuf : public ReferenceCounted, NonCopyable { public: EncryptBuf(int size, Arena& arena) : allocSize(size), logicalSize(size) { if (size > 0) { - buffer = new (arena) uint8_t[size]; + buffer = new (arena) uint8_t[size](); } else { buffer = nullptr; } @@ -83,9 +141,9 @@ private: #pragma pack(push, 1) // exact fit - no padding struct BlobCipherDetails { // Encryption domain boundary identifier. - EncryptCipherDomainId encryptDomainId = ENCRYPT_INVALID_DOMAIN_ID; + EncryptCipherDomainId encryptDomainId = INVALID_ENCRYPT_DOMAIN_ID; // BaseCipher encryption key identifier - EncryptCipherBaseKeyId baseCipherId = ENCRYPT_INVALID_CIPHER_KEY_ID; + EncryptCipherBaseKeyId baseCipherId = INVALID_ENCRYPT_CIPHER_KEY_ID; // Random salt EncryptCipherRandomSalt salt{}; @@ -135,7 +193,8 @@ typedef struct BlobCipherEncryptHeader { uint8_t headerVersion{}; uint8_t encryptMode{}; uint8_t authTokenMode{}; - uint8_t _reserved[4]{}; + uint8_t authTokenAlgo{}; + uint8_t _reserved[3]{}; } flags; uint64_t _padding{}; }; @@ -166,12 +225,12 @@ typedef struct BlobCipherEncryptHeader { struct { // Cipher text authentication token - uint8_t cipherTextAuthToken[AUTH_TOKEN_SIZE]{}; - uint8_t headerAuthToken[AUTH_TOKEN_SIZE]{}; + uint8_t cipherTextAuthToken[AUTH_TOKEN_MAX_SIZE]{}; + uint8_t headerAuthToken[AUTH_TOKEN_MAX_SIZE]{}; } multiAuthTokens; struct { - uint8_t authToken[AUTH_TOKEN_SIZE]{}; - uint8_t _reserved[AUTH_TOKEN_SIZE]{}; + uint8_t authToken[AUTH_TOKEN_MAX_SIZE]{}; + uint8_t _reserved[AUTH_TOKEN_MAX_SIZE]{}; } singleAuthToken; }; @@ -216,15 +275,20 @@ public: BlobCipherKey(const EncryptCipherDomainId& domainId, const EncryptCipherBaseKeyId& baseCiphId, const uint8_t* baseCiph, - int baseCiphLen); + int baseCiphLen, + const int64_t refreshAt, + int64_t expireAt); BlobCipherKey(const EncryptCipherDomainId& domainId, const EncryptCipherBaseKeyId& baseCiphId, const uint8_t* baseCiph, int baseCiphLen, - const EncryptCipherRandomSalt& salt); + const EncryptCipherRandomSalt& salt, + const int64_t refreshAt, + const int64_t expireAt); uint8_t* data() const { return cipher.get(); } - uint64_t getCreationTime() const { return creationTime; } + uint64_t getRefreshAtTS() const { return refreshAtTS; } + uint64_t getExpireAtTS() const { return expireAtTS; } EncryptCipherDomainId getDomainId() const { return encryptDomainId; } EncryptCipherRandomSalt getSalt() const { return randomSalt; } 
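`BlobCipherMetrics` above keeps one `CounterSet` per `UsageType` so encryption cost can be attributed to TLog, Redwood, blob granules, backup, and so on. The sketch below shows only the shape of that design: plain `std::atomic` counters and a function-local static stand in for flow's `Counter`, `LatencySample`, and the leaked-pointer singleton used in the real class.

```
#include <array>
#include <atomic>
#include <cstdint>
#include <iostream>

// Minimal per-usage-type counter registry, mimicking the BlobCipherMetrics shape.
class ToyCipherMetrics {
public:
    enum UsageType : int { TLOG = 0, KV_MEMORY, KV_REDWOOD, BLOB_GRANULE, BACKUP, TEST, MAX };

    struct CounterSet {
        std::atomic<uint64_t> encryptCPUTimeNS{ 0 };
        std::atomic<uint64_t> decryptCPUTimeNS{ 0 };
    };

    static ToyCipherMetrics* getInstance() {
        static ToyCipherMetrics instance; // constructed on first use
        return &instance;
    }

    static CounterSet& counters(UsageType t) { return getInstance()->counterSets[static_cast<int>(t)]; }

private:
    ToyCipherMetrics() = default;
    std::array<CounterSet, MAX> counterSets; // order matches the enum, as in the real class
};

int main() {
    ToyCipherMetrics::counters(ToyCipherMetrics::BACKUP).encryptCPUTimeNS += 1200;
    std::cout << ToyCipherMetrics::counters(ToyCipherMetrics::BACKUP).encryptCPUTimeNS.load() << "\n";
}
```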
EncryptCipherBaseKeyId getBaseCipherId() const { return baseCipherId; } @@ -243,6 +307,20 @@ public: randomSalt == details.salt; } + inline bool needsRefresh() { + if (refreshAtTS == std::numeric_limits::max()) { + return false; + } + return now() >= refreshAtTS ? true : false; + } + + inline bool isExpired() { + if (expireAtTS == std::numeric_limits::max()) { + return false; + } + return now() >= expireAtTS ? true : false; + } + void reset(); private: @@ -254,16 +332,20 @@ private: EncryptCipherBaseKeyId baseCipherId; // Random salt used for encryption cipher key derivation EncryptCipherRandomSalt randomSalt; - // Creation timestamp for the derived encryption cipher key - uint64_t creationTime; // Derived encryption cipher key std::unique_ptr cipher; + // CipherKey needs refreshAt + int64_t refreshAtTS; + // CipherKey is valid until + int64_t expireAtTS; void initKey(const EncryptCipherDomainId& domainId, const uint8_t* baseCiph, int baseCiphLen, const EncryptCipherBaseKeyId& baseCiphId, - const EncryptCipherRandomSalt& salt); + const EncryptCipherRandomSalt& salt, + const int64_t refreshAt, + const int64_t expireAt); void applyHmacSha256Derivation(); }; @@ -299,8 +381,7 @@ using BlobCipherKeyIdCacheMapCItr = struct BlobCipherKeyIdCache : ReferenceCounted { public: - BlobCipherKeyIdCache(); - explicit BlobCipherKeyIdCache(EncryptCipherDomainId dId); + explicit BlobCipherKeyIdCache(EncryptCipherDomainId dId, size_t* sizeStat); BlobCipherKeyIdCacheKey getCacheKey(const EncryptCipherBaseKeyId& baseCipherId, const EncryptCipherRandomSalt& salt); @@ -326,7 +407,9 @@ public: Reference insertBaseCipherKey(const EncryptCipherBaseKeyId& baseCipherId, const uint8_t* baseCipher, - int baseCipherLen); + int baseCipherLen, + const int64_t refreshAt, + const int64_t expireAt); // API enables inserting base encryption cipher details to the BlobCipherKeyIdCache // Given cipherKeys are immutable, attempting to re-insert same 'identical' cipherKey @@ -341,7 +424,9 @@ public: Reference insertBaseCipherKey(const EncryptCipherBaseKeyId& baseCipherId, const uint8_t* baseCipher, int baseCipherLen, - const EncryptCipherRandomSalt& salt); + const EncryptCipherRandomSalt& salt, + const int64_t refreshAt, + const int64_t expireAt); // API cleanup the cache by dropping all cached cipherKeys void cleanup(); @@ -349,11 +434,15 @@ public: // API returns list of all 'cached' cipherKeys std::vector> getAllCipherKeys(); + // Return number of cipher keys in the cahce. + size_t getSize() const { return keyIdCache.size(); } + private: EncryptCipherDomainId domainId; BlobCipherKeyIdCacheMap keyIdCache; Optional latestBaseCipherKeyId; Optional latestRandomSalt; + size_t* sizeStat; // pointer to the outer BlobCipherKeyCache size count. }; using BlobCipherDomainCacheMap = std::unordered_map>; @@ -377,7 +466,9 @@ public: Reference insertCipherKey(const EncryptCipherDomainId& domainId, const EncryptCipherBaseKeyId& baseCipherId, const uint8_t* baseCipher, - int baseCipherLen); + int baseCipherLen, + const int64_t refreshAt, + const int64_t expireAt); // Enable clients to insert base encryption cipher details to the BlobCipherKeyCache. 
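The new `refreshAtTS`/`expireAtTS` fields and the `needsRefresh()`/`isExpired()` checks above treat `std::numeric_limits<int64_t>::max()` as "never". Below is a self-contained sketch of that bookkeeping; it uses wall-clock seconds in place of flow's `now()`, and the refresh/expire intervals are arbitrary example values.

```
#include <chrono>
#include <cstdint>
#include <iostream>
#include <limits>

// Sketch of the refreshAt/expireAt bookkeeping on a cached cipher key.
int64_t nowSeconds() {
    using namespace std::chrono;
    return duration_cast<seconds>(system_clock::now().time_since_epoch()).count();
}

struct ToyCipherKey {
    int64_t refreshAtTS = std::numeric_limits<int64_t>::max(); // max() means "never refresh"
    int64_t expireAtTS = std::numeric_limits<int64_t>::max();  // max() means "never expire"

    bool needsRefresh() const {
        return refreshAtTS != std::numeric_limits<int64_t>::max() && nowSeconds() >= refreshAtTS;
    }
    bool isExpired() const {
        return expireAtTS != std::numeric_limits<int64_t>::max() && nowSeconds() >= expireAtTS;
    }
};

int main() {
    ToyCipherKey key;
    key.refreshAtTS = nowSeconds() + 600;  // ask the KMS for a fresh key in 10 minutes
    key.expireAtTS = nowSeconds() + 3600;  // refuse to use this key after an hour
    std::cout << key.needsRefresh() << " " << key.isExpired() << "\n"; // 0 0
}
```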
// The cipherKeys are indexed using 'baseCipherId', given cipherKeys are immutable, @@ -394,7 +485,9 @@ public: const EncryptCipherBaseKeyId& baseCipherId, const uint8_t* baseCipher, int baseCipherLen, - const EncryptCipherRandomSalt& salt); + const EncryptCipherRandomSalt& salt, + const int64_t refreshAt, + const int64_t expireAt); // API returns the last insert cipherKey for a given encryption domain Id. // If domain Id is invalid, it would throw 'encrypt_invalid_id' exception, @@ -414,10 +507,19 @@ public: // API enables dropping all 'cached' cipherKeys for a given encryption domain Id. // Useful to cleanup cache if an encryption domain gets removed/destroyed etc. - void resetEncryptDomainId(const EncryptCipherDomainId domainId); + // Total number of cipher keys in the cache. + size_t getSize() const { return size; } + static Reference getInstance() { + static bool cleanupRegistered = false; + if (!cleanupRegistered) { + // We try to avoid cipher keys appear in core dumps, so we clean them up before crash. + // TODO(yiwu): use of MADV_DONTDUMP instead of the crash handler. + registerCrashHandlerCallback(BlobCipherKeyCache::cleanup); + cleanupRegistered = true; + } if (g_network->isSimulated()) { return FlowSingleton::getInstance( []() { return makeReference(g_network->isSimulated()); }); @@ -433,6 +535,7 @@ public: private: BlobCipherDomainCacheMap domainCacheMap; + size_t size = 0; BlobCipherKeyCache() {} }; @@ -450,17 +553,30 @@ public: Reference hCipherKey, const uint8_t* iv, const int ivLen, - const EncryptAuthTokenMode mode); + const EncryptAuthTokenMode mode, + BlobCipherMetrics::UsageType usageType); EncryptBlobCipherAes265Ctr(Reference tCipherKey, Reference hCipherKey, - const EncryptAuthTokenMode mode); + const uint8_t* iv, + const int ivLen, + const EncryptAuthTokenMode mode, + const EncryptAuthTokenAlgo algo, + BlobCipherMetrics::UsageType usageType); + EncryptBlobCipherAes265Ctr(Reference tCipherKey, + Reference hCipherKey, + const EncryptAuthTokenMode mode, + BlobCipherMetrics::UsageType usageType); + EncryptBlobCipherAes265Ctr(Reference tCipherKey, + Reference hCipherKey, + const EncryptAuthTokenMode mode, + const EncryptAuthTokenAlgo algo, + BlobCipherMetrics::UsageType usageType); ~EncryptBlobCipherAes265Ctr(); Reference encrypt(const uint8_t* plaintext, const int plaintextLen, BlobCipherEncryptHeader* header, Arena&); - Standalone encryptBlobGranuleChunk(const uint8_t* plaintext, const int plaintextLen); private: EVP_CIPHER_CTX* ctx; @@ -468,6 +584,8 @@ private: Reference headerCipherKey; EncryptAuthTokenMode authTokenMode; uint8_t iv[AES_256_IV_LENGTH]; + BlobCipherMetrics::UsageType usageType; + EncryptAuthTokenAlgo authTokenAlgo; void init(); }; @@ -479,7 +597,8 @@ class DecryptBlobCipherAes256Ctr final : NonCopyable, public ReferenceCounted tCipherKey, Reference hCipherKey, - const uint8_t* iv); + const uint8_t* iv, + BlobCipherMetrics::UsageType usageType); ~DecryptBlobCipherAes256Ctr(); Reference decrypt(const uint8_t* ciphertext, @@ -498,22 +617,20 @@ private: Reference headerCipherKey; bool headerAuthTokenValidationDone; bool authTokensValidationDone; + BlobCipherMetrics::UsageType usageType; void verifyEncryptHeaderMetadata(const BlobCipherEncryptHeader& header); void verifyAuthTokens(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeader& header, - uint8_t* buff, Arena& arena); void verifyHeaderSingleAuthToken(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeader& header, - uint8_t* buff, Arena& arena); 
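The auth-token verification helpers above, together with the `HmacSha256DigestGen`/`computeAuthToken()` interfaces that follow, compute a MAC over a multi-part payload (header plus ciphertext) without first concatenating it. Below is a sketch of that pattern using OpenSSL's incremental HMAC-SHA256 API (available since 1.1.0, deprecated but still present in 3.0); the key and payload fragments are made up, and the AES-256-CMAC variant named in this header would use the analogous `CMAC_*` calls instead.

```
#include <openssl/evp.h>
#include <openssl/hmac.h>
#include <cstdio>
#include <utility>
#include <vector>

// Compute an HMAC-SHA256 token over several (pointer, length) fragments,
// feeding each fragment to the HMAC context without copying. Link with -lcrypto.
std::vector<unsigned char> hmacSha256(const std::vector<std::pair<const unsigned char*, size_t>>& payload,
                                      const unsigned char* key,
                                      int keyLen) {
    std::vector<unsigned char> out(EVP_MAX_MD_SIZE);
    unsigned int outLen = 0;
    HMAC_CTX* ctx = HMAC_CTX_new();
    HMAC_Init_ex(ctx, key, keyLen, EVP_sha256(), nullptr);
    for (const auto& part : payload)
        HMAC_Update(ctx, part.first, part.second);
    HMAC_Final(ctx, out.data(), &outLen);
    HMAC_CTX_free(ctx);
    out.resize(outLen); // 32 bytes for SHA-256
    return out;
}

int main() {
    const unsigned char key[] = "example-header-cipher-key"; // made-up key material
    const unsigned char header[] = "header-bytes";
    const unsigned char body[] = "ciphertext-bytes";
    auto token = hmacSha256({ { header, sizeof(header) - 1 }, { body, sizeof(body) - 1 } },
                            key, sizeof(key) - 1);
    std::printf("token length: %zu\n", token.size());
}
```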
void verifyHeaderMultiAuthToken(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeader& header, - uint8_t* buff, Arena& arena); }; @@ -522,16 +639,32 @@ public: HmacSha256DigestGen(const unsigned char* key, size_t len); ~HmacSha256DigestGen(); HMAC_CTX* getCtx() const { return ctx; } - StringRef digest(unsigned char const* data, size_t len, Arena&); + unsigned int digest(const std::vector>& payload, + unsigned char* buf, + unsigned int bufLen); private: HMAC_CTX* ctx; }; -StringRef computeAuthToken(const uint8_t* payload, - const int payloadLen, - const uint8_t* key, - const int keyLen, - Arena& arena); +class Aes256CmacDigestGen final : NonCopyable { +public: + Aes256CmacDigestGen(const unsigned char* key, size_t len); + ~Aes256CmacDigestGen(); + CMAC_CTX* getCtx() const { return ctx; } + size_t digest(const std::vector>& payload, uint8_t* digest, int digestlen); -#endif // FLOW_BLOB_CIPHER_H \ No newline at end of file +private: + CMAC_CTX* ctx; +}; + +void computeAuthToken(const std::vector>& payload, + const uint8_t* key, + const int keyLen, + unsigned char* digestBuf, + const EncryptAuthTokenAlgo algo, + unsigned int digestMaxBufSz); + +EncryptAuthTokenMode getEncryptAuthTokenMode(const EncryptAuthTokenMode mode); + +#endif // FDBCLIENT_BLOB_CIPHER_H \ No newline at end of file diff --git a/fdbclient/include/fdbclient/BlobGranuleCommon.h b/fdbclient/include/fdbclient/BlobGranuleCommon.h index 4561779794..6f530f020d 100644 --- a/fdbclient/include/fdbclient/BlobGranuleCommon.h +++ b/fdbclient/include/fdbclient/BlobGranuleCommon.h @@ -22,10 +22,10 @@ #define FDBCLIENT_BLOBGRANULECOMMON_H #pragma once +#include "fdbclient/BlobCipher.h" #include "fdbclient/CommitTransaction.h" #include "fdbclient/FDBTypes.h" -#include "flow/BlobCipher.h" #include "flow/EncryptUtils.h" #include "flow/IRandom.h" #include "flow/serialize.h" @@ -35,7 +35,6 @@ #define BG_ENCRYPT_COMPRESS_DEBUG false // file format of actual blob files -// FIXME: use VecSerStrategy::String serialization for this struct GranuleSnapshot : VectorRef { constexpr static FileIdentifier file_identifier = 1300395; @@ -46,6 +45,7 @@ struct GranuleSnapshot : VectorRef { } }; +// Deltas in version order struct GranuleDeltas : VectorRef { constexpr static FileIdentifier file_identifier = 8563013; @@ -55,6 +55,13 @@ struct GranuleDeltas : VectorRef { } }; +struct GranuleMaterializeStats { + int64_t inputBytes; + int64_t outputBytes; + + GranuleMaterializeStats() : inputBytes(0), outputBytes(0) {} +}; + struct BlobGranuleCipherKeysMeta { EncryptCipherDomainId textDomainId; EncryptCipherBaseKeyId textBaseCipherId; @@ -142,16 +149,11 @@ struct BlobGranuleCipherKeysMetaRef { StringRef ivRef; BlobGranuleCipherKeysMetaRef() {} - BlobGranuleCipherKeysMetaRef(Arena& to, - const EncryptCipherDomainId tDomainId, - const EncryptCipherBaseKeyId tBaseCipherId, - const EncryptCipherRandomSalt tSalt, - const EncryptCipherDomainId hDomainId, - const EncryptCipherBaseKeyId hBaseCipherId, - const EncryptCipherRandomSalt hSalt, - const std::string& ivStr) - : textDomainId(tDomainId), textBaseCipherId(tBaseCipherId), textSalt(tSalt), headerDomainId(hDomainId), - headerBaseCipherId(hBaseCipherId), headerSalt(hSalt), ivRef(StringRef(to, ivStr)) {} + BlobGranuleCipherKeysMetaRef(Arena& to, BlobGranuleCipherKeysMeta cipherKeysMeta) + : textDomainId(cipherKeysMeta.textDomainId), textBaseCipherId(cipherKeysMeta.textBaseCipherId), + textSalt(cipherKeysMeta.textSalt), headerDomainId(cipherKeysMeta.headerDomainId), + 
headerBaseCipherId(cipherKeysMeta.headerBaseCipherId), headerSalt(cipherKeysMeta.headerSalt), + ivRef(StringRef(to, cipherKeysMeta.ivStr)) {} template void serialize(Ar& ar) { @@ -161,16 +163,31 @@ struct BlobGranuleCipherKeysMetaRef { struct BlobFilePointerRef { constexpr static FileIdentifier file_identifier = 5253554; + // Serializable fields StringRef filename; int64_t offset; int64_t length; int64_t fullFileLength; - Optional cipherKeysMetaRef; + Optional cipherKeysCtx; + + // Non-serializable fields + Optional + cipherKeysMetaRef; // Placeholder to cache information sufficient to lookup encryption ciphers BlobFilePointerRef() {} + BlobFilePointerRef(Arena& to, const std::string& filename, int64_t offset, int64_t length, int64_t fullFileLength) : filename(to, filename), offset(offset), length(length), fullFileLength(fullFileLength) {} + BlobFilePointerRef(Arena& to, + const std::string& filename, + int64_t offset, + int64_t length, + int64_t fullFileLength, + Optional ciphKeysCtx) + : filename(to, filename), offset(offset), length(length), fullFileLength(fullFileLength), + cipherKeysCtx(ciphKeysCtx) {} + BlobFilePointerRef(Arena& to, const std::string& filename, int64_t offset, @@ -179,30 +196,23 @@ struct BlobFilePointerRef { Optional ciphKeysMeta) : filename(to, filename), offset(offset), length(length), fullFileLength(fullFileLength) { if (ciphKeysMeta.present()) { - cipherKeysMetaRef = BlobGranuleCipherKeysMetaRef(to, - ciphKeysMeta.get().textDomainId, - ciphKeysMeta.get().textBaseCipherId, - ciphKeysMeta.get().textSalt, - ciphKeysMeta.get().headerDomainId, - ciphKeysMeta.get().headerBaseCipherId, - ciphKeysMeta.get().headerSalt, - ciphKeysMeta.get().ivStr); + cipherKeysMetaRef = BlobGranuleCipherKeysMetaRef(to, ciphKeysMeta.get()); } } template void serialize(Ar& ar) { - serializer(ar, filename, offset, length, fullFileLength, cipherKeysMetaRef); + serializer(ar, filename, offset, length, fullFileLength, cipherKeysCtx); } std::string toString() const { std::stringstream ss; ss << filename.toString() << ":" << offset << ":" << length << ":" << fullFileLength; - if (cipherKeysMetaRef.present()) { - ss << ":CipherKeysMeta:TextCipher:" << cipherKeysMetaRef.get().textDomainId << ":" - << cipherKeysMetaRef.get().textBaseCipherId << ":" << cipherKeysMetaRef.get().textSalt - << ":HeaderCipher:" << cipherKeysMetaRef.get().headerDomainId << ":" - << cipherKeysMetaRef.get().headerBaseCipherId << ":" << cipherKeysMetaRef.get().headerSalt; + if (cipherKeysCtx.present()) { + ss << ":CipherKeysCtx:TextCipher:" << cipherKeysCtx.get().textCipherKey.encryptDomainId << ":" + << cipherKeysCtx.get().textCipherKey.baseCipherId << ":" << cipherKeysCtx.get().textCipherKey.salt + << ":HeaderCipher:" << cipherKeysCtx.get().headerCipherKey.encryptDomainId << ":" + << cipherKeysCtx.get().headerCipherKey.baseCipherId << ":" << cipherKeysCtx.get().headerCipherKey.salt; } return std::move(ss).str(); } @@ -223,32 +233,75 @@ struct BlobGranuleChunkRef { VectorRef deltaFiles; GranuleDeltas newDeltas; Optional tenantPrefix; - Optional cipherKeysCtx; template void serialize(Ar& ar) { - serializer(ar, - keyRange, - includedVersion, - snapshotVersion, - snapshotFile, - deltaFiles, - newDeltas, - tenantPrefix, - cipherKeysCtx); + serializer(ar, keyRange, includedVersion, snapshotVersion, snapshotFile, deltaFiles, newDeltas, tenantPrefix); } }; +struct BlobGranuleSummaryRef { + constexpr static FileIdentifier file_identifier = 9774587; + KeyRangeRef keyRange; + Version snapshotVersion; + int64_t snapshotSize; + Version 
deltaVersion; + int64_t deltaSize; + + template + void serialize(Ar& ar) { + serializer(ar, keyRange, snapshotVersion, snapshotSize, deltaVersion, deltaSize); + } +}; + +BlobGranuleSummaryRef summarizeGranuleChunk(Arena& ar, const BlobGranuleChunkRef& chunk); + enum BlobGranuleSplitState { Unknown = 0, Initialized = 1, Assigned = 2, Done = 3 }; +// Boundary metadata for each range indexed by the beginning of the range. +struct BlobGranuleMergeBoundary { + constexpr static FileIdentifier file_identifier = 557861; + + // Hard boundaries represent backing regions we want to keep separate. + bool buddy; + + template + void serialize(Ar& ar) { + serializer(ar, buddy); + } +}; + struct BlobGranuleHistoryValue { constexpr static FileIdentifier file_identifier = 991434; UID granuleID; - VectorRef> parentGranules; + VectorRef parentBoundaries; + VectorRef parentVersions; template void serialize(Ar& ar) { - serializer(ar, granuleID, parentGranules); + serializer(ar, granuleID, parentBoundaries, parentVersions); + } +}; + +struct GranuleHistory { + KeyRange range; + Version version; + Standalone value; + + GranuleHistory() {} + + GranuleHistory(KeyRange range, Version version, Standalone value) + : range(range), version(version), value(value) {} +}; + +// A manifest to assist full fdb restore from blob granule files +struct BlobManifest { + constexpr static FileIdentifier file_identifier = 298872; + VectorRef rows; + + template + void serialize(Ar& ar) { + serializer(ar, rows); } }; diff --git a/fdbclient/include/fdbclient/BlobGranuleFiles.h b/fdbclient/include/fdbclient/BlobGranuleFiles.h index 7ee7a62bd4..23faff3d03 100644 --- a/fdbclient/include/fdbclient/BlobGranuleFiles.h +++ b/fdbclient/include/fdbclient/BlobGranuleFiles.h @@ -26,18 +26,25 @@ #include "fdbclient/BlobGranuleCommon.h" #include "flow/CompressionUtils.h" -Value serializeChunkedSnapshot(Standalone snapshot, - int chunks, +Value serializeChunkedSnapshot(const Standalone& fileNameRef, + const Standalone& snapshot, + int chunkSize, Optional compressFilter, - Optional cipherKeysCtx = Optional()); + Optional cipherKeysCtx = {}); -// FIXME: support sorted and chunked delta files +Value serializeChunkedDeltaFile(const Standalone& fileNameRef, + const Standalone& deltas, + const KeyRangeRef& fileRange, + int chunkSize, + Optional compressFilter, + Optional cipherKeysCtx = {}); ErrorOr loadAndMaterializeBlobGranules(const Standalone>& files, const KeyRangeRef& keyRange, Version beginVersion, Version readVersion, - ReadBlobGranuleContext granuleContext); + ReadBlobGranuleContext granuleContext, + GranuleMaterializeStats& stats); RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, KeyRangeRef keyRange, diff --git a/fdbclient/include/fdbclient/BlobGranuleReader.actor.h b/fdbclient/include/fdbclient/BlobGranuleReader.actor.h index a9008b03eb..395b76b26c 100644 --- a/fdbclient/include/fdbclient/BlobGranuleReader.actor.h +++ b/fdbclient/include/fdbclient/BlobGranuleReader.actor.h @@ -51,5 +51,7 @@ ACTOR Future readBlobGranules(BlobGranuleFileRequest request, Reference bstore, PromiseStream results); +bool isRangeFullyCovered(KeyRange range, Standalone> blobChunks); + #include "flow/unactorcompiler.h" #endif diff --git a/fdbclient/include/fdbclient/BlobWorkerCommon.h b/fdbclient/include/fdbclient/BlobWorkerCommon.h index 9912d4d2c7..9539db459b 100644 --- a/fdbclient/include/fdbclient/BlobWorkerCommon.h +++ b/fdbclient/include/fdbclient/BlobWorkerCommon.h @@ -30,7 +30,7 @@ struct BlobWorkerStats { Counter deltaBytesWritten, 
snapshotBytesWritten; Counter bytesReadFromFDBForInitialSnapshot; Counter bytesReadFromS3ForCompaction; - Counter rangeAssignmentRequests, readRequests; + Counter rangeAssignmentRequests, readRequests, summaryReads; Counter wrongShardServer; Counter changeFeedInputBytes; Counter readReqTotalFilesReturned; @@ -41,16 +41,33 @@ struct BlobWorkerStats { Counter readRequestsWithBegin; Counter readRequestsCollapsed; Counter flushGranuleReqs; + Counter compressionBytesRaw; + Counter compressionBytesFinal; + Counter fullRejections; + Counter forceFlushCleanups; int numRangesAssigned; int mutationBytesBuffered; int activeReadRequests; int granulesPendingSplitCheck; + Version minimumCFVersion; + Version cfVersionLag; + int notAtLatestChangeFeeds; + int64_t lastResidentMemory; + int64_t estimatedMaxResidentMemory; + + Reference initialSnapshotLock; + Reference resnapshotLock; + Reference deltaWritesLock; Future logger; // Current stats maintained for a given blob worker process - explicit BlobWorkerStats(UID id, double interval) + explicit BlobWorkerStats(UID id, + double interval, + Reference initialSnapshotLock, + Reference resnapshotLock, + Reference deltaWritesLock) : cc("BlobWorkerStats", id.toString()), s3PutReqs("S3PutReqs", cc), s3GetReqs("S3GetReqs", cc), s3DeleteReqs("S3DeleteReqs", cc), @@ -59,17 +76,32 @@ struct BlobWorkerStats { bytesReadFromFDBForInitialSnapshot("BytesReadFromFDBForInitialSnapshot", cc), bytesReadFromS3ForCompaction("BytesReadFromS3ForCompaction", cc), rangeAssignmentRequests("RangeAssignmentRequests", cc), readRequests("ReadRequests", cc), - wrongShardServer("WrongShardServer", cc), changeFeedInputBytes("RangeFeedInputBytes", cc), - readReqTotalFilesReturned("ReadReqTotalFilesReturned", cc), + summaryReads("SummaryReads", cc), wrongShardServer("WrongShardServer", cc), + changeFeedInputBytes("ChangeFeedInputBytes", cc), readReqTotalFilesReturned("ReadReqTotalFilesReturned", cc), readReqDeltaBytesReturned("ReadReqDeltaBytesReturned", cc), commitVersionChecks("CommitVersionChecks", cc), granuleUpdateErrors("GranuleUpdateErrors", cc), granuleRequestTimeouts("GranuleRequestTimeouts", cc), readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc), - flushGranuleReqs("FlushGranuleReqs", cc), numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), - granulesPendingSplitCheck(0) { + flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc), + compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc), + forceFlushCleanups("ForceFlushCleanups", cc), numRangesAssigned(0), mutationBytesBuffered(0), + activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), cfVersionLag(0), + notAtLatestChangeFeeds(0), lastResidentMemory(0), estimatedMaxResidentMemory(0), + initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), deltaWritesLock(deltaWritesLock) { specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; }); specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; }); specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; }); specialCounter(cc, "GranulesPendingSplitCheck", [this]() { return this->granulesPendingSplitCheck; }); + specialCounter(cc, "MinimumChangeFeedVersion", [this]() { return this->minimumCFVersion; }); + specialCounter(cc, "CFVersionLag", [this]() { return this->cfVersionLag; }); + specialCounter(cc, 
"NotAtLatestChangeFeeds", [this]() { return this->notAtLatestChangeFeeds; }); + specialCounter(cc, "LastResidentMemory", [this]() { return this->lastResidentMemory; }); + specialCounter(cc, "EstimatedMaxResidentMemory", [this]() { return this->estimatedMaxResidentMemory; }); + specialCounter(cc, "InitialSnapshotsActive", [this]() { return this->initialSnapshotLock->activePermits(); }); + specialCounter(cc, "InitialSnapshotsWaiting", [this]() { return this->initialSnapshotLock->waiters(); }); + specialCounter(cc, "ReSnapshotsActive", [this]() { return this->resnapshotLock->activePermits(); }); + specialCounter(cc, "ReSnapshotsWaiting", [this]() { return this->resnapshotLock->waiters(); }); + specialCounter(cc, "DeltaFileWritesActive", [this]() { return this->deltaWritesLock->activePermits(); }); + specialCounter(cc, "DeltaFileWritesWaiting", [this]() { return this->deltaWritesLock->waiters(); }); logger = traceCounters("BlobWorkerMetrics", id, interval, &cc, "BlobWorkerMetrics"); } diff --git a/fdbclient/include/fdbclient/BlobWorkerInterface.h b/fdbclient/include/fdbclient/BlobWorkerInterface.h index de370d248f..69d938300e 100644 --- a/fdbclient/include/fdbclient/BlobWorkerInterface.h +++ b/fdbclient/include/fdbclient/BlobWorkerInterface.h @@ -30,15 +30,15 @@ struct BlobWorkerInterface { constexpr static FileIdentifier file_identifier = 8358753; - // TODO: mimic what StorageServerInterface does with sequential endpoint IDs RequestStream> waitFailure; - RequestStream blobGranuleFileRequest; + PublicRequestStream blobGranuleFileRequest; RequestStream assignBlobRangeRequest; RequestStream revokeBlobRangeRequest; RequestStream granuleAssignmentsRequest; RequestStream granuleStatusStreamRequest; RequestStream haltBlobWorker; RequestStream flushGranuleRequest; + RequestStream minBlobVersionRequest; struct LocalityData locality; UID myId; @@ -57,6 +57,7 @@ struct BlobWorkerInterface { streams.push_back(granuleStatusStreamRequest.getReceiver()); streams.push_back(haltBlobWorker.getReceiver()); streams.push_back(flushGranuleRequest.getReceiver()); + streams.push_back(minBlobVersionRequest.getReceiver()); FlowTransport::transport().addEndpoints(streams); } UID id() const { return myId; } @@ -72,7 +73,7 @@ struct BlobWorkerInterface { serializer(ar, myId, locality, waitFailure); if (Archive::isDeserializing) { blobGranuleFileRequest = - RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(1)); + PublicRequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(1)); assignBlobRangeRequest = RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(2)); revokeBlobRangeRequest = @@ -85,6 +86,8 @@ struct BlobWorkerInterface { RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(6)); flushGranuleRequest = RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(7)); + minBlobVersionRequest = + RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(8)); } } }; @@ -110,13 +113,16 @@ struct BlobGranuleFileRequest { Version readVersion; bool canCollapseBegin = true; TenantInfo tenantInfo; + bool summarize = false; ReplyPromise reply; BlobGranuleFileRequest() {} + bool verify() const { return tenantInfo.isAuthorized(); } + template void serialize(Ar& ar) { - serializer(ar, keyRange, beginVersion, readVersion, canCollapseBegin, tenantInfo, reply, arena); + serializer(ar, keyRange, beginVersion, readVersion, canCollapseBegin, tenantInfo, summarize, reply, arena); } }; @@ -137,6 +143,28 @@ struct RevokeBlobRangeRequest { } }; +struct MinBlobVersionReply { + constexpr static FileIdentifier 
file_identifier = 6857512; + Version version; + + template + void serialize(Ar& ar) { + serializer(ar, version); + } +}; + +struct MinBlobVersionRequest { + constexpr static FileIdentifier file_identifier = 4833278; + Version grv; + ReplyPromise reply; + + MinBlobVersionRequest() {} + + template + void serialize(Ar& ar) { + serializer(ar, grv, reply); + } +}; /* * Continue: Blob worker should continue handling a granule that was evaluated for a split * Normal: Blob worker should open the granule and start processing it @@ -172,6 +200,7 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { KeyRange granuleRange; bool doSplit; bool writeHotSplit; + bool initialSplitTooBig; int64_t continueEpoch; int64_t continueSeqno; UID granuleID; @@ -180,11 +209,13 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { bool mergeCandidate; int64_t originalEpoch; int64_t originalSeqno; + Optional proposedSplitKey; GranuleStatusReply() {} explicit GranuleStatusReply(KeyRange range, bool doSplit, bool writeHotSplit, + bool initialSplitTooBig, int64_t continueEpoch, int64_t continueSeqno, UID granuleID, @@ -193,11 +224,15 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { bool mergeCandidate, int64_t originalEpoch, int64_t originalSeqno) - : granuleRange(range), doSplit(doSplit), writeHotSplit(writeHotSplit), continueEpoch(continueEpoch), - continueSeqno(continueSeqno), granuleID(granuleID), startVersion(startVersion), blockedVersion(blockedVersion), - mergeCandidate(mergeCandidate), originalEpoch(originalEpoch), originalSeqno(originalSeqno) {} + : granuleRange(range), doSplit(doSplit), writeHotSplit(writeHotSplit), initialSplitTooBig(initialSplitTooBig), + continueEpoch(continueEpoch), continueSeqno(continueSeqno), granuleID(granuleID), startVersion(startVersion), + blockedVersion(blockedVersion), mergeCandidate(mergeCandidate), originalEpoch(originalEpoch), + originalSeqno(originalSeqno) {} - int expectedSize() const { return sizeof(GranuleStatusReply) + granuleRange.expectedSize(); } + int expectedSize() const { + return sizeof(GranuleStatusReply) + granuleRange.expectedSize() + + (proposedSplitKey.present() ? proposedSplitKey.get().expectedSize() : 0); + } template void serialize(Ar& ar) { @@ -207,6 +242,7 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { granuleRange, doSplit, writeHotSplit, + initialSplitTooBig, continueEpoch, continueSeqno, granuleID, @@ -214,7 +250,8 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { blockedVersion, mergeCandidate, originalEpoch, - originalSeqno); + originalSeqno, + proposedSplitKey); } }; diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index 6e60b56cbe..61f0359539 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -39,10 +39,6 @@ public: double FAILURE_MAX_DELAY; double FAILURE_MIN_DELAY; - double FAILURE_TIMEOUT_DELAY; - double CLIENT_FAILURE_TIMEOUT_DELAY; - double FAILURE_EMERGENCY_DELAY; - double FAILURE_MAX_GENERATIONS; double RECOVERY_DELAY_START_GENERATION; double RECOVERY_DELAY_SECONDS_PER_GENERATION; double MAX_GENERATIONS; @@ -61,6 +57,7 @@ public: double WRONG_SHARD_SERVER_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is // mostly wrong (e.g. 
dumping the database after a test) double FUTURE_VERSION_RETRY_DELAY; + double GRV_ERROR_RETRY_DELAY; double UNKNOWN_TENANT_RETRY_DELAY; int REPLY_BYTE_LIMIT; double DEFAULT_BACKOFF; @@ -81,7 +78,7 @@ public: int64_t CHANGE_FEED_CACHE_SIZE; double CHANGE_FEED_POP_TIMEOUT; int64_t CHANGE_FEED_STREAM_MIN_BYTES; - int64_t TENANT_PREFIX_SIZE_LIMIT; + double CHANGE_FEED_START_INTERVAL; int MAX_BATCH_SIZE; double GRV_BATCH_TIMEOUT; @@ -110,7 +107,6 @@ public: int RANGESTREAM_BUFFERED_FRAGMENTS_LIMIT; bool QUARANTINE_TSS_ON_MISMATCH; double CHANGE_FEED_EMPTY_BATCH_TIME; - bool SHARD_ENCODE_LOCATION_METADATA; // KeyRangeMap int KRM_GET_RANGE_LIMIT; @@ -163,10 +159,8 @@ public: double BACKUP_AGGREGATE_POLL_RATE; double BACKUP_AGGREGATE_POLL_RATE_UPDATE_INTERVAL; int BACKUP_LOG_WRITE_BATCH_MAX_SIZE; - int BACKUP_LOG_ATOMIC_OPS_SIZE; int BACKUP_MAX_LOG_RANGES; int BACKUP_SIM_COPY_LOG_RANGES; - int BACKUP_OPERATION_COST_OVERHEAD; int BACKUP_VERSION_DELAY; int BACKUP_MAP_KEY_LOWER_LIMIT; int BACKUP_MAP_KEY_UPPER_LIMIT; @@ -206,6 +200,10 @@ public: int32_t DEFAULT_AUTO_RESOLVERS; int32_t DEFAULT_AUTO_LOGS; + double GLOBAL_CONFIG_REFRESH_BACKOFF; + double GLOBAL_CONFIG_REFRESH_MAX_BACKOFF; + double GLOBAL_CONFIG_REFRESH_TIMEOUT; + // Dynamic Knobs double COMMIT_QUORUM_TIMEOUT; double GET_GENERATION_QUORUM_TIMEOUT; @@ -217,12 +215,7 @@ public: int64_t CSI_SIZE_LIMIT; double CSI_STATUS_DELAY; - int HTTP_SEND_SIZE; - int HTTP_READ_SIZE; - int HTTP_VERBOSE_LEVEL; - std::string HTTP_REQUEST_ID_HEADER; bool HTTP_REQUEST_AWS_V4_HEADER; // setting this knob to true will enable AWS V4 style header. - bool HTTP_RESPONSE_SKIP_VERIFY_CHECKSUM_FOR_PARTIAL_CONTENT; // skip verify md5 checksum for 206 response std::string BLOBSTORE_ENCRYPTION_TYPE; int BLOBSTORE_CONNECT_TRIES; int BLOBSTORE_CONNECT_TIMEOUT; @@ -261,23 +254,21 @@ public: int MAX_TRANSACTION_TAG_LENGTH; int MAX_TAGS_PER_TRANSACTION; int COMMIT_SAMPLE_COST; // The expectation of sampling is every COMMIT_SAMPLE_COST sample once - int WRITE_COST_BYTE_FACTOR; int INCOMPLETE_SHARD_PLUS; // The size of (possible) incomplete shard when estimate clear range double READ_TAG_SAMPLE_RATE; // Communicated to clients from cluster double TAG_THROTTLE_SMOOTHING_WINDOW; double TAG_THROTTLE_RECHECK_INTERVAL; double TAG_THROTTLE_EXPIRATION_INTERVAL; + int64_t WRITE_COST_BYTE_FACTOR; // Used to round up the cost of write operations + int64_t READ_COST_BYTE_FACTOR; // Used to round up the cost of read operations // busyness reporting double BUSYNESS_SPIKE_START_THRESHOLD; double BUSYNESS_SPIKE_SATURATED_THRESHOLD; - // multi-version client control - int MVC_CLIENTLIB_CHUNK_SIZE; - int MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION; - // Blob Granules int BG_MAX_GRANULE_PARALLELISM; + int BG_TOO_MANY_GRANULES; // The coordinator key/value in storage server might be inconsistent to the value stored in the cluster file. // This might happen when a recovery is happening together with a cluster controller coordinator key change. 
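Several client-facing requests in this diff gain a `bool verify() const` method (`BlobGranuleFileRequest` above, `CommitTransactionRequest` and `GetKeyServerLocationsRequest` below) and are exposed via `PublicRequestStream`, so tenant authorization is checked before any work is done on the request. The sketch below shows only the idea with stand-in types; it is not FDB's RPC machinery or token format.

```
#include <iostream>
#include <string>

// A request carries tenant info, and a public-facing stream rejects it up front
// if the attached token does not authorize that tenant.
struct ToyTenantInfo {
    std::string tenantName;
    bool tokenValid = false;
    bool isAuthorized() const { return tokenValid; } // real check inspects the signed token
};

struct ToyGetLocationsRequest {
    ToyTenantInfo tenant;
    std::string begin, end;
    bool verify() const { return tenant.isAuthorized(); }
};

template <class Request>
bool deliverPublicRequest(const Request& req) {
    if (!req.verify()) {
        std::cout << "rejected: unauthorized tenant\n";
        return false; // the real stream replies with an error instead of processing
    }
    std::cout << "accepted\n";
    return true;
}

int main() {
    ToyGetLocationsRequest req{ { "tenant42", false }, "a", "b" };
    deliverPublicRequest(req); // rejected
    req.tenant.tokenValid = true;
    deliverPublicRequest(req); // accepted
}
```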
@@ -286,6 +277,19 @@ public: int CHANGE_QUORUM_BAD_STATE_RETRY_TIMES; double CHANGE_QUORUM_BAD_STATE_RETRY_DELAY; + // Tenants and Metacluster + int MAX_TENANTS_PER_CLUSTER; + int TENANT_TOMBSTONE_CLEANUP_INTERVAL; + int MAX_DATA_CLUSTERS; + int REMOVE_CLUSTER_TENANT_BATCH_SIZE; + int METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK; + double METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY; + double METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT; + int TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantEntryCache is refreshed + + // Encryption-at-rest + bool ENABLE_ENCRYPTION_CPU_TIME_LOGGING; + ClientKnobs(Randomize randomize); void initialize(Randomize randomize); }; diff --git a/fdbclient/include/fdbclient/ClientVersion.h b/fdbclient/include/fdbclient/ClientVersion.h index fe3068affc..c395a69cc2 100644 --- a/fdbclient/include/fdbclient/ClientVersion.h +++ b/fdbclient/include/fdbclient/ClientVersion.h @@ -37,7 +37,7 @@ struct ClientVersionRef { ClientVersionRef(StringRef clientVersion, StringRef sourceVersion, StringRef protocolVersion) : clientVersion(clientVersion), sourceVersion(sourceVersion), protocolVersion(protocolVersion) {} ClientVersionRef(StringRef versionString) { - std::vector parts = versionString.splitAny(LiteralStringRef(",")); + std::vector parts = versionString.splitAny(","_sr); if (parts.size() != 3) { initUnknown(); return; @@ -48,9 +48,9 @@ struct ClientVersionRef { } void initUnknown() { - clientVersion = LiteralStringRef("Unknown"); - sourceVersion = LiteralStringRef("Unknown"); - protocolVersion = LiteralStringRef("Unknown"); + clientVersion = "Unknown"_sr; + sourceVersion = "Unknown"_sr; + protocolVersion = "Unknown"_sr; } template diff --git a/fdbclient/include/fdbclient/ClusterInterface.h b/fdbclient/include/fdbclient/ClusterInterface.h index 14935f1700..a4e3da44f3 100644 --- a/fdbclient/include/fdbclient/ClusterInterface.h +++ b/fdbclient/include/fdbclient/ClusterInterface.h @@ -98,32 +98,44 @@ struct ClusterControllerClientInterface { } }; -template -struct ItemWithExamples { - T item; - int count; - std::vector> examples; - - ItemWithExamples() : item{}, count(0) {} - ItemWithExamples(T const& item, int count, std::vector> const& examples) - : item(item), count(count), examples(examples) {} - - template - void serialize(Ar& ar) { - serializer(ar, item, count, examples); - } -}; - struct OpenDatabaseRequest { constexpr static FileIdentifier file_identifier = 2799502; // Sent by the native API to the cluster controller to open a database and track client // info changes. 
Returns immediately if the current client info id is different from // knownClientInfoID; otherwise returns when it next changes (or perhaps after a long interval) - int clientCount; - std::vector> issues; - std::vector>> supportedVersions; - std::vector> maxProtocolSupported; + struct Samples { + int count; + + // network address / trace log group + std::set> samples; + + Samples() : count(0), samples{} {} + + template + void serialize(Ar& ar) { + serializer(ar, count, samples); + } + + // Merges a set of Samples into *this + Samples& operator+=(const Samples& other) { + count += other.count; + samples.insert(std::begin(other.samples), std::end(other.samples)); + + return *this; + } + }; + + int clientCount = 0; + + // Maps issue to Samples + std::map issues; + + // Maps ClientVersionRef to Samples + std::map, Samples> supportedVersions; + + // Maps max protocol to Samples + std::map maxProtocolSupported; UID knownClientInfoID; ReplyPromise reply; diff --git a/fdbclient/include/fdbclient/CommitProxyInterface.h b/fdbclient/include/fdbclient/CommitProxyInterface.h index 9a8095f808..1614aeacf0 100644 --- a/fdbclient/include/fdbclient/CommitProxyInterface.h +++ b/fdbclient/include/fdbclient/CommitProxyInterface.h @@ -25,16 +25,17 @@ #include #include -#include "fdbclient/FDBTypes.h" -#include "fdbclient/StorageServerInterface.h" #include "fdbclient/CommitTransaction.h" -#include "fdbclient/TagThrottle.actor.h" +#include "fdbclient/EncryptKeyProxyInterface.h" +#include "fdbclient/FDBTypes.h" #include "fdbclient/GlobalConfig.h" +#include "fdbclient/GrvProxyInterface.h" +#include "fdbclient/StorageServerInterface.h" +#include "fdbclient/TagThrottle.actor.h" #include "fdbclient/VersionVector.h" #include "fdbrpc/Stats.h" #include "fdbrpc/TimedRequest.h" -#include "GrvProxyInterface.h" struct CommitProxyInterface { constexpr static FileIdentifier file_identifier = 8954922; @@ -117,8 +118,12 @@ struct ClientDBInfo { Optional forward; std::vector history; UID clusterId; + bool isEncryptionEnabled = false; + Optional encryptKeyProxy; TenantMode tenantMode; + ClusterType clusterType = ClusterType::STANDALONE; + Optional metaclusterName; ClientDBInfo() {} @@ -130,7 +135,18 @@ struct ClientDBInfo { if constexpr (!is_fb_function) { ASSERT(ar.protocolVersion().isValid()); } - serializer(ar, grvProxies, commitProxies, id, forward, history, tenantMode, clusterId); + serializer(ar, + grvProxies, + commitProxies, + id, + forward, + history, + tenantMode, + isEncryptionEnabled, + encryptKeyProxy, + clusterId, + clusterType, + metaclusterName); } }; @@ -176,10 +192,12 @@ struct CommitTransactionRequest : TimedRequest { CommitTransactionRequest() : CommitTransactionRequest(SpanContext()) {} CommitTransactionRequest(SpanContext const& context) : spanContext(context), flags(0) {} + bool verify() const { return tenantInfo.isAuthorized(); } + template void serialize(Ar& ar) { serializer( - ar, transaction, reply, arena, flags, debugID, commitCostEstimation, tagSet, spanContext, tenantInfo); + ar, transaction, reply, flags, debugID, commitCostEstimation, tagSet, spanContext, tenantInfo, arena); } }; @@ -281,6 +299,8 @@ struct GetReadVersionRequest : TimedRequest { } } + bool verify() const { return true; } + bool operator<(GetReadVersionRequest const& rhs) const { return priority < rhs.priority; } template @@ -319,7 +339,7 @@ struct GetKeyServerLocationsReply { template void serialize(Ar& ar) { - serializer(ar, results, resultsTssMapping, tenantEntry, arena, resultsTagMapping); + serializer(ar, results, 
resultsTssMapping, tenantEntry, resultsTagMapping, arena); } }; @@ -327,7 +347,7 @@ struct GetKeyServerLocationsRequest { constexpr static FileIdentifier file_identifier = 9144680; Arena arena; SpanContext spanContext; - Optional tenant; + TenantInfo tenant; KeyRef begin; Optional end; int limit; @@ -342,7 +362,7 @@ struct GetKeyServerLocationsRequest { GetKeyServerLocationsRequest() : limit(0), reverse(false), minTenantVersion(latestVersion) {} GetKeyServerLocationsRequest(SpanContext spanContext, - Optional const& tenant, + TenantInfo const& tenant, KeyRef const& begin, Optional const& end, int limit, @@ -352,6 +372,8 @@ struct GetKeyServerLocationsRequest { : arena(arena), spanContext(spanContext), tenant(tenant), begin(begin), end(end), limit(limit), reverse(reverse), minTenantVersion(minTenantVersion) {} + bool verify() const { return tenant.isAuthorized(); } + template void serialize(Ar& ar) { serializer(ar, begin, end, limit, reverse, reply, spanContext, tenant, minTenantVersion, arena); @@ -521,7 +543,7 @@ struct ProxySnapRequest { template void serialize(Ar& ar) { - serializer(ar, snapPayload, snapUID, reply, arena, debugID); + serializer(ar, snapPayload, snapUID, reply, debugID, arena); } }; @@ -552,4 +574,32 @@ struct ExclusionSafetyCheckRequest { } }; +struct GlobalConfigRefreshReply { + constexpr static FileIdentifier file_identifier = 12680327; + Arena arena; + RangeResultRef result; + + GlobalConfigRefreshReply() {} + GlobalConfigRefreshReply(Arena const& arena, RangeResultRef result) : arena(arena), result(result) {} + + template + void serialize(Ar& ar) { + serializer(ar, result, arena); + } +}; + +struct GlobalConfigRefreshRequest { + constexpr static FileIdentifier file_identifier = 2828131; + Version lastKnown; + ReplyPromise reply; + + GlobalConfigRefreshRequest() {} + explicit GlobalConfigRefreshRequest(Version lastKnown) : lastKnown(lastKnown) {} + + template + void serialize(Ar& ar) { + serializer(ar, lastKnown, reply); + } +}; + #endif diff --git a/fdbclient/include/fdbclient/CommitTransaction.h b/fdbclient/include/fdbclient/CommitTransaction.h index f6757ac17e..7bc080b130 100644 --- a/fdbclient/include/fdbclient/CommitTransaction.h +++ b/fdbclient/include/fdbclient/CommitTransaction.h @@ -22,9 +22,13 @@ #define FLOW_FDBCLIENT_COMMITTRANSACTION_H #pragma once +#include "fdbclient/BlobCipher.h" #include "fdbclient/FDBTypes.h" +#include "fdbclient/GetEncryptCipherKeys.actor.h" #include "fdbclient/Knobs.h" #include "fdbclient/Tracing.h" +#include "flow/EncryptUtils.h" +#include "flow/Knobs.h" // The versioned message has wire format : -1, version, messages static const int32_t VERSION_HEADER = -1; @@ -79,7 +83,7 @@ struct MutationRef { CompareAndClear, Reserved_For_SpanContextMessage /* See fdbserver/SpanContextMessage.h */, Reserved_For_OTELSpanContextMessage, - Reserved_For_EncryptedMutationMessage /* See fdbserver/EncryptedMutationMessage.actor.h */, + Encrypted, /* Represents an encrypted mutation and cannot be used directly before decrypting */ MAX_ATOMIC_OP }; // This is stored this way for serialization purposes. @@ -128,6 +132,80 @@ struct MutationRef { } } + // An encrypted mutation has type Encrypted, encryption header (which contains encryption metadata) as param1, + // and the payload as param2. It can be serialize/deserialize as normal mutation, but can only be used after + // decryption via decrypt(). 
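// --- Editor's note: illustrative sketch only, not part of this patch. ---
// Rough shape of the intended round trip through the encrypt()/decrypt() helpers declared
// just below, assuming the caller has already fetched cipher keys (for example via the
// getLatestEncryptCipherKeys()/getEncryptCipherKeys() helpers added elsewhere in this change)
// and has chosen a BlobCipherMetrics::UsageType. The names latestKeys, keysByDetails, domainId
// and usageType are hypothetical placeholders; latestKeys must hold keys for both domainId and
// ENCRYPT_HEADER_DOMAIN_ID, as encrypt() asserts.
//
//   Arena arena;
//   MutationRef m(MutationRef::SetValue, "k"_sr, "v"_sr);
//   MutationRef enc = m.encrypt(latestKeys, domainId, arena, usageType);
//   ASSERT(enc.isEncrypted()); // type == Encrypted; header in param1, ciphertext in param2
//   MutationRef dec = enc.decrypt(keysByDetails, arena, usageType);
//   ASSERT(dec.type == m.type && dec.param1 == m.param1 && dec.param2 == m.param2);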
+ bool isEncrypted() const { return type == Encrypted; } + + const BlobCipherEncryptHeader* encryptionHeader() const { + ASSERT(isEncrypted()); + return reinterpret_cast(param1.begin()); + } + + MutationRef encrypt(const std::unordered_map>& cipherKeys, + const EncryptCipherDomainId& domainId, + Arena& arena, + BlobCipherMetrics::UsageType usageType) const { + ASSERT_NE(domainId, INVALID_ENCRYPT_DOMAIN_ID); + auto textCipherItr = cipherKeys.find(domainId); + auto headerCipherItr = cipherKeys.find(ENCRYPT_HEADER_DOMAIN_ID); + ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); + ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid()); + uint8_t iv[AES_256_IV_LENGTH] = { 0 }; + deterministicRandom()->randomBytes(iv, AES_256_IV_LENGTH); + BinaryWriter bw(AssumeVersion(ProtocolVersion::withEncryptionAtRest())); + bw << *this; + EncryptBlobCipherAes265Ctr cipher( + textCipherItr->second, + headerCipherItr->second, + iv, + AES_256_IV_LENGTH, + getEncryptAuthTokenMode(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE), + usageType); + BlobCipherEncryptHeader* header = new (arena) BlobCipherEncryptHeader; + StringRef headerRef(reinterpret_cast(header), sizeof(BlobCipherEncryptHeader)); + StringRef payload = + cipher.encrypt(static_cast(bw.getData()), bw.getLength(), header, arena)->toStringRef(); + return MutationRef(Encrypted, headerRef, payload); + } + + MutationRef encryptMetadata(const std::unordered_map>& cipherKeys, + Arena& arena, + BlobCipherMetrics::UsageType usageType) const { + return encrypt(cipherKeys, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, arena, usageType); + } + + MutationRef decrypt(TextAndHeaderCipherKeys cipherKeys, + Arena& arena, + BlobCipherMetrics::UsageType usageType, + StringRef* buf = nullptr) const { + const BlobCipherEncryptHeader* header = encryptionHeader(); + DecryptBlobCipherAes256Ctr cipher(cipherKeys.cipherTextKey, cipherKeys.cipherHeaderKey, header->iv, usageType); + StringRef plaintext = cipher.decrypt(param2.begin(), param2.size(), *header, arena)->toStringRef(); + if (buf != nullptr) { + *buf = plaintext; + } + ArenaReader reader(arena, plaintext, AssumeVersion(ProtocolVersion::withEncryptionAtRest())); + MutationRef mutation; + reader >> mutation; + return mutation; + } + + MutationRef decrypt(const std::unordered_map>& cipherKeys, + Arena& arena, + BlobCipherMetrics::UsageType usageType, + StringRef* buf = nullptr) const { + const BlobCipherEncryptHeader* header = encryptionHeader(); + auto textCipherItr = cipherKeys.find(header->cipherTextDetails); + auto headerCipherItr = cipherKeys.find(header->cipherHeaderDetails); + ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); + ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid()); + TextAndHeaderCipherKeys textAndHeaderKeys; + textAndHeaderKeys.cipherHeaderKey = headerCipherItr->second; + textAndHeaderKeys.cipherTextKey = textCipherItr->second; + return decrypt(textAndHeaderKeys, arena, usageType, buf); + } + // These masks define which mutation types have particular properties (they are used to implement // isSingleKeyMutation() etc) enum { @@ -190,6 +268,11 @@ struct CommitTransactionRef { VectorRef read_conflict_ranges; VectorRef write_conflict_ranges; VectorRef mutations; // metadata mutations + // encryptedMutations should be a 1-1 corespondence with mutations field above. 
That is either + // encryptedMutations.size() == 0 or encryptedMutations.size() == mutations.size() and encryptedMutations[i] = + // mutations[i].encrypt(). Currently this field is not serialized so clients should NOT set this field during a + // usual commit path. It is currently only used during backup mutation log restores. + VectorRef> encryptedMutations; Version read_snapshot = 0; bool report_conflicting_keys = false; bool lock_aware = false; // set when metadata mutations are present diff --git a/fdbclient/include/fdbclient/ConfigKnobs.h b/fdbclient/include/fdbclient/ConfigKnobs.h index 536bca16f3..168e2fed16 100644 --- a/fdbclient/include/fdbclient/ConfigKnobs.h +++ b/fdbclient/include/fdbclient/ConfigKnobs.h @@ -25,6 +25,8 @@ #include "fdbclient/FDBTypes.h" +typedef uint64_t CoordinatorsHash; + /* * KnobValueRefs are stored in the configuration database, and in local configuration files. They are created from * ParsedKnobValue objects, so it is assumed that the value type is correct for the corresponding knob name diff --git a/fdbclient/include/fdbclient/ConfigTransactionInterface.h b/fdbclient/include/fdbclient/ConfigTransactionInterface.h index 98b65e4c4b..dad60f2d04 100644 --- a/fdbclient/include/fdbclient/ConfigTransactionInterface.h +++ b/fdbclient/include/fdbclient/ConfigTransactionInterface.h @@ -65,16 +65,18 @@ struct ConfigTransactionGetGenerationReply { struct ConfigTransactionGetGenerationRequest { static constexpr FileIdentifier file_identifier = 138941; + CoordinatorsHash coordinatorsHash{ 0 }; // A hint to catch up lagging nodes: Optional lastSeenLiveVersion; ReplyPromise reply; ConfigTransactionGetGenerationRequest() = default; - explicit ConfigTransactionGetGenerationRequest(Optional const& lastSeenLiveVersion) - : lastSeenLiveVersion(lastSeenLiveVersion) {} + explicit ConfigTransactionGetGenerationRequest(CoordinatorsHash coordinatorsHash, + Optional const& lastSeenLiveVersion) + : coordinatorsHash(coordinatorsHash), lastSeenLiveVersion(lastSeenLiveVersion) {} template void serialize(Ar& ar) { - serializer(ar, lastSeenLiveVersion, reply); + serializer(ar, coordinatorsHash, lastSeenLiveVersion, reply); } }; @@ -92,39 +94,43 @@ struct ConfigTransactionGetReply { struct ConfigTransactionGetRequest { static constexpr FileIdentifier file_identifier = 923040; + CoordinatorsHash coordinatorsHash{ 0 }; ConfigGeneration generation; ConfigKey key; ReplyPromise reply; ConfigTransactionGetRequest() = default; - explicit ConfigTransactionGetRequest(ConfigGeneration generation, ConfigKey key) - : generation(generation), key(key) {} + explicit ConfigTransactionGetRequest(CoordinatorsHash coordinatorsHash, ConfigGeneration generation, ConfigKey key) + : coordinatorsHash(coordinatorsHash), generation(generation), key(key) {} template void serialize(Ar& ar) { - serializer(ar, generation, key, reply); + serializer(ar, coordinatorsHash, generation, key, reply); } }; struct ConfigTransactionCommitRequest { static constexpr FileIdentifier file_identifier = 103841; Arena arena; + CoordinatorsHash coordinatorsHash{ 0 }; ConfigGeneration generation{ ::invalidVersion, ::invalidVersion }; VectorRef mutations; ConfigCommitAnnotationRef annotation; ReplyPromise reply; ConfigTransactionCommitRequest() = default; - explicit ConfigTransactionCommitRequest(ConfigGeneration generation, + explicit ConfigTransactionCommitRequest(CoordinatorsHash coordinatorsHash, + ConfigGeneration generation, VectorRef mutations, ConfigCommitAnnotationRef annotation) - : generation(generation), mutations(arena, 
mutations), annotation(arena, annotation) {} + : coordinatorsHash(coordinatorsHash), generation(generation), mutations(arena, mutations), + annotation(arena, annotation) {} size_t expectedSize() const { return mutations.expectedSize() + annotation.expectedSize(); } template void serialize(Ar& ar) { - serializer(ar, arena, generation, mutations, annotation, reply); + serializer(ar, coordinatorsHash, generation, mutations, annotation, reply, arena); } }; @@ -144,15 +150,17 @@ struct ConfigTransactionGetConfigClassesReply { struct ConfigTransactionGetConfigClassesRequest { static constexpr FileIdentifier file_identifier = 7163400; + CoordinatorsHash coordinatorsHash{ 0 }; ConfigGeneration generation; ReplyPromise reply; ConfigTransactionGetConfigClassesRequest() = default; - explicit ConfigTransactionGetConfigClassesRequest(ConfigGeneration generation) : generation(generation) {} + explicit ConfigTransactionGetConfigClassesRequest(CoordinatorsHash coordinatorsHash, ConfigGeneration generation) + : coordinatorsHash(coordinatorsHash), generation(generation) {} template void serialize(Ar& ar) { - serializer(ar, generation); + serializer(ar, coordinatorsHash, generation); } }; @@ -171,17 +179,20 @@ struct ConfigTransactionGetKnobsReply { struct ConfigTransactionGetKnobsRequest { static constexpr FileIdentifier file_identifier = 987410; + CoordinatorsHash coordinatorsHash{ 0 }; ConfigGeneration generation; Optional configClass; ReplyPromise reply; ConfigTransactionGetKnobsRequest() = default; - explicit ConfigTransactionGetKnobsRequest(ConfigGeneration generation, Optional configClass) - : generation(generation), configClass(configClass) {} + explicit ConfigTransactionGetKnobsRequest(CoordinatorsHash coordinatorsHash, + ConfigGeneration generation, + Optional configClass) + : coordinatorsHash(coordinatorsHash), generation(generation), configClass(configClass) {} template void serialize(Ar& ar) { - serializer(ar, generation, configClass, reply); + serializer(ar, coordinatorsHash, generation, configClass, reply); } }; diff --git a/fdbclient/include/fdbclient/ConsistencyScanInterface.actor.h b/fdbclient/include/fdbclient/ConsistencyScanInterface.actor.h new file mode 100644 index 0000000000..7d6529ced0 --- /dev/null +++ b/fdbclient/include/fdbclient/ConsistencyScanInterface.actor.h @@ -0,0 +1,196 @@ +/* + * ConsistencyScanInterface.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_CONSISTENCYSCANINTERFACE_ACTOR_G_H) +#define FDBCLIENT_CONSISTENCYSCANINTERFACE_ACTOR_G_H +#include "fdbclient/ConsistencyScanInterface.actor.g.h" +#elif !defined(FDBCLIENT_CONSISTENCYSCANINTERFACE_ACTOR_H) +#define FDBCLIENT_CONSISTENCYSCANINTERFACE_ACTOR_H + +#include "fdbclient/CommitProxyInterface.h" +#include "fdbclient/DatabaseConfiguration.h" +#include "fdbclient/FDBTypes.h" +#include "fdbclient/RunTransaction.actor.h" +#include "fdbrpc/fdbrpc.h" +#include "fdbrpc/Locality.h" + +#include "flow/actorcompiler.h" // must be last include + +struct ConsistencyScanInterface { + constexpr static FileIdentifier file_identifier = 4983265; + RequestStream> waitFailure; + RequestStream haltConsistencyScan; + struct LocalityData locality; + UID myId; + + ConsistencyScanInterface() {} + explicit ConsistencyScanInterface(const struct LocalityData& l, UID id) : locality(l), myId(id) {} + + void initEndpoints() {} + UID id() const { return myId; } + NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); } + bool operator==(const ConsistencyScanInterface& r) const { return id() == r.id(); } + bool operator!=(const ConsistencyScanInterface& r) const { return !(*this == r); } + + template + void serialize(Archive& ar) { + serializer(ar, waitFailure, haltConsistencyScan, locality, myId); + } +}; + +struct HaltConsistencyScanRequest { + constexpr static FileIdentifier file_identifier = 2323417; + UID requesterID; + ReplyPromise reply; + + HaltConsistencyScanRequest() {} + explicit HaltConsistencyScanRequest(UID uid) : requesterID(uid) {} + + template + void serialize(Ar& ar) { + serializer(ar, requesterID, reply); + } +}; + +// consistency scan configuration and metrics +struct ConsistencyScanInfo { + constexpr static FileIdentifier file_identifier = 732125; + bool consistency_scan_enabled = false; + bool restart = false; + int64_t max_rate = 0; + int64_t target_interval = CLIENT_KNOBS->CONSISTENCY_CHECK_ONE_ROUND_TARGET_COMPLETION_TIME; + int64_t bytes_read_prev_round = 0; + KeyRef progress_key = KeyRef(); + + // Round Metrics - one round of complete validation across all SSs + // Start and finish are in epoch seconds + double last_round_start = 0; + double last_round_finish = 0; + TimerSmoother smoothed_round_duration; + int finished_rounds = 0; + + ConsistencyScanInfo() : smoothed_round_duration(20.0 * 60) {} + ConsistencyScanInfo(bool enabled, bool r, uint64_t rate, uint64_t interval) + : consistency_scan_enabled(enabled), restart(r), max_rate(rate), target_interval(interval), + smoothed_round_duration(20.0 * 60) {} + + template + void serialize(Ar& ar) { + double round_total; + if (!ar.isDeserializing) { + round_total = smoothed_round_duration.getTotal(); + } + serializer(ar, + consistency_scan_enabled, + restart, + max_rate, + target_interval, + bytes_read_prev_round, + last_round_start, + last_round_finish, + round_total, + finished_rounds); + if (ar.isDeserializing) { + smoothed_round_duration.reset(round_total); + } + } + + static Future setInfo(Reference tr, ConsistencyScanInfo info) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + tr->set(consistencyScanInfoKey, ObjectWriter::toValue(info, IncludeVersion())); + return Void(); + } + + static Future setInfo(Database cx, ConsistencyScanInfo info) { + return runRYWTransaction( + cx, [=](Reference tr) -> Future { return setInfo(tr, info); }); + } + + static Future> getInfo(Reference tr) { + 
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + return tr->get(consistencyScanInfoKey); + } + + static Future> getInfo(Database cx) { + return runRYWTransaction( + cx, [=](Reference tr) -> Future> { return getInfo(tr); }); + } + + StatusObject toJSON() const { + StatusObject result; + result["consistency_scan_enabled"] = consistency_scan_enabled; + result["restart"] = restart; + result["max_rate"] = max_rate; + result["target_interval"] = target_interval; + result["bytes_read_prev_round"] = bytes_read_prev_round; + result["last_round_start_datetime"] = epochsToGMTString(last_round_start); + result["last_round_finish_datetime"] = epochsToGMTString(last_round_finish); + result["last_round_start_timestamp"] = last_round_start; + result["last_round_finish_timestamp"] = last_round_finish; + result["smoothed_round_seconds"] = smoothed_round_duration.smoothTotal(); + result["finished_rounds"] = finished_rounds; + return result; + } + + std::string toString() const { + return format("consistency_scan_enabled = %d, restart = %d, max_rate = %ld, target_interval = %ld", + consistency_scan_enabled, + restart, + max_rate, + target_interval); + } +}; + +ACTOR Future getVersion(Database cx); +ACTOR Future getKeyServers( + Database cx, + Promise>>> keyServersPromise, + KeyRangeRef kr, + bool performQuiescentChecks); +ACTOR Future getKeyLocations(Database cx, + std::vector>> shards, + Promise>> keyLocationPromise, + bool performQuiescentChecks); +ACTOR Future checkDataConsistency(Database cx, + VectorRef keyLocations, + DatabaseConfiguration configuration, + std::map tssMapping, + bool performQuiescentChecks, + bool performTSSCheck, + bool firstClient, + bool failureIsError, + int clientId, + int clientCount, + bool distributed, + bool shuffleShards, + int shardSampleFactor, + int64_t sharedRandomNumber, + int64_t repetitions, + int64_t* bytesReadInPreviousRound, + int restart, + int64_t maxRate, + int64_t targetInterval, + KeyRef progressKey); + +#include "flow/unactorcompiler.h" + +#endif // FDBCLIENT_CONSISTENCYSCANINTERFACE_H \ No newline at end of file diff --git a/fdbclient/include/fdbclient/CoordinationInterface.h b/fdbclient/include/fdbclient/CoordinationInterface.h index 7ccaf9170a..c5d53d9ad7 100644 --- a/fdbclient/include/fdbclient/CoordinationInterface.h +++ b/fdbclient/include/fdbclient/CoordinationInterface.h @@ -63,6 +63,8 @@ struct ClientLeaderRegInterface { // - There is no address present more than once class ClusterConnectionString { public: + constexpr static FileIdentifier file_identifier = 13602011; + ClusterConnectionString() {} ClusterConnectionString(const std::string& connectionString); ClusterConnectionString(const std::vector& coordinators, Key key); @@ -84,9 +86,22 @@ public: std::vector coords; std::vector hostnames; + size_t getNumberOfCoordinators() const { return coords.size() + hostnames.size(); } + + bool operator==(const ClusterConnectionString& other) const noexcept { + return key == other.key && keyDesc == other.keyDesc && coords == other.coords && hostnames == other.hostnames; + } + bool operator!=(const ClusterConnectionString& other) const noexcept { return !(*this == other); } + private: void parseConnString(); Key key, keyDesc; + +public: + template + void serialize(Ar& ar) { + serializer(ar, coords, hostnames, key, keyDesc); + } }; FDB_DECLARE_BOOLEAN_PARAM(ConnectionStringNeedsPersisted); @@ -229,6 +244,8 @@ struct GetLeaderRequest { GetLeaderRequest() {} explicit GetLeaderRequest(Key key, UID 
kl) : key(key), knownLeader(kl) {} + bool verify() const { return true; } + template void serialize(Ar& ar) { serializer(ar, key, knownLeader, reply); @@ -249,6 +266,8 @@ struct OpenDatabaseCoordRequest { std::vector coordinators; ReplyPromise> reply; + bool verify() const { return true; } + template void serialize(Ar& ar) { serializer(ar, diff --git a/fdbclient/include/fdbclient/DatabaseConfiguration.h b/fdbclient/include/fdbclient/DatabaseConfiguration.h index 363b48e4b6..f73cd99c9d 100644 --- a/fdbclient/include/fdbclient/DatabaseConfiguration.h +++ b/fdbclient/include/fdbclient/DatabaseConfiguration.h @@ -256,6 +256,8 @@ struct DatabaseConfiguration { bool blobGranulesEnabled; TenantMode tenantMode; + EncryptionAtRestMode encryptionAtRestMode; + // Excluded servers (no state should be here) bool isExcludedServer(NetworkAddressList) const; bool isExcludedLocality(const LocalityData& locality) const; diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index 27e499d607..8ec21c5472 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ b/fdbclient/include/fdbclient/DatabaseContext.h @@ -21,10 +21,12 @@ #ifndef DatabaseContext_h #define DatabaseContext_h #include "fdbclient/Notified.h" +#include "flow/ApiVersion.h" #include "flow/FastAlloc.h" #include "flow/FastRef.h" #include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/StorageServerInterface.h" +#include "flow/IRandom.h" #include "flow/genericactors.actor.h" #include #include @@ -167,10 +169,11 @@ struct ChangeFeedStorageData : ReferenceCounted { Future updater; NotifiedVersion version; NotifiedVersion desired; - Promise destroyed; UID interfToken; + DatabaseContext* context; + double created; - ~ChangeFeedStorageData() { destroyed.send(Void()); } + ~ChangeFeedStorageData(); }; struct ChangeFeedData : ReferenceCounted { @@ -180,6 +183,8 @@ struct ChangeFeedData : ReferenceCounted { Version getVersion(); Future whenAtLeast(Version version); + UID dbgid; + DatabaseContext* context; NotifiedVersion lastReturnedVersion; std::vector> storageData; AsyncVar notAtLatest; @@ -188,8 +193,10 @@ struct ChangeFeedData : ReferenceCounted { Version endVersion = invalidVersion; Version popVersion = invalidVersion; // like TLog pop version, set by SS and client can check it to see if they missed data + double created = 0; - ChangeFeedData() : notAtLatest(1) {} + explicit ChangeFeedData(DatabaseContext* context = nullptr); + ~ChangeFeedData(); }; struct EndpointFailureInfo { @@ -207,6 +214,16 @@ struct KeyRangeLocationInfo { : tenantEntry(tenantEntry), range(range), locations(locations) {} }; +struct OverlappingChangeFeedsInfo { + Arena arena; + VectorRef feeds; + // would prefer to use key range map but it complicates copy/move constructors + std::vector> feedMetadataVersions; + + // for a feed that wasn't present, returns the metadata version it would have been fetched at. 
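// --- Editor's note: illustrative sketch only, not part of this patch. ---
// One plausible use of the accessor declared just below: after calling the
// getOverlappingChangeFeeds() API added to DatabaseContext in this change, a caller that does
// not find a feed of interest in `feeds` can still ask at what metadata version that absence
// was observed. The names db, range, minVersion and feedRange are hypothetical placeholders:
//
//   OverlappingChangeFeedsInfo info = wait(db->getOverlappingChangeFeeds(range, minVersion));
//   // feed not listed in info.feeds => learn the version its metadata would have been read at
//   Version fetchedAt = info.getFeedMetadataVersion(feedRange);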
+ Version getFeedMetadataVersion(const KeyRangeRef& feedRange) const; +}; + class DatabaseContext : public ReferenceCounted, public FastAllocated, NonCopyable { public: static DatabaseContext* allocateOnForeignThread() { @@ -221,7 +238,7 @@ public: EnableLocalityLoadBalance, TaskPriority taskID = TaskPriority::DefaultEndpoint, LockAware = LockAware::False, - int apiVersion = Database::API_VERSION_LATEST, + int _apiVersion = ApiVersion::LATEST_VERSION, IsSwitchable = IsSwitchable::False); ~DatabaseContext(); @@ -237,7 +254,7 @@ public: enableLocalityLoadBalance, lockAware, internal, - apiVersion, + apiVersion.version(), switchable, defaultTenant)); cx->globalConfig->init(Reference const>(cx->clientInfo), @@ -245,16 +262,16 @@ public: return cx; } - Optional getCachedLocation(const Optional& tenant, + Optional getCachedLocation(const Optional& tenant, const KeyRef&, Reverse isBackward = Reverse::False); - bool getCachedLocations(const Optional& tenant, + bool getCachedLocations(const Optional& tenant, const KeyRangeRef&, std::vector&, int limit, Reverse reverse); void cacheTenant(const TenantName& tenant, const TenantMapEntry& tenantEntry); - Reference setCachedLocation(const Optional& tenant, + Reference setCachedLocation(const Optional& tenant, const TenantMapEntry& tenantEntry, const KeyRangeRef&, const std::vector&); @@ -328,7 +345,7 @@ public: } } - int apiVersionAtLeast(int minVersion) const { return apiVersion < 0 || apiVersion >= minVersion; } + int apiVersionAtLeast(int minVersion) const { return apiVersion.version() >= minVersion; } Future onConnected(); // Returns after a majority of coordination servers are available and have reported a // leader. The cluster file therefore is valid, but the database might be unavailable. @@ -361,15 +378,21 @@ public: int replyBufferSize = -1, bool canReadPopped = true); - Future> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion); + Future getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion); Future popChangeFeedMutations(Key rangeID, Version version); + // BlobGranule API. 
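// --- Editor's note: illustrative sketch only, not part of this patch. ---
// A plausible lifecycle for the blob granule management calls declared just below
// (blobbifyRange, unblobbifyRange, listBlobbifiedRanges and verifyBlobRange are new in this
// change). The names db, range, readVersion and purgeVersion are hypothetical placeholders and
// error handling is omitted:
//
//   bool blobbified = wait(db->blobbifyRange(range));          // start maintaining granules
//   Version v = wait(db->verifyBlobRange(range, readVersion)); // check granules are readable
//   Key purgeKey = wait(db->purgeBlobGranules(range, purgeVersion, {}, false));
//   wait(db->waitPurgeGranulesComplete(purgeKey));             // purge history up to purgeVersion
//   bool unblobbified = wait(db->unblobbifyRange(range));      // stop maintaining granules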
Future purgeBlobGranules(KeyRange keyRange, Version purgeVersion, Optional tenant, bool force = false); Future waitPurgeGranulesComplete(Key purgeKey); + Future blobbifyRange(KeyRange range); + Future unblobbifyRange(KeyRange range); + Future>> listBlobbifiedRanges(KeyRange range, int rangeLimit); + Future verifyBlobRange(const KeyRange& range, Optional version); + // private: explicit DatabaseContext(Reference>> connectionRecord, Reference> clientDBInfo, @@ -380,7 +403,7 @@ public: EnableLocalityLoadBalance, LockAware, IsInternal = IsInternal::True, - int apiVersion = Database::API_VERSION_LATEST, + int _apiVersion = ApiVersion::LATEST_VERSION, IsSwitchable = IsSwitchable::False, Optional defaultTenant = Optional()); @@ -457,9 +480,12 @@ public: std::unordered_map> tssMetrics; // map from changeFeedId -> changeFeedRange std::unordered_map changeFeedCache; - std::unordered_map> changeFeedUpdaters; + std::unordered_map changeFeedUpdaters; + std::map notAtLatestChangeFeeds; Reference getStorageData(StorageServerInterface interf); + Version getMinimumChangeFeedVersion(); + void setDesiredChangeFeedVersion(Version v); // map from ssid -> ss tag // @note this map allows the client to identify the latest commit versions @@ -517,6 +543,19 @@ public: Counter transactionsExpensiveClearCostEstCount; Counter transactionGrvFullBatches; Counter transactionGrvTimedOutBatches; + Counter transactionCommitVersionNotFoundForSS; + Counter bgReadInputBytes; + Counter bgReadOutputBytes; + + // Change Feed metrics. Omit change feed metrics from logging if not used + bool usedAnyChangeFeeds; + CounterCollection ccFeed; + Counter feedStreamStarts; + Counter feedMergeStreamStarts; + Counter feedErrors; + Counter feedNonRetriableErrors; + Counter feedPops; + Counter feedPopsFallback; ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit, bgLatencies, bgGranulesPerRequest; @@ -569,7 +608,7 @@ public: Future statusLeaderMon; double lastStatusFetch; - int apiVersion; + ApiVersion apiVersion; int mvCacheInsertLocation; std::vector>> metadataVersionCache; @@ -638,14 +677,18 @@ private: // Similar to tr.onError(), but doesn't require a DatabaseContext. 
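// --- Editor's note: a minimal, self-contained model (not part of this patch) of the capped,
// jittered exponential backoff implemented by the Backoff struct declared just below.
// growthRate and maxBackoff stand in for CLIENT_KNOBS->BACKOFF_GROWTH_RATE and the new
// per-instance maxBackoff; the real struct returns a delay() Future instead of a double.
#include <algorithm>
#include <random>

inline double nextBackoffDelay(double& backoff, double growthRate, double maxBackoff) {
    static std::mt19937_64 rng{ std::random_device{}() };
    std::uniform_real_distribution<double> random01(0.0, 1.0);
    double delaySeconds = backoff * random01(rng); // jitter: sleep a random fraction of the budget
    backoff = std::min(backoff * growthRate, maxBackoff); // then grow the budget, capped
    return delaySeconds;
}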
struct Backoff { + Backoff(double backoff = CLIENT_KNOBS->DEFAULT_BACKOFF, double maxBackoff = CLIENT_KNOBS->DEFAULT_MAX_BACKOFF) + : backoff(backoff), maxBackoff(maxBackoff) {} + Future onError() { double currentBackoff = backoff; - backoff = std::min(backoff * CLIENT_KNOBS->BACKOFF_GROWTH_RATE, CLIENT_KNOBS->DEFAULT_MAX_BACKOFF); + backoff = std::min(backoff * CLIENT_KNOBS->BACKOFF_GROWTH_RATE, maxBackoff); return delay(currentBackoff * deterministicRandom()->random01()); } private: - double backoff = CLIENT_KNOBS->DEFAULT_BACKOFF; + double backoff; + double maxBackoff; }; #endif diff --git a/fdbserver/include/fdbserver/EncryptKeyProxyInterface.h b/fdbclient/include/fdbclient/EncryptKeyProxyInterface.h similarity index 95% rename from fdbserver/include/fdbserver/EncryptKeyProxyInterface.h rename to fdbclient/include/fdbclient/EncryptKeyProxyInterface.h index 12178b11ab..5f4d56eb96 100644 --- a/fdbserver/include/fdbserver/EncryptKeyProxyInterface.h +++ b/fdbclient/include/fdbclient/EncryptKeyProxyInterface.h @@ -132,7 +132,7 @@ struct EKPGetBaseCipherKeysByIdsReply { template void serialize(Ar& ar) { - serializer(ar, arena, baseCipherDetails, numHits, error); + serializer(ar, baseCipherDetails, numHits, error, arena); } }; @@ -144,10 +144,10 @@ struct EKPGetBaseCipherKeysRequestInfo { EncryptCipherBaseKeyId baseCipherId; // Encryption domain name - ancillairy metadata information, an encryption key should be uniquely identified by // {domainId, cipherBaseId} tuple - EncryptCipherDomainName domainName; + EncryptCipherDomainNameRef domainName; EKPGetBaseCipherKeysRequestInfo() - : domainId(ENCRYPT_INVALID_DOMAIN_ID), baseCipherId(ENCRYPT_INVALID_CIPHER_KEY_ID) {} + : domainId(INVALID_ENCRYPT_DOMAIN_ID), baseCipherId(INVALID_ENCRYPT_CIPHER_KEY_ID) {} EKPGetBaseCipherKeysRequestInfo(const EncryptCipherDomainId dId, const EncryptCipherBaseKeyId bCId, StringRef name, @@ -176,7 +176,7 @@ struct EKPGetBaseCipherKeysByIdsRequest { template void serialize(Ar& ar) { - serializer(ar, arena, baseCipherInfos, debugId, reply); + serializer(ar, baseCipherInfos, debugId, reply, arena); } }; @@ -193,7 +193,7 @@ struct EKPGetLatestBaseCipherKeysReply { template void serialize(Ar& ar) { - serializer(ar, arena, baseCipherDetails, numHits, error); + serializer(ar, baseCipherDetails, numHits, error, arena); } }; @@ -203,9 +203,9 @@ struct EKPGetLatestCipherKeysRequestInfo { EncryptCipherDomainId domainId; // Encryption domain name - ancillairy metadata information, an encryption key should be uniquely identified by // {domainId, cipherBaseId} tuple - EncryptCipherDomainName domainName; + EncryptCipherDomainNameRef domainName; - EKPGetLatestCipherKeysRequestInfo() : domainId(ENCRYPT_INVALID_DOMAIN_ID) {} + EKPGetLatestCipherKeysRequestInfo() : domainId(INVALID_ENCRYPT_DOMAIN_ID) {} EKPGetLatestCipherKeysRequestInfo(const EncryptCipherDomainId dId, StringRef name, Arena& arena) : domainId(dId), domainName(StringRef(arena, name)) {} @@ -239,7 +239,7 @@ struct EKPGetLatestBaseCipherKeysRequest { template void serialize(Ar& ar) { - serializer(ar, arena, encryptDomainInfos, debugId, reply); + serializer(ar, encryptDomainInfos, debugId, reply, arena); } }; diff --git a/fdbclient/include/fdbclient/EventTypes.actor.h b/fdbclient/include/fdbclient/EventTypes.actor.h index 39a75e09dc..dc946ce42e 100644 --- a/fdbclient/include/fdbclient/EventTypes.actor.h +++ b/fdbclient/include/fdbclient/EventTypes.actor.h @@ -26,7 +26,7 @@ #define FDBCLIENT_EVENTTYPES_ACTOR_G_H #include "fdbclient/EventTypes.actor.g.h" #elif 
!defined(FDBCLIENT_EVENTTYPES_ACTOR_H) -#define FDBCLIENT_EVENTTYPESS_ACTOR_H +#define FDBCLIENT_EVENTTYPES_ACTOR_H #include "flow/flow.h" #include "flow/TDMetric.actor.h" diff --git a/fdbclient/include/fdbclient/FDBTypes.h b/fdbclient/include/fdbclient/FDBTypes.h index 15cd4fc527..4da1d4f72b 100644 --- a/fdbclient/include/fdbclient/FDBTypes.h +++ b/fdbclient/include/fdbclient/FDBTypes.h @@ -41,6 +41,7 @@ typedef StringRef KeyRef; typedef StringRef ValueRef; typedef int64_t Generation; typedef UID SpanID; +typedef uint64_t CoordinatorsHash; enum { tagLocalitySpecial = -1, // tag with this locality means it is invalidTag (id=0), txsTag (id=1), or cacheTag (id=2) @@ -331,6 +332,22 @@ struct KeyRangeRef { bool empty() const { return begin == end; } bool singleKeyRange() const { return equalsKeyAfter(begin, end); } + // Return true if it's fully covered by given range list. Note that ranges should be sorted + bool isCovered(std::vector& ranges) { + ASSERT(std::is_sorted(ranges.begin(), ranges.end(), KeyRangeRef::ArbitraryOrder())); + KeyRangeRef clone(begin, end); + for (auto r : ranges) { + if (begin < r.begin) + return false; // uncovered gap between clone.begin and r.begin + if (end <= r.end) + return true; // range is fully covered + if (end > r.begin) + // {clone.begin, r.end} is covered. need to check coverage for {r.end, clone.end} + clone = KeyRangeRef(r.end, clone.end); + } + return false; + } + Standalone withPrefix(const StringRef& prefix) const { return KeyRangeRef(begin.withPrefix(prefix), end.withPrefix(prefix)); } @@ -500,10 +517,36 @@ using KeySelector = Standalone; using RangeResult = Standalone; using MappedRangeResult = Standalone; +namespace std { +template <> +struct hash { + static constexpr std::hash hashFunc{}; + std::size_t operator()(KeyRangeRef const& range) const { + std::size_t seed = 0; + boost::hash_combine(seed, hashFunc(range.begin)); + boost::hash_combine(seed, hashFunc(range.end)); + return seed; + } +}; +} // namespace std + +namespace std { +template <> +struct hash { + static constexpr std::hash hashFunc{}; + std::size_t operator()(KeyRangeRef const& range) const { + std::size_t seed = 0; + boost::hash_combine(seed, hashFunc(range.begin)); + boost::hash_combine(seed, hashFunc(range.end)); + return seed; + } +}; +} // namespace std + enum { invalidVersion = -1, latestVersion = -2, MAX_VERSION = std::numeric_limits::max() }; inline Key keyAfter(const KeyRef& key) { - if (key == LiteralStringRef("\xff\xff")) + if (key == "\xff\xff"_sr) return key; Standalone r; @@ -516,7 +559,7 @@ inline Key keyAfter(const KeyRef& key) { return r; } inline KeyRef keyAfter(const KeyRef& key, Arena& arena) { - if (key == LiteralStringRef("\xff\xff")) + if (key == "\xff\xff"_sr) return key; uint8_t* t = new (arena) uint8_t[key.size() + 1]; memcpy(t, key.begin(), key.size()); @@ -931,17 +974,17 @@ struct TLogVersion { } static ErrorOr FromStringRef(StringRef s) { - if (s == LiteralStringRef("2")) + if (s == "2"_sr) return V2; - if (s == LiteralStringRef("3")) + if (s == "3"_sr) return V3; - if (s == LiteralStringRef("4")) + if (s == "4"_sr) return V4; - if (s == LiteralStringRef("5")) + if (s == "5"_sr) return V5; - if (s == LiteralStringRef("6")) + if (s == "6"_sr) return V6; - if (s == LiteralStringRef("7")) + if (s == "7"_sr) return V7; return default_error_or(); } @@ -991,9 +1034,9 @@ struct TLogSpillType { } static ErrorOr FromStringRef(StringRef s) { - if (s == LiteralStringRef("1")) + if (s == "1"_sr) return VALUE; - if (s == LiteralStringRef("2")) + if (s == "2"_sr) 
return REFERENCE; return default_error_or(); } @@ -1392,6 +1435,60 @@ struct TenantMode { uint32_t mode; }; +struct EncryptionAtRestMode { + // These enumerated values are stored in the database configuration, so can NEVER be changed. Only add new ones + // just before END. + enum Mode { DISABLED = 0, AES_256_CTR = 1, END = 2 }; + + EncryptionAtRestMode() : mode(DISABLED) {} + EncryptionAtRestMode(Mode mode) : mode(mode) { + if ((uint32_t)mode >= END) { + this->mode = DISABLED; + } + } + operator Mode() const { return Mode(mode); } + + template + void serialize(Ar& ar) { + serializer(ar, mode); + } + + std::string toString() const { + switch (mode) { + case DISABLED: + return "disabled"; + case AES_256_CTR: + return "aes_256_ctr"; + default: + ASSERT(false); + } + return ""; + } + + Value toValue() const { return ValueRef(format("%d", (int)mode)); } + + static EncryptionAtRestMode fromValue(Optional val) { + if (!val.present()) { + return DISABLED; + } + + // A failed parsing returns 0 (DISABLED) + int num = atoi(val.get().toString().c_str()); + if (num < 0 || num >= END) { + return DISABLED; + } + + return static_cast(num); + } + + uint32_t mode; +}; + +typedef StringRef ClusterNameRef; +typedef Standalone ClusterName; + +enum class ClusterType { STANDALONE, METACLUSTER_MANAGEMENT, METACLUSTER_DATA }; + struct GRVCacheSpace { Version cachedReadVersion; double lastGrvTime; @@ -1413,7 +1510,7 @@ struct DatabaseSharedState { std::atomic refCount; DatabaseSharedState() - : protocolVersion(currentProtocolVersion), mutexLock(Mutex()), grvCacheSpace(GRVCacheSpace()), refCount(0) {} + : protocolVersion(currentProtocolVersion()), mutexLock(Mutex()), grvCacheSpace(GRVCacheSpace()), refCount(0) {} }; inline bool isValidPerpetualStorageWiggleLocality(std::string locality) { @@ -1460,7 +1557,7 @@ struct StorageMetadataType { bool wrongConfigured = false; StorageMetadataType() : createdTime(0) {} - StorageMetadataType(uint64_t t, KeyValueStoreType storeType = KeyValueStoreType::END, bool wrongConfigured = false) + StorageMetadataType(double t, KeyValueStoreType storeType = KeyValueStoreType::END, bool wrongConfigured = false) : createdTime(t), storeType(storeType), wrongConfigured(wrongConfigured) {} static double currentTime() { return g_network->timer(); } @@ -1510,6 +1607,42 @@ struct StorageWiggleValue { } }; +enum class ReadType { + EAGER, + FETCH, + LOW, + NORMAL, + HIGH, +}; + +FDB_DECLARE_BOOLEAN_PARAM(CacheResult); + +// store options for storage engine read +// ReadType describes the usage and priority of the read +// cacheResult determines whether the storage engine cache for this read +// consistencyCheckStartVersion indicates the consistency check which began at this version +// debugID helps to trace the path of the read +struct ReadOptions { + ReadType type; + // Once CacheResult is serializable, change type from bool to CacheResult + bool cacheResult; + Optional debugID; + Optional consistencyCheckStartVersion; + + ReadOptions() : type(ReadType::NORMAL), cacheResult(CacheResult::True){}; + + ReadOptions(Optional debugID, + ReadType type = ReadType::NORMAL, + CacheResult cache = CacheResult::False, + Optional version = Optional()) + : type(type), cacheResult(cache), debugID(debugID), consistencyCheckStartVersion(version){}; + + template + void serialize(Ar& ar) { + serializer(ar, type, cacheResult, debugID, consistencyCheckStartVersion); + } +}; + // Can be used to identify types (e.g. 
IDatabase) that can be used to create transactions with a `createTransaction` // function template diff --git a/fdbclient/include/fdbclient/GenericManagementAPI.actor.h b/fdbclient/include/fdbclient/GenericManagementAPI.actor.h index 8fe0d08fd2..21ebbd3a3c 100644 --- a/fdbclient/include/fdbclient/GenericManagementAPI.actor.h +++ b/fdbclient/include/fdbclient/GenericManagementAPI.actor.h @@ -39,6 +39,7 @@ the contents of the system key space. #include "fdbclient/Status.h" #include "fdbclient/Subspace.h" #include "fdbclient/DatabaseConfiguration.h" +#include "fdbclient/Metacluster.h" #include "fdbclient/Status.h" #include "fdbclient/SystemData.h" #include "flow/actorcompiler.h" // has to be last include @@ -69,6 +70,8 @@ enum class ConfigurationResult { SUCCESS_WARN_SHARDED_ROCKSDB_EXPERIMENTAL, DATABASE_CREATED_WARN_ROCKSDB_EXPERIMENTAL, DATABASE_CREATED_WARN_SHARDED_ROCKSDB_EXPERIMENTAL, + DATABASE_IS_REGISTERED, + ENCRYPTION_AT_REST_MODE_ALREADY_SET }; enum class CoordinatorsResult { @@ -246,7 +249,7 @@ Future> getWorkers(Reference tr, // Accepts a full configuration in key/value format (from buildConfiguration) ACTOR template Future changeConfig(Reference db, std::map m, bool force) { - state StringRef initIdKey = LiteralStringRef("\xff/init_id"); + state StringRef initIdKey = "\xff/init_id"_sr; state Reference tr = db->createTransaction(); if (!m.size()) { @@ -272,6 +275,9 @@ Future changeConfig(Reference db, std::map tooLong = delay(60); @@ -475,6 +481,14 @@ Future changeConfig(Reference db, std::map metaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + if (metaclusterRegistration.present()) { + return ConfigurationResult::DATABASE_IS_REGISTERED; + } + } } } if (creating) { @@ -491,8 +505,8 @@ Future changeConfig(Reference db, std::mapatomicOp(databaseLockedKey, BinaryWriter::toValue(locked.get(), Unversioned()) - .withPrefix(LiteralStringRef("0123456789")) - .withSuffix(LiteralStringRef("\x00\x00\x00\x00")), + .withPrefix("0123456789"_sr) + .withSuffix("\x00\x00\x00\x00"_sr), MutationRef::SetVersionstampedValue); } @@ -636,7 +650,7 @@ Future changeConfig(Reference db, std::vector const& modes, Optional const& conf, bool force) { - if (modes.size() && modes[0] == LiteralStringRef("auto") && conf.present()) { + if (modes.size() && modes[0] == "auto"_sr && conf.present()) { return autoConfig(db, conf.get()); } diff --git a/fdbserver/GetEncryptCipherKeys.actor.cpp b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h similarity index 56% rename from fdbserver/GetEncryptCipherKeys.actor.cpp rename to fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h index 6b77d24d03..0f93675a6a 100644 --- a/fdbserver/GetEncryptCipherKeys.actor.cpp +++ b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h @@ -1,5 +1,5 @@ /* - * GetEncryptCipherKeys.actor.cpp + * GetEncryptCipherKeys.actor.h * * This source file is part of the FoundationDB open source project * @@ -17,20 +17,31 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#pragma once +#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_GETCIPHERKEYS_ACTOR_G_H) +#define FDBCLIENT_GETCIPHERKEYS_ACTOR_G_H +#include "fdbclient/GetEncryptCipherKeys.actor.g.h" +#elif !defined(FDBCLIENT_GETCIPHERKEYS_ACTOR_H) +#define FDBCLIENT_GETCIPHERKEYS_ACTOR_H -#include "fdbserver/GetEncryptCipherKeys.actor.h" +#include "fdbclient/BlobCipher.h" +#include "fdbclient/EncryptKeyProxyInterface.h" +#include "fdbrpc/Stats.h" +#include "flow/Knobs.h" +#include "flow/IRandom.h" -#include +#include +#include -#include "flow/actorcompiler.h" // has to be last include +#include "flow/actorcompiler.h" // This must be the last #include. -namespace { - -Optional getEncryptKeyProxyId(const Reference const>& db) { - return db->get().encryptKeyProxy.map([](EncryptKeyProxyInterface proxy) { return proxy.id(); }); +template +Optional getEncryptKeyProxyId(const Reference const>& db) { + return db->get().encryptKeyProxy.template map([](EncryptKeyProxyInterface proxy) { return proxy.id(); }); } -ACTOR Future onEncryptKeyProxyChange(Reference const> db) { +ACTOR template +Future onEncryptKeyProxyChange(Reference const> db) { state Optional previousProxyId = getEncryptKeyProxyId(db); state Optional currentProxyId; loop { @@ -46,9 +57,9 @@ ACTOR Future onEncryptKeyProxyChange(Reference cons return Void(); } -ACTOR Future getUncachedLatestEncryptCipherKeys( - Reference const> db, - EKPGetLatestBaseCipherKeysRequest request) { +ACTOR template +Future getUncachedLatestEncryptCipherKeys(Reference const> db, + EKPGetLatestBaseCipherKeysRequest request) { Optional proxy = db->get().encryptKeyProxy; if (!proxy.present()) { // Wait for onEncryptKeyProxyChange. @@ -73,11 +84,14 @@ ACTOR Future getUncachedLatestEncryptCipherKeys } } -} // anonymous namespace - -ACTOR Future>> getLatestEncryptCipherKeys( - Reference const> db, - std::unordered_map domains) { +// Get latest cipher keys for given encryption domains. It tries to get the cipher keys from local cache. +// In case of cache miss, it fetches the cipher keys from EncryptKeyProxy and put the result in the local cache +// before return. +ACTOR template +Future>> getLatestEncryptCipherKeys( + Reference const> db, + std::unordered_map domains, + BlobCipherMetrics::UsageType usageType) { state Reference cipherKeyCache = BlobCipherKeyCache::getInstance(); state std::unordered_map> cipherKeys; state EKPGetLatestBaseCipherKeysRequest request; @@ -101,14 +115,19 @@ ACTOR Future> return cipherKeys; } // Fetch any uncached cipher keys. + state double startTime = now(); loop choose { when(EKPGetLatestBaseCipherKeysReply reply = wait(getUncachedLatestEncryptCipherKeys(db, request))) { // Insert base cipher keys into cache and construct result. for (const EKPBaseCipherDetails& details : reply.baseCipherDetails) { EncryptCipherDomainId domainId = details.encryptDomainId; if (domains.count(domainId) > 0 && cipherKeys.count(domainId) == 0) { - Reference cipherKey = cipherKeyCache->insertCipherKey( - domainId, details.baseCipherId, details.baseCipherKey.begin(), details.baseCipherKey.size()); + Reference cipherKey = cipherKeyCache->insertCipherKey(domainId, + details.baseCipherId, + details.baseCipherKey.begin(), + details.baseCipherKey.size(), + details.refreshAt, + details.expireAt); ASSERT(cipherKey.isValid()); cipherKeys[domainId] = cipherKey; } @@ -125,13 +144,30 @@ ACTOR Future> // In case encryptKeyProxy has changed, retry the request. 
when(wait(onEncryptKeyProxyChange(db))) {} } + double elapsed = now() - startTime; + BlobCipherMetrics::getInstance()->getLatestCipherKeysLatency.addMeasurement(elapsed); + BlobCipherMetrics::counters(usageType).getLatestCipherKeysLatency.addMeasurement(elapsed); return cipherKeys; } -namespace { +// Get latest cipher key for given a encryption domain. It tries to get the cipher key from the local cache. +// In case of cache miss, it fetches the cipher key from EncryptKeyProxy and put the result in the local cache +// before return. +ACTOR template +Future> getLatestEncryptCipherKey(Reference const> db, + EncryptCipherDomainId domainId, + EncryptCipherDomainName domainName, + BlobCipherMetrics::UsageType usageType) { + std::unordered_map domains({ { domainId, domainName } }); + std::unordered_map> cipherKey = + wait(getLatestEncryptCipherKeys(db, domains, usageType)); -ACTOR Future getUncachedEncryptCipherKeys(Reference const> db, - EKPGetBaseCipherKeysByIdsRequest request) { + return cipherKey.at(domainId); +} + +ACTOR template +Future getUncachedEncryptCipherKeys(Reference const> db, + EKPGetBaseCipherKeysByIdsRequest request) { Optional proxy = db->get().encryptKeyProxy; if (!proxy.present()) { // Wait for onEncryptKeyProxyChange. @@ -158,11 +194,14 @@ ACTOR Future getUncachedEncryptCipherKeys(Refere using BaseCipherIndex = std::pair; -} // anonymous namespace - -ACTOR Future>> getEncryptCipherKeys( - Reference const> db, - std::unordered_set cipherDetails) { +// Get cipher keys specified by the list of cipher details. It tries to get the cipher keys from local cache. +// In case of cache miss, it fetches the cipher keys from EncryptKeyProxy and put the result in the local cache +// before return. +ACTOR template +Future>> getEncryptCipherKeys( + Reference const> db, + std::unordered_set cipherDetails, + BlobCipherMetrics::UsageType usageType) { state Reference cipherKeyCache = BlobCipherKeyCache::getInstance(); state std::unordered_map> cipherKeys; state std::unordered_set> uncachedBaseCipherIds; @@ -191,12 +230,13 @@ ACTOR Future>> ge id.first /*domainId*/, id.second /*baseCipherId*/, StringRef() /*domainName*/, request.arena); } // Fetch any uncached cipher keys. + state double startTime = now(); loop choose { when(EKPGetBaseCipherKeysByIdsReply reply = wait(getUncachedEncryptCipherKeys(db, request))) { - std::unordered_map> baseCipherKeys; + std::unordered_map> baseCipherKeys; for (const EKPBaseCipherDetails& baseDetails : reply.baseCipherDetails) { BaseCipherIndex baseIdx = std::make_pair(baseDetails.encryptDomainId, baseDetails.baseCipherId); - baseCipherKeys[baseIdx] = baseDetails.baseCipherKey; + baseCipherKeys[baseIdx] = baseDetails; } // Insert base cipher keys into cache and construct result. for (const BlobCipherDetails& details : cipherDetails) { @@ -213,9 +253,11 @@ ACTOR Future>> ge } Reference cipherKey = cipherKeyCache->insertCipherKey(details.encryptDomainId, details.baseCipherId, - itr->second.begin(), - itr->second.size(), - details.salt); + itr->second.baseCipherKey.begin(), + itr->second.baseCipherKey.size(), + details.salt, + itr->second.refreshAt, + itr->second.expireAt); ASSERT(cipherKey.isValid()); cipherKeys[details] = cipherKey; } @@ -224,30 +266,49 @@ ACTOR Future>> ge // In case encryptKeyProxy has changed, retry the request. 
when(wait(onEncryptKeyProxyChange(db))) {} } + double elapsed = now() - startTime; + BlobCipherMetrics::getInstance()->getCipherKeysLatency.addMeasurement(elapsed); + BlobCipherMetrics::counters(usageType).getCipherKeysLatency.addMeasurement(elapsed); return cipherKeys; } -ACTOR Future getLatestSystemEncryptCipherKeys(Reference const> db) { - static std::unordered_map domains = { - { SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME }, - { ENCRYPT_HEADER_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME } - }; +struct TextAndHeaderCipherKeys { + Reference cipherTextKey; + Reference cipherHeaderKey; +}; + +ACTOR template +Future getLatestEncryptCipherKeysForDomain(Reference const> db, + EncryptCipherDomainId domainId, + EncryptCipherDomainName domainName, + BlobCipherMetrics::UsageType usageType) { + std::unordered_map domains; + domains[domainId] = domainName; + domains[ENCRYPT_HEADER_DOMAIN_ID] = FDB_ENCRYPT_HEADER_DOMAIN_NAME; std::unordered_map> cipherKeys = - wait(getLatestEncryptCipherKeys(db, domains)); - ASSERT(cipherKeys.count(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) > 0); + wait(getLatestEncryptCipherKeys(db, domains, usageType)); + ASSERT(cipherKeys.count(domainId) > 0); ASSERT(cipherKeys.count(ENCRYPT_HEADER_DOMAIN_ID) > 0); - TextAndHeaderCipherKeys result{ cipherKeys.at(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID), - cipherKeys.at(ENCRYPT_HEADER_DOMAIN_ID) }; + TextAndHeaderCipherKeys result{ cipherKeys.at(domainId), cipherKeys.at(ENCRYPT_HEADER_DOMAIN_ID) }; ASSERT(result.cipherTextKey.isValid()); ASSERT(result.cipherHeaderKey.isValid()); return result; } -ACTOR Future getEncryptCipherKeys(Reference const> db, - BlobCipherEncryptHeader header) { +template +Future getLatestSystemEncryptCipherKeys(const Reference const>& db, + BlobCipherMetrics::UsageType usageType) { + return getLatestEncryptCipherKeysForDomain( + db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME, usageType); +} + +ACTOR template +Future getEncryptCipherKeys(Reference const> db, + BlobCipherEncryptHeader header, + BlobCipherMetrics::UsageType usageType) { std::unordered_set cipherDetails{ header.cipherTextDetails, header.cipherHeaderDetails }; std::unordered_map> cipherKeys = - wait(getEncryptCipherKeys(db, cipherDetails)); + wait(getEncryptCipherKeys(db, cipherDetails, usageType)); ASSERT(cipherKeys.count(header.cipherTextDetails) > 0); ASSERT(cipherKeys.count(header.cipherHeaderDetails) > 0); TextAndHeaderCipherKeys result{ cipherKeys.at(header.cipherTextDetails), @@ -256,3 +317,6 @@ ACTOR Future getEncryptCipherKeys(Reference migrate(GlobalConfig* self); - ACTOR static Future refresh(GlobalConfig* self); + ACTOR static Future refresh(GlobalConfig* self, Version lastKnown); ACTOR static Future updater(GlobalConfig* self, const ClientDBInfo* dbInfo); DatabaseContext* cx; diff --git a/fdbclient/include/fdbclient/GrvProxyInterface.h b/fdbclient/include/fdbclient/GrvProxyInterface.h index 4098a88d2c..5d1cec15b2 100644 --- a/fdbclient/include/fdbclient/GrvProxyInterface.h +++ b/fdbclient/include/fdbclient/GrvProxyInterface.h @@ -1,4 +1,3 @@ - /* * GrvProxyInterface.h * @@ -26,8 +25,9 @@ #include "fdbrpc/fdbrpc.h" #include "fdbclient/FDBTypes.h" -// GrvProxy is proxy primarily specializing on serving GetReadVersion. It also serves health metrics since it -// communicates with RateKeeper to gather health information of the cluster. +// GrvProxy is proxy primarily specializing on serving GetReadVersion. 
It also +// serves health metrics since it communicates with RateKeeper to gather health +// information of the cluster, and handles proxied GlobalConfig requests. struct GrvProxyInterface { constexpr static FileIdentifier file_identifier = 8743216; enum { LocationAwareLoadBalance = 1 }; @@ -43,6 +43,7 @@ struct GrvProxyInterface { // committed) RequestStream> waitFailure; // reports heartbeat to master. RequestStream getHealthMetrics; + RequestStream refreshGlobalConfig; UID id() const { return getConsistentReadVersion.getEndpoint().token; } std::string toString() const { return id().shortString(); } @@ -59,6 +60,8 @@ struct GrvProxyInterface { RequestStream>(getConsistentReadVersion.getEndpoint().getAdjustedEndpoint(1)); getHealthMetrics = RequestStream( getConsistentReadVersion.getEndpoint().getAdjustedEndpoint(2)); + refreshGlobalConfig = RequestStream( + getConsistentReadVersion.getEndpoint().getAdjustedEndpoint(3)); } } @@ -67,6 +70,7 @@ struct GrvProxyInterface { streams.push_back(getConsistentReadVersion.getReceiver(TaskPriority::ReadSocket)); streams.push_back(waitFailure.getReceiver()); streams.push_back(getHealthMetrics.getReceiver()); + streams.push_back(refreshGlobalConfig.getReceiver()); FlowTransport::transport().addEndpoints(streams); } }; diff --git a/fdbclient/include/fdbclient/HighContentionPrefixAllocator.actor.h b/fdbclient/include/fdbclient/HighContentionPrefixAllocator.actor.h index 57e185cd78..20ec964436 100644 --- a/fdbclient/include/fdbclient/HighContentionPrefixAllocator.actor.h +++ b/fdbclient/include/fdbclient/HighContentionPrefixAllocator.actor.h @@ -134,7 +134,7 @@ private: if (!candidateValueFuture.get().present()) { tr->addWriteConflictRange(singleKeyRange(self->recent.get(candidate).key())); - return Tuple().append(candidate).pack(); + return Tuple::makeTuple(candidate).pack(); } } } diff --git a/fdbclient/include/fdbclient/IClientApi.h b/fdbclient/include/fdbclient/IClientApi.h index 73e743d060..10d777a537 100644 --- a/fdbclient/include/fdbclient/IClientApi.h +++ b/fdbclient/include/fdbclient/IClientApi.h @@ -22,6 +22,7 @@ #define FDBCLIENT_ICLIENTAPI_H #pragma once +#include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/FDBOptions.g.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/Tenant.h" @@ -78,13 +79,30 @@ public: virtual ThreadFuture>> getRangeSplitPoints(const KeyRangeRef& range, int64_t chunkSize) = 0; - virtual ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange) = 0; + virtual ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rowLimit) = 0; virtual ThreadResult readBlobGranules(const KeyRangeRef& keyRange, Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) = 0; + virtual ThreadFuture>> readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) = 0; + + virtual ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) = 0; + + virtual ThreadFuture>> + summarizeBlobGranules(const KeyRangeRef& keyRange, Optional summaryVersion, int rangeLimit) = 0; + virtual void atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) = 0; virtual void set(const KeyRef& key, const ValueRef& value) = 0; virtual void clear(const KeyRef& begin, const KeyRef& end) = 0; @@ -172,6 +190,13 @@ public: virtual ThreadFuture purgeBlobGranules(const KeyRangeRef& keyRange, Version 
purgeVersion, bool force) = 0; virtual ThreadFuture waitPurgeGranulesComplete(const KeyRef& purgeKey) = 0; + virtual ThreadFuture blobbifyRange(const KeyRangeRef& keyRange) = 0; + virtual ThreadFuture unblobbifyRange(const KeyRangeRef& keyRange) = 0; + virtual ThreadFuture>> listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) = 0; + + virtual ThreadFuture verifyBlobRange(const KeyRangeRef& keyRange, Optional version) = 0; + // Interface to manage shared state across multiple connections to the same Database virtual ThreadFuture createSharedState() = 0; virtual void setSharedState(DatabaseSharedState* p) = 0; @@ -190,6 +215,7 @@ public: virtual void selectApiVersion(int apiVersion) = 0; virtual const char* getClientVersion() = 0; + virtual void useFutureProtocolVersion() = 0; virtual void setNetworkOption(FDBNetworkOptions::Option option, Optional value = Optional()) = 0; diff --git a/fdbclient/include/fdbclient/IConfigTransaction.h b/fdbclient/include/fdbclient/IConfigTransaction.h index 8f21679e27..8451e86f21 100644 --- a/fdbclient/include/fdbclient/IConfigTransaction.h +++ b/fdbclient/include/fdbclient/IConfigTransaction.h @@ -55,7 +55,7 @@ public: Future>> getRangeSplitPoints(KeyRange const& range, int64_t chunkSize) override { throw client_invalid_operation(); } - Future>> getBlobGranuleRanges(KeyRange const& range) override { + Future>> getBlobGranuleRanges(KeyRange const& range, int rowLimit) override { throw client_invalid_operation(); } Future>> readBlobGranules(KeyRange const& range, @@ -64,7 +64,13 @@ public: Version* readVersionOut) override { throw client_invalid_operation(); } + Future>> summarizeBlobGranules(KeyRange const& range, + Optional readVersion, + int rangeLimit) override { + throw client_invalid_operation(); + } Future getEstimatedRangeSizeBytes(KeyRange const& keys) override { throw client_invalid_operation(); } + void addGranuleMaterializeStats(const GranuleMaterializeStats& stats) override { throw client_invalid_operation(); } void addReadConflictRange(KeyRangeRef const& keys) override { throw client_invalid_operation(); } void makeSelfConflicting() override { throw client_invalid_operation(); } void atomicOp(KeyRef const& key, ValueRef const& operand, uint32_t operationType) override { diff --git a/fdbclient/include/fdbclient/ISingleThreadTransaction.h b/fdbclient/include/fdbclient/ISingleThreadTransaction.h index b44f58b464..71a0897693 100644 --- a/fdbclient/include/fdbclient/ISingleThreadTransaction.h +++ b/fdbclient/include/fdbclient/ISingleThreadTransaction.h @@ -80,11 +80,15 @@ public: virtual Future>> getAddressesForKey(Key const& key) = 0; virtual Future>> getRangeSplitPoints(KeyRange const& range, int64_t chunkSize) = 0; virtual Future getEstimatedRangeSizeBytes(KeyRange const& keys) = 0; - virtual Future>> getBlobGranuleRanges(KeyRange const& range) = 0; + virtual Future>> getBlobGranuleRanges(KeyRange const& range, int rangeLimit) = 0; virtual Future>> readBlobGranules(KeyRange const& range, Version begin, Optional readVersion, Version* readVersionOut = nullptr) = 0; + virtual Future>> summarizeBlobGranules(KeyRange const& range, + Optional summaryVersion, + int rangeLimit) = 0; + virtual void addGranuleMaterializeStats(const GranuleMaterializeStats& stats) = 0; virtual void addReadConflictRange(KeyRangeRef const& keys) = 0; virtual void makeSelfConflicting() = 0; virtual void atomicOp(KeyRef const& key, ValueRef const& operand, uint32_t operationType) = 0; diff --git a/fdbclient/include/fdbclient/KeyBackedTypes.h 
b/fdbclient/include/fdbclient/KeyBackedTypes.h index 98171671ee..a3fee57644 100644 --- a/fdbclient/include/fdbclient/KeyBackedTypes.h +++ b/fdbclient/include/fdbclient/KeyBackedTypes.h @@ -23,9 +23,10 @@ #include #include +#include "fdbclient/ClientBooleanParams.h" +#include "fdbclient/CommitTransaction.h" +#include "fdbclient/FDBOptions.g.h" #include "fdbclient/GenericTransactionHelper.h" -#include "fdbclient/IClientApi.h" -#include "fdbclient/DatabaseContext.h" #include "fdbclient/Subspace.h" #include "flow/ObjectSerializer.h" #include "flow/genericactors.actor.h" @@ -59,7 +60,7 @@ inline Tuple TupleCodec::unpack(Standalone const& val) { template <> inline Standalone TupleCodec::pack(int64_t const& val) { - return Tuple().append(val).pack(); + return Tuple::makeTuple(val).pack(); } template <> inline int64_t TupleCodec::unpack(Standalone const& val) { @@ -68,7 +69,7 @@ inline int64_t TupleCodec::unpack(Standalone const& val) { template <> inline Standalone TupleCodec::pack(bool const& val) { - return Tuple().append(val ? 1 : 0).pack(); + return Tuple::makeTuple(val ? 1 : 0).pack(); } template <> inline bool TupleCodec::unpack(Standalone const& val) { @@ -77,7 +78,7 @@ inline bool TupleCodec::unpack(Standalone const& val) { template <> inline Standalone TupleCodec>::pack(Standalone const& val) { - return Tuple().append(val).pack(); + return Tuple::makeTuple(val).pack(); } template <> inline Standalone TupleCodec>::unpack(Standalone const& val) { @@ -96,7 +97,7 @@ inline UID TupleCodec::unpack(Standalone const& val) { // This is backward compatible with TupleCodec> template <> inline Standalone TupleCodec::pack(std::string const& val) { - return Tuple().append(StringRef(val)).pack(); + return Tuple::makeTuple(val).pack(); } template <> inline std::string TupleCodec::unpack(Standalone const& val) { @@ -143,7 +144,7 @@ struct TupleCodec> { template <> inline Standalone TupleCodec::pack(KeyRange const& val) { - return Tuple().append(val.begin).append(val.end).pack(); + return Tuple::makeTuple(val.begin, val.end).pack(); } template <> inline KeyRange TupleCodec::unpack(Standalone const& val) { @@ -151,6 +152,23 @@ inline KeyRange TupleCodec::unpack(Standalone const& val) { return KeyRangeRef(t.getString(0), t.getString(1)); } +struct NullCodec { + static Standalone pack(Standalone val) { return val; } + static Standalone unpack(Standalone val) { return val; } +}; + +template +struct BinaryCodec { + static Standalone pack(T val) { return BinaryWriter::toValue(val, Unversioned()); } + static T unpack(Standalone val) { return BinaryReader::fromStringRef(val, Unversioned()); } +}; + +template +struct KeyBackedRangeResult { + std::vector results; + bool more; +}; + // Convenient read/write access to a single value of type T stored at key // Even though 'this' is not actually mutated, methods that change the db key are not const. 
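// A minimal usage sketch for KeyBackedProperty, assuming a native transaction `tr`
// inside an actor; the key below is a placeholder chosen only for illustration:
//
//     KeyBackedProperty<int64_t> exampleCounter("\xff/example/counter"_sr);
//     exampleCounter.set(tr, 42);
//     Optional<int64_t> current = wait(exampleCounter.get(tr));
//     exampleCounter.clear(tr);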
template > @@ -230,7 +248,7 @@ public: template typename std::enable_if, void>::type set(Transaction tr, T const& val) { - return tr->set(key, Codec::pack(val)); + tr->set(key, Codec::pack(val)); } template @@ -245,7 +263,7 @@ public: template typename std::enable_if, void>::type clear(Transaction tr) { - return tr->clear(key); + tr->clear(key); } Key key; @@ -279,17 +297,17 @@ public: template void set(Transaction tr, T const& val) { - return tr->set(key, BinaryWriter::toValue(val, Unversioned())); + tr->set(key, BinaryWriter::toValue(val, Unversioned())); } template void atomicOp(Transaction tr, T const& val, MutationRef::Type type) { - return tr->atomicOp(key, BinaryWriter::toValue(val, Unversioned()), type); + tr->atomicOp(key, BinaryWriter::toValue(val, Unversioned()), type); } template void clear(Transaction tr) { - return tr->clear(key); + tr->clear(key); } Key key; @@ -308,16 +326,16 @@ public: typedef _KeyType KeyType; typedef _ValueType ValueType; typedef std::pair PairType; - typedef std::vector PairsType; + typedef KeyBackedRangeResult RangeResultType; // If end is not present one key past the end of the map is used. template - Future getRange(Transaction tr, - Optional const& begin, - Optional const& end, - int limit, - Snapshot snapshot = Snapshot::False, - Reverse reverse = Reverse::False) const { + Future getRange(Transaction tr, + Optional const& begin, + Optional const& end, + int limit, + Snapshot snapshot = Snapshot::False, + Reverse reverse = Reverse::False) const { Key prefix = subspace.begin; // 'this' could be invalid inside lambda Key beginKey = begin.present() ? prefix.withSuffix(KeyCodec::pack(begin.get())) : subspace.begin; @@ -326,16 +344,18 @@ public: typename transaction_future_type::type getRangeFuture = tr->getRange(KeyRangeRef(beginKey, endKey), GetRangeLimits(limit), snapshot, reverse); - return holdWhile(getRangeFuture, - map(safeThreadFutureToFuture(getRangeFuture), [prefix](RangeResult const& kvs) -> PairsType { - PairsType results; - for (int i = 0; i < kvs.size(); ++i) { - KeyType key = KeyCodec::unpack(kvs[i].key.removePrefix(prefix)); - ValueType val = ValueCodec::unpack(kvs[i].value); - results.push_back(PairType(key, val)); - } - return results; - })); + return holdWhile( + getRangeFuture, + map(safeThreadFutureToFuture(getRangeFuture), [prefix](RangeResult const& kvs) -> RangeResultType { + RangeResultType rangeResult; + for (int i = 0; i < kvs.size(); ++i) { + KeyType key = KeyCodec::unpack(kvs[i].key.removePrefix(prefix)); + ValueType val = ValueCodec::unpack(kvs[i].value); + rangeResult.results.push_back(PairType(key, val)); + } + rangeResult.more = kvs.more; + return rangeResult; + })); } template @@ -351,6 +371,16 @@ public: })); } + // Get key's value or defaultValue if it doesn't exist + template + Future getD(Transaction tr, + KeyType const& key, + Snapshot snapshot = Snapshot::False, + ValueType defaultValue = ValueType()) const { + return map(get(tr, key, snapshot), + [=](Optional val) -> ValueType { return val.orDefault(defaultValue); }); + } + // Returns a Property that can be get/set that represents key's entry in this this. 
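// The getRange() methods in this file now return KeyBackedRangeResult rather than a
// bare std::vector, so callers can tell whether the read was truncated. A paging
// sketch, assuming a KeyBackedMap<TenantName, int64_t> named exampleCounts (a
// placeholder) and a transaction `tr` inside an actor:
//
//     state KeyBackedRangeResult<std::pair<TenantName, int64_t>> page =
//         wait(exampleCounts.getRange(tr, {}, {}, 100));
//     for (auto const& kv : page.results) {
//         // consume kv.first / kv.second
//     }
//     if (page.more) {
//         // issue another getRange() starting just after page.results.back().first
//     }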
KeyBackedProperty getProperty(KeyType const& key) const { return subspace.begin.withSuffix(KeyCodec::pack(key)); @@ -365,20 +395,27 @@ public: return k.expectedSize() + v.expectedSize(); } + template + void atomicOp(Transaction tr, KeyType const& key, ValueType const& val, MutationRef::Type type) { + Key k = subspace.begin.withSuffix(KeyCodec::pack(key)); + Value v = ValueCodec::pack(val); + tr->atomicOp(k, v, type); + } + template void erase(Transaction tr, KeyType const& key) { - return tr->clear(subspace.begin.withSuffix(KeyCodec::pack(key))); + tr->clear(subspace.begin.withSuffix(KeyCodec::pack(key))); } template void erase(Transaction tr, KeyType const& begin, KeyType const& end) { - return tr->clear(KeyRangeRef(subspace.begin.withSuffix(KeyCodec::pack(begin)), - subspace.begin.withSuffix(KeyCodec::pack(end)))); + tr->clear(KeyRangeRef(subspace.begin.withSuffix(KeyCodec::pack(begin)), + subspace.begin.withSuffix(KeyCodec::pack(end)))); } template void clear(Transaction tr) { - return tr->clear(subspace); + tr->clear(subspace); } KeyRange subspace; @@ -463,7 +500,7 @@ public: template typename std::enable_if, void>::type set(Transaction tr, T const& val) { - return tr->set(key, ObjectWriter::toValue(val, versionOptions)); + tr->set(key, ObjectWriter::toValue(val, versionOptions)); } template @@ -478,7 +515,7 @@ public: template typename std::enable_if, void>::type clear(Transaction tr) { - return tr->clear(key); + tr->clear(key); } Key key; @@ -497,15 +534,15 @@ public: typedef _KeyType KeyType; typedef _ValueType ValueType; typedef std::pair PairType; - typedef std::vector PairsType; + typedef KeyBackedRangeResult RangeResultType; template - Future getRange(Transaction tr, - Optional const& begin, - Optional const& end, - int limit, - Snapshot snapshot = Snapshot::False, - Reverse reverse = Reverse::False) const { + Future getRange(Transaction tr, + Optional const& begin, + Optional const& end, + int limit, + Snapshot snapshot = Snapshot::False, + Reverse reverse = Reverse::False) const { Key beginKey = begin.present() ? subspace.begin.withSuffix(KeyCodec::pack(begin.get())) : subspace.begin; Key endKey = end.present() ? 
subspace.begin.withSuffix(KeyCodec::pack(end.get())) : subspace.end; @@ -514,14 +551,15 @@ public: return holdWhile( getRangeFuture, - map(safeThreadFutureToFuture(getRangeFuture), [self = *this](RangeResult const& kvs) -> PairsType { - PairsType results; + map(safeThreadFutureToFuture(getRangeFuture), [self = *this](RangeResult const& kvs) -> RangeResultType { + RangeResultType rangeResult; for (int i = 0; i < kvs.size(); ++i) { KeyType key = KeyCodec::unpack(kvs[i].key.removePrefix(self.subspace.begin)); ValueType val = ObjectReader::fromStringRef(kvs[i].value, self.versionOptions); - results.push_back(PairType(key, val)); + rangeResult.results.push_back(PairType(key, val)); } - return results; + rangeResult.more = kvs.more; + return rangeResult; })); } @@ -560,18 +598,18 @@ public: template void erase(Transaction tr, KeyType const& key) { - return tr->clear(subspace.begin.withSuffix(KeyCodec::pack(key))); + tr->clear(subspace.begin.withSuffix(KeyCodec::pack(key))); } template void erase(Transaction tr, KeyType const& begin, KeyType const& end) { - return tr->clear(KeyRangeRef(subspace.begin.withSuffix(KeyCodec::pack(begin)), - subspace.begin.withSuffix(KeyCodec::pack(end)))); + tr->clear(KeyRangeRef(subspace.begin.withSuffix(KeyCodec::pack(begin)), + subspace.begin.withSuffix(KeyCodec::pack(end)))); } template void clear(Transaction tr) { - return tr->clear(subspace); + tr->clear(subspace); } KeyRange subspace; @@ -584,15 +622,15 @@ public: KeyBackedSet(KeyRef key) : subspace(prefixRange(key)) {} typedef _ValueType ValueType; - typedef std::vector Values; + typedef KeyBackedRangeResult RangeResultType; template - Future getRange(Transaction tr, - Optional const& begin, - Optional const& end, - int limit, - Snapshot snapshot = Snapshot::False, - Reverse reverse = Reverse::False) const { + Future getRange(Transaction tr, + Optional const& begin, + Optional const& end, + int limit, + Snapshot snapshot = Snapshot::False, + Reverse reverse = Reverse::False) const { Key prefix = subspace.begin; // 'this' could be invalid inside lambda Key beginKey = begin.present() ? prefix.withSuffix(Codec::pack(begin.get())) : subspace.begin; Key endKey = end.present() ? 
prefix.withSuffix(Codec::pack(end.get())) : subspace.end; @@ -600,14 +638,16 @@ public: typename transaction_future_type::type getRangeFuture = tr->getRange(KeyRangeRef(beginKey, endKey), GetRangeLimits(limit), snapshot, reverse); - return holdWhile(getRangeFuture, - map(safeThreadFutureToFuture(getRangeFuture), [prefix](RangeResult const& kvs) -> Values { - Values results; - for (int i = 0; i < kvs.size(); ++i) { - results.push_back(Codec::unpack(kvs[i].key.removePrefix(prefix))); - } - return results; - })); + return holdWhile( + getRangeFuture, + map(safeThreadFutureToFuture(getRangeFuture), [prefix](RangeResult const& kvs) -> RangeResultType { + RangeResultType rangeResult; + for (int i = 0; i < kvs.size(); ++i) { + rangeResult.results.push_back(Codec::unpack(kvs[i].key.removePrefix(prefix))); + } + rangeResult.more = kvs.more; + return rangeResult; + })); } template @@ -630,18 +670,18 @@ public: template void erase(Transaction tr, ValueType const& val) { - return tr->clear(subspace.begin.withSuffix(Codec::pack(val))); + tr->clear(subspace.begin.withSuffix(Codec::pack(val))); } template void erase(Transaction tr, ValueType const& begin, ValueType const& end) { - return tr->clear( + tr->clear( KeyRangeRef(subspace.begin.withSuffix(Codec::pack(begin)), subspace.begin.withSuffix(Codec::pack(end)))); } template void clear(Transaction tr) { - return tr->clear(subspace); + tr->clear(subspace); } KeyRange subspace; diff --git a/fdbclient/include/fdbclient/KeyRangeMap.h b/fdbclient/include/fdbclient/KeyRangeMap.h index 88cce027a8..f88dc72dda 100644 --- a/fdbclient/include/fdbclient/KeyRangeMap.h +++ b/fdbclient/include/fdbclient/KeyRangeMap.h @@ -136,6 +136,16 @@ Future krmGetRanges(Reference const& tr, KeyRange const& keys, int const& limit = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT, int const& limitBytes = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES); +Future krmGetRangesUnaligned(Transaction* const& tr, + Key const& mapPrefix, + KeyRange const& keys, + int const& limit = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT, + int const& limitBytes = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES); +Future krmGetRangesUnaligned(Reference const& tr, + Key const& mapPrefix, + KeyRange const& keys, + int const& limit = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT, + int const& limitBytes = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES); void krmSetPreviouslyEmptyRange(Transaction* tr, const KeyRef& mapPrefix, const KeyRangeRef& keys, @@ -162,7 +172,7 @@ Future krmSetRangeCoalescing(Reference const& t KeyRange const& range, KeyRange const& maxRange, Value const& value); -RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv); +RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv, bool align = true); template std::vector> KeyRangeMap::getAffectedRangesAfterInsertion( diff --git a/fdbclient/include/fdbclient/ManagementAPI.actor.h b/fdbclient/include/fdbclient/ManagementAPI.actor.h index 63f56242f7..c0725324c8 100644 --- a/fdbclient/include/fdbclient/ManagementAPI.actor.h +++ b/fdbclient/include/fdbclient/ManagementAPI.actor.h @@ -57,7 +57,8 @@ struct IQuorumChange : ReferenceCounted { // Change to use the given set of coordination servers ACTOR Future> changeQuorumChecker(Transaction* tr, ClusterConnectionString* conn, - std::string newName); + std::string newName, + bool disableConfigDB); ACTOR Future changeQuorum(Database cx, Reference change); Reference autoQuorumChange(int desired = -1); Reference nameQuorumChange(std::string const& name, Reference const& other); @@ -159,5 +160,9 @@ bool 
schemaMatch(json_spirit::mValue const& schema, // storage nodes ACTOR Future mgmtSnapCreate(Database cx, Standalone snapCmd, UID snapUID); +// Set and get the storage quota per tenant +void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota); +ACTOR Future> getStorageQuota(Transaction* tr, StringRef tenantName); + #include "flow/unactorcompiler.h" #endif diff --git a/fdbclient/include/fdbclient/Metacluster.h b/fdbclient/include/fdbclient/Metacluster.h new file mode 100644 index 0000000000..7f07286ae4 --- /dev/null +++ b/fdbclient/include/fdbclient/Metacluster.h @@ -0,0 +1,183 @@ +/* + * Metacluster.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FDBCLIENT_METACLUSTER_H +#define FDBCLIENT_METACLUSTER_H +#include "fdbclient/CoordinationInterface.h" +#include "json_spirit/json_spirit_value.h" +#pragma once + +#include "fdbclient/FDBTypes.h" +#include "fdbclient/KeyBackedTypes.h" +#include "flow/flat_buffers.h" + +struct ClusterUsage { + int numTenantGroups = 0; + + ClusterUsage() = default; + ClusterUsage(int numTenantGroups) : numTenantGroups(numTenantGroups) {} + + json_spirit::mObject toJson() const; + + bool operator==(const ClusterUsage& other) const noexcept { return numTenantGroups == other.numTenantGroups; } + bool operator!=(const ClusterUsage& other) const noexcept { return !(*this == other); } + bool operator<(const ClusterUsage& other) const noexcept { return numTenantGroups < other.numTenantGroups; } + + template + void serialize(Ar& ar) { + serializer(ar, numTenantGroups); + } +}; + +template <> +struct Traceable : std::true_type { + static std::string toString(const ClusterUsage& value) { + return format("NumTenantGroups: %d", value.numTenantGroups); + } +}; + +// Represents the various states that a data cluster could be in. +// +// READY - the data cluster is active +// REMOVING - the data cluster is being removed and cannot have its configuration changed or any tenants created +// RESTORING - the data cluster is being restored and cannot have its configuration changed or any tenants +// created/updated/deleted. 
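// For example, the removal path (RemoveClusterImpl in MetaclusterManagement.actor.h)
// takes a cluster out of READY by writing an updated entry; a sketch of that transition:
//
//     DataClusterEntry updatedEntry = metadata.entry;        // metadata: current DataClusterMetadata
//     updatedEntry.clusterState = DataClusterState::REMOVING;
//     updatedEntry.capacity.numTenantGroups = 0;              // stop new tenant group assignments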
+enum class DataClusterState { READY, REMOVING, RESTORING }; + +struct DataClusterEntry { + constexpr static FileIdentifier file_identifier = 929511; + + static std::string clusterStateToString(DataClusterState clusterState); + static DataClusterState stringToClusterState(std::string stateStr); + + UID id; + ClusterUsage capacity; + ClusterUsage allocated; + + DataClusterState clusterState = DataClusterState::READY; + + DataClusterEntry() = default; + DataClusterEntry(ClusterUsage capacity) : capacity(capacity) {} + DataClusterEntry(UID id, ClusterUsage capacity, ClusterUsage allocated) + : id(id), capacity(capacity), allocated(allocated) {} + + // Returns true if all configurable properties match + bool matchesConfiguration(DataClusterEntry const& other) const { + return id == other.id && capacity == other.capacity; + } + + bool hasCapacity() const { return allocated < capacity; } + + Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion()); } + static DataClusterEntry decode(ValueRef const& value) { + return ObjectReader::fromStringRef(value, IncludeVersion()); + } + + json_spirit::mObject toJson() const; + + template + void serialize(Ar& ar) { + serializer(ar, id, capacity, allocated, clusterState); + } +}; + +struct MetaclusterRegistrationEntry { + constexpr static FileIdentifier file_identifier = 13448589; + + ClusterType clusterType; + + ClusterName metaclusterName; + ClusterName name; + UID metaclusterId; + UID id; + + MetaclusterRegistrationEntry() = default; + MetaclusterRegistrationEntry(ClusterName metaclusterName, UID metaclusterId) + : clusterType(ClusterType::METACLUSTER_MANAGEMENT), metaclusterName(metaclusterName), name(metaclusterName), + metaclusterId(metaclusterId), id(metaclusterId) {} + MetaclusterRegistrationEntry(ClusterName metaclusterName, ClusterName name, UID metaclusterId, UID id) + : clusterType(ClusterType::METACLUSTER_DATA), metaclusterName(metaclusterName), name(name), + metaclusterId(metaclusterId), id(id) { + ASSERT(metaclusterName != name && metaclusterId != id); + } + + // Returns true if this entry is associated with the same cluster as the passed in entry. If one entry is from the + // management cluster and the other is from a data cluster, this checks whether they are part of the same + // metacluster. 
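// A sketch of how the two registration flavors relate, using placeholder names and
// randomly generated IDs:
//
//     MetaclusterRegistrationEntry management("meta1"_sr, deterministicRandom()->randomUniqueID());
//     MetaclusterRegistrationEntry data =
//         management.toDataClusterRegistration("dc1"_sr, deterministicRandom()->randomUniqueID());
//     ASSERT(management.matches(data) && data.matches(management));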
+ bool matches(MetaclusterRegistrationEntry const& other) const { + if (metaclusterName != other.metaclusterName || metaclusterId != other.metaclusterId) { + return false; + } else if (clusterType == ClusterType::METACLUSTER_DATA && other.clusterType == ClusterType::METACLUSTER_DATA && + (name != other.name || id != other.id)) { + return false; + } + + return true; + } + + MetaclusterRegistrationEntry toManagementClusterRegistration() const { + ASSERT(clusterType == ClusterType::METACLUSTER_DATA); + return MetaclusterRegistrationEntry(metaclusterName, metaclusterId); + } + + MetaclusterRegistrationEntry toDataClusterRegistration(ClusterName name, UID id) const { + ASSERT(clusterType == ClusterType::METACLUSTER_MANAGEMENT); + return MetaclusterRegistrationEntry(metaclusterName, name, metaclusterId, id); + } + + Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion()); } + static MetaclusterRegistrationEntry decode(ValueRef const& value) { + return ObjectReader::fromStringRef(value, IncludeVersion()); + } + static Optional decode(Optional value) { + return value.map( + [](ValueRef const& v) { return MetaclusterRegistrationEntry::decode(v); }); + } + + std::string toString() const { + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + return fmt::format( + "metacluster name: {}, metacluster id: {}", printable(metaclusterName), metaclusterId.shortString()); + } else { + return fmt::format("metacluster name: {}, metacluster id: {}, data cluster name: {}, data cluster id: {}", + printable(metaclusterName), + metaclusterId.shortString(), + printable(name), + id.shortString()); + } + } + + template + void serialize(Ar& ar) { + serializer(ar, clusterType, metaclusterName, name, metaclusterId, id); + } +}; + +template <> +struct Traceable : std::true_type { + static std::string toString(MetaclusterRegistrationEntry const& entry) { return entry.toString(); } +}; + +struct MetaclusterMetadata { + // Registration information for a metacluster, stored on both management and data clusters + static KeyBackedObjectProperty& metaclusterRegistration(); +}; + +#endif diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h new file mode 100644 index 0000000000..b13257f64e --- /dev/null +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -0,0 +1,2020 @@ +/* + * MetaclusterManagement.actor.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include "fdbclient/FDBOptions.g.h" +#include "flow/IRandom.h" +#include "flow/ThreadHelper.actor.h" +#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_G_H) +#define FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_G_H +#include "fdbclient/MetaclusterManagement.actor.g.h" +#elif !defined(FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_H) +#define FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_H + +#include "fdbclient/FDBTypes.h" +#include "fdbclient/GenericTransactionHelper.h" +#include "fdbclient/GenericManagementAPI.actor.h" +#include "fdbclient/KeyBackedTypes.h" +#include "fdbclient/Metacluster.h" +#include "fdbclient/MultiVersionTransaction.h" +#include "fdbclient/SystemData.h" +#include "fdbclient/TenantManagement.actor.h" +#include "fdbclient/VersionedMap.h" +#include "flow/flat_buffers.h" +#include "flow/actorcompiler.h" // has to be last include + +// This file provides the interfaces to manage metacluster metadata. +// +// These transactions can operate on clusters at different versions, so care needs to be taken to update the metadata +// according to the cluster version. +// +// Support is maintained in this file for the current and the previous protocol versions. + +struct DataClusterMetadata { + constexpr static FileIdentifier file_identifier = 5573993; + + DataClusterEntry entry; + ClusterConnectionString connectionString; + + DataClusterMetadata() = default; + DataClusterMetadata(DataClusterEntry const& entry, ClusterConnectionString const& connectionString) + : entry(entry), connectionString(connectionString) {} + + bool matchesConfiguration(DataClusterMetadata const& other) const { + return entry.matchesConfiguration(other.entry) && connectionString == other.connectionString; + } + + Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion()); } + static DataClusterMetadata decode(ValueRef const& value) { + return ObjectReader::fromStringRef(value, IncludeVersion()); + } + + json_spirit::mValue toJson() const { + json_spirit::mObject obj = entry.toJson(); + obj["connection_string"] = connectionString.toString(); + return obj; + } + + template + void serialize(Ar& ar) { + serializer(ar, connectionString, entry); + } +}; + +FDB_DECLARE_BOOLEAN_PARAM(AddNewTenants); +FDB_DECLARE_BOOLEAN_PARAM(RemoveMissingTenants); + +namespace MetaclusterAPI { + +struct ManagementClusterMetadata { + struct ConnectionStringCodec { + static inline Standalone pack(ClusterConnectionString const& val) { + return StringRef(val.toString()); + } + static inline ClusterConnectionString unpack(Standalone const& val) { + return ClusterConnectionString(val.toString()); + } + }; + + static TenantMetadataSpecification& tenantMetadata(); + + // A map from cluster name to the metadata associated with a cluster + static KeyBackedObjectMap& dataClusters(); + + // A map from cluster name to the connection string for the cluster + static KeyBackedMap, ConnectionStringCodec> + dataClusterConnectionRecords; + + // A set of non-full clusters where the key is the tuple (num tenant groups allocated, cluster name). 
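// Ordering note: because tuple-encoded integers sort numerically, clusters with the
// fewest allocated tenant groups appear first in this index. A sketch of the index key
// for a hypothetical cluster "dc1" with 3 allocated groups:
//
//     Tuple indexKey = Tuple::makeTuple(int64_t(3), "dc1"_sr);
//     ManagementClusterMetadata::clusterCapacityIndex.insert(tr, indexKey);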
+ static KeyBackedSet clusterCapacityIndex; + + // A map from cluster name to a count of tenants + static KeyBackedMap, BinaryCodec> clusterTenantCount; + + // A set of (cluster name, tenant name, tenant ID) tuples ordered by cluster + static KeyBackedSet clusterTenantIndex; + + // A set of (cluster, tenant group name) tuples ordered by cluster + static KeyBackedSet clusterTenantGroupIndex; +}; + +ACTOR Future> openDatabase(ClusterConnectionString connectionString); + +ACTOR template +Future> tryGetClusterTransaction(Transaction tr, ClusterName name) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + state Future metaclusterRegistrationCheck = + TenantAPI::checkTenantMode(tr, ClusterType::METACLUSTER_MANAGEMENT); + + state Future> clusterEntryFuture = + ManagementClusterMetadata::dataClusters().get(tr, name); + state Future> connectionRecordFuture = + ManagementClusterMetadata::dataClusterConnectionRecords.get(tr, name); + + wait(metaclusterRegistrationCheck); + + state Optional clusterEntry = wait(clusterEntryFuture); + Optional connectionString = wait(connectionRecordFuture); + + if (clusterEntry.present()) { + ASSERT(connectionString.present()); + return Optional(DataClusterMetadata(clusterEntry.get(), connectionString.get())); + } else { + return Optional(); + } +} + +ACTOR template +Future> tryGetCluster(Reference db, ClusterName name) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional metadata = wait(tryGetClusterTransaction(tr, name)); + return metadata; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +ACTOR template +Future getClusterTransaction(Transaction tr, ClusterNameRef name) { + Optional metadata = wait(tryGetClusterTransaction(tr, name)); + if (!metadata.present()) { + throw cluster_not_found(); + } + + return metadata.get(); +} + +ACTOR template +Future getCluster(Reference db, ClusterName name) { + Optional metadata = wait(tryGetCluster(db, name)); + if (!metadata.present()) { + throw cluster_not_found(); + } + + return metadata.get(); +} + +ACTOR template +Future> getAndOpenDatabase(Transaction managementTr, ClusterName clusterName) { + DataClusterMetadata clusterMetadata = wait(getClusterTransaction(managementTr, clusterName)); + Reference db = wait(openDatabase(clusterMetadata.connectionString)); + return db; +} + +template +struct MetaclusterOperationContext { + Reference managementDb; + Reference dataClusterDb; + + Optional clusterName; + + Optional metaclusterRegistration; + Optional dataClusterMetadata; + + MetaclusterOperationContext(Reference managementDb, Optional clusterName = {}) + : managementDb(managementDb), clusterName(clusterName) {} + + // Run a transaction on the management cluster. This verifies that the cluster is a management cluster and matches + // the same metacluster that we've run any previous transactions on. If a clusterName is set, it also verifies that + // the specified cluster is present. Stores the metaclusterRegistration entry and, if a clusterName is set, the + // dataClusterMetadata and dataClusterDb in the context. 
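// A usage sketch of this pattern, assuming surrounding code templated on DB (as in the
// Impl structs below) and a hypothetical actor `doManagementWork` that takes the
// transaction and returns a Future:
//
//     state MetaclusterOperationContext<DB> ctx(managementDb, clusterName);
//     wait(ctx.runManagementTransaction(
//         [](Reference<typename DB::TransactionT> tr) { return doManagementWork(tr); }));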
+ ACTOR template + static Future()(Reference()).getValue())> + runManagementTransaction(MetaclusterOperationContext* self, Function func) { + state Reference tr = self->managementDb->createTransaction(); + state bool clusterPresentAtStart = self->clusterName.present(); + loop { + try { + // If this transaction is retrying and didn't have the cluster name set at the beginning, clear it out + // to be set again in the next iteration. + if (!clusterPresentAtStart) { + self->clearCluster(); + } + + // Get the data cluster metadata for the specified cluster, if present + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + state Future> dataClusterMetadataFuture; + if (self->clusterName.present()) { + dataClusterMetadataFuture = tryGetClusterTransaction(tr, self->clusterName.get()); + } + + // Get the metacluster registration information + state Optional currentMetaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + + state Optional currentDataClusterMetadata; + if (self->clusterName.present()) { + wait(store(currentDataClusterMetadata, dataClusterMetadataFuture)); + } + + // Check that this is a management cluster and is the same metacluster that any previous transactions + // have run on. + if (!currentMetaclusterRegistration.present() || + currentMetaclusterRegistration.get().clusterType != ClusterType::METACLUSTER_MANAGEMENT) { + throw invalid_metacluster_operation(); + } else if (self->metaclusterRegistration.present() && + !self->metaclusterRegistration.get().matches(currentMetaclusterRegistration.get())) { + throw invalid_metacluster_operation(); + } + + // If a cluster was specified, check that the cluster metadata is present. If so, load it and store it + // in the context. Additionally, store the data cluster details in the local metacluster registration + // entry. + if (self->clusterName.present()) { + if (!currentDataClusterMetadata.present()) { + throw cluster_not_found(); + } else { + currentMetaclusterRegistration = currentMetaclusterRegistration.get().toDataClusterRegistration( + self->clusterName.get(), currentDataClusterMetadata.get().entry.id); + } + } + + // Store the metacluster registration entry + if (!self->metaclusterRegistration.present()) { + self->metaclusterRegistration = currentMetaclusterRegistration; + } + + // Check that our data cluster has the same ID as previous transactions. If so, then store the updated + // cluster metadata in the context and open a connection to the data DB. + if (self->dataClusterMetadata.present() && + self->dataClusterMetadata.get().entry.id != currentDataClusterMetadata.get().entry.id) { + throw cluster_not_found(); + } else if (self->clusterName.present()) { + self->dataClusterMetadata = currentDataClusterMetadata; + if (!self->dataClusterDb) { + wait( + store(self->dataClusterDb, openDatabase(self->dataClusterMetadata.get().connectionString))); + } + } + + state decltype(std::declval()(Reference()).getValue()) result = + wait(func(tr)); + + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + return result; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + template + Future()(Reference()).getValue())> + runManagementTransaction(Function func) { + return runManagementTransaction(this, func); + } + + // Runs a transaction on the data cluster. This requires that a cluster name be set and that a transaction has + // already been run on the management cluster to populate the needed metadata. 
This verifies that the data cluster + // has the expected ID and is part of the metacluster that previous transactions have run on. + ACTOR template + static Future()(Reference()).getValue())> + runDataClusterTransaction(MetaclusterOperationContext* self, Function func) { + ASSERT(self->dataClusterDb); + ASSERT(self->dataClusterMetadata.present()); + ASSERT(self->metaclusterRegistration.present() && + self->metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA); + + state Reference tr = self->dataClusterDb->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state Optional currentMetaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + + // Check that this is the expected data cluster and is part of the right metacluster + if (!currentMetaclusterRegistration.present() || + currentMetaclusterRegistration.get().clusterType != ClusterType::METACLUSTER_DATA) { + throw invalid_metacluster_operation(); + } else if (!self->metaclusterRegistration.get().matches(currentMetaclusterRegistration.get())) { + throw invalid_metacluster_operation(); + } + + state decltype(std::declval()(Reference()).getValue()) result = + wait(func(tr)); + + wait(safeThreadFutureToFuture(tr->commit())); + return result; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + template + Future()(Reference()).getValue())> + runDataClusterTransaction(Function func) { + return runDataClusterTransaction(this, func); + } + + ACTOR static Future updateClusterName(MetaclusterOperationContext* self, + Reference tr) { + state DataClusterMetadata currentDataClusterMetadata = wait(getClusterTransaction(tr, self->clusterName.get())); + + self->metaclusterRegistration = self->metaclusterRegistration.get().toDataClusterRegistration( + self->clusterName.get(), currentDataClusterMetadata.entry.id); + + self->dataClusterMetadata = currentDataClusterMetadata; + if (!self->dataClusterDb) { + wait(store(self->dataClusterDb, openDatabase(self->dataClusterMetadata.get().connectionString))); + } + + return Void(); + } + + // Sets the cluster used in this context. This must be called from a management cluster transaction, and it + // will load the cluster metadata and connect to the cluster. + Future setCluster(Reference tr, ClusterName clusterName) { + ASSERT(!this->clusterName.present()); + ASSERT(!dataClusterMetadata.present()); + ASSERT(metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_MANAGEMENT); + this->clusterName = clusterName; + return updateClusterName(this, tr); + } + + // Clears the chosen cluster for this context. This is useful if we are retrying a transaction that expects an + // uninitialized cluster. 
+ void clearCluster() { + clusterName = {}; + dataClusterMetadata = {}; + dataClusterDb = {}; + if (metaclusterRegistration.present() && + metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA) { + metaclusterRegistration = metaclusterRegistration.get().toManagementClusterRegistration(); + } + } +}; + +template +Future> tryGetTenantTransaction(Transaction tr, TenantName name) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + return ManagementClusterMetadata::tenantMetadata().tenantMap.get(tr, name); +} + +ACTOR template +Future> tryGetTenant(Reference db, TenantName name) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + Optional entry = wait(tryGetTenantTransaction(tr, name)); + return entry; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +ACTOR template +Future getTenantTransaction(Transaction tr, TenantName name) { + Optional entry = wait(tryGetTenantTransaction(tr, name)); + if (!entry.present()) { + throw tenant_not_found(); + } + + return entry.get(); +} + +ACTOR template +Future getTenant(Reference db, TenantName name) { + Optional entry = wait(tryGetTenant(db, name)); + if (!entry.present()) { + throw tenant_not_found(); + } + + return entry.get(); +} + +ACTOR template +Future managementClusterCheckEmpty(Transaction tr) { + state Future>> tenantsFuture = + TenantMetadata::tenantMap().getRange(tr, {}, {}, 1); + state typename transaction_future_type::type dbContentsFuture = + tr->getRange(normalKeys, 1); + + KeyBackedRangeResult> tenants = wait(tenantsFuture); + if (!tenants.results.empty()) { + throw cluster_not_empty(); + } + + RangeResult dbContents = wait(safeThreadFutureToFuture(dbContentsFuture)); + if (!dbContents.empty()) { + throw cluster_not_empty(); + } + + return Void(); +} + +ACTOR template +Future> createMetacluster(Reference db, ClusterName name) { + state Reference tr = db->createTransaction(); + state Optional metaclusterUid; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state Future> metaclusterRegistrationFuture = + MetaclusterMetadata::metaclusterRegistration().get(tr); + + wait(managementClusterCheckEmpty(tr)); + + Optional existingRegistration = wait(metaclusterRegistrationFuture); + if (existingRegistration.present()) { + if (metaclusterUid.present() && metaclusterUid.get() == existingRegistration.get().metaclusterId) { + return Optional(); + } else { + return format("cluster is already registered as a %s named `%s'", + existingRegistration.get().clusterType == ClusterType::METACLUSTER_DATA + ? 
"data cluster" + : "metacluster", + printable(existingRegistration.get().name).c_str()); + } + } + + if (!metaclusterUid.present()) { + metaclusterUid = deterministicRandom()->randomUniqueID(); + } + + MetaclusterMetadata::metaclusterRegistration().set( + tr, MetaclusterRegistrationEntry(name, metaclusterUid.get())); + + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + break; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + + return Optional(); +} + +ACTOR template +Future decommissionMetacluster(Reference db) { + state Reference tr = db->createTransaction(); + state bool firstTry = true; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType != ClusterType::METACLUSTER_MANAGEMENT) { + if (firstTry) { + throw invalid_metacluster_operation(); + } else { + return Void(); + } + } + + // Erase all metadata not associated with specific tenants prior to checking + // cluster emptiness + ManagementClusterMetadata::tenantMetadata().tenantCount.clear(tr); + ManagementClusterMetadata::tenantMetadata().lastTenantId.clear(tr); + ManagementClusterMetadata::tenantMetadata().tenantTombstones.clear(tr); + ManagementClusterMetadata::tenantMetadata().tombstoneCleanupData.clear(tr); + + wait(managementClusterCheckEmpty(tr)); + MetaclusterMetadata::metaclusterRegistration().clear(tr); + + firstTry = false; + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + break; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + + return Void(); +} + +template +void updateClusterCapacityIndex(Transaction tr, + ClusterName name, + DataClusterEntry const& previousEntry, + DataClusterEntry const& updatedEntry) { + // Entries are put in the cluster capacity index ordered by how many items are already allocated to them + if (previousEntry.hasCapacity()) { + ManagementClusterMetadata::clusterCapacityIndex.erase( + tr, Tuple::makeTuple(previousEntry.allocated.numTenantGroups, name)); + } + if (updatedEntry.hasCapacity()) { + ManagementClusterMetadata::clusterCapacityIndex.insert( + tr, Tuple::makeTuple(updatedEntry.allocated.numTenantGroups, name)); + } +} + +// This should only be called from a transaction that has already confirmed that the cluster entry +// is present. The updatedEntry should use the existing entry and modify only those fields that need +// to be changed. 
+template +void updateClusterMetadata(Transaction tr, + ClusterNameRef name, + DataClusterMetadata const& previousMetadata, + Optional const& updatedConnectionString, + Optional const& updatedEntry) { + + if (updatedEntry.present()) { + if (previousMetadata.entry.clusterState == DataClusterState::REMOVING) { + throw cluster_removed(); + } + ManagementClusterMetadata::dataClusters().set(tr, name, updatedEntry.get()); + updateClusterCapacityIndex(tr, name, previousMetadata.entry, updatedEntry.get()); + } + if (updatedConnectionString.present()) { + ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, name, updatedConnectionString.get()); + } +} + +template +struct RegisterClusterImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + ClusterName clusterName; + ClusterConnectionString connectionString; + DataClusterEntry clusterEntry; + + RegisterClusterImpl(Reference managementDb, + ClusterName clusterName, + ClusterConnectionString connectionString, + DataClusterEntry clusterEntry) + : ctx(managementDb), clusterName(clusterName), connectionString(connectionString), clusterEntry(clusterEntry) {} + + // Check that cluster name is available + ACTOR static Future registrationPrecheck(RegisterClusterImpl* self, Reference tr) { + state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); + if (dataClusterMetadata.present()) { + throw cluster_already_exists(); + } + + return Void(); + } + + ACTOR static Future configureDataCluster(RegisterClusterImpl* self) { + state Reference dataClusterDb = wait(openDatabase(self->connectionString)); + state Reference tr = dataClusterDb->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state Future>> existingTenantsFuture = + TenantAPI::listTenantsTransaction(tr, ""_sr, "\xff\xff"_sr, 1); + state ThreadFuture existingDataFuture = tr->getRange(normalKeys, 1); + + // Check whether this cluster has already been registered + state Optional existingRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + if (existingRegistration.present()) { + if (existingRegistration.get().clusterType != ClusterType::METACLUSTER_DATA || + existingRegistration.get().name != self->clusterName || + !existingRegistration.get().matches(self->ctx.metaclusterRegistration.get())) { + throw cluster_already_registered(); + } else { + // We already successfully registered the cluster with these details, so there's nothing to do + self->clusterEntry.id = existingRegistration.get().id; + return Void(); + } + } + + // Check for any existing data + std::vector> existingTenants = + wait(safeThreadFutureToFuture(existingTenantsFuture)); + if (!existingTenants.empty()) { + TraceEvent(SevWarn, "CannotRegisterClusterWithTenants").detail("ClusterName", self->clusterName); + throw cluster_not_empty(); + } + + RangeResult existingData = wait(safeThreadFutureToFuture(existingDataFuture)); + if (!existingData.empty()) { + TraceEvent(SevWarn, "CannotRegisterClusterWithData").detail("ClusterName", self->clusterName); + throw cluster_not_empty(); + } + + self->clusterEntry.id = deterministicRandom()->randomUniqueID(); + MetaclusterMetadata::metaclusterRegistration().set( + tr, + self->ctx.metaclusterRegistration.get().toDataClusterRegistration(self->clusterName, + self->clusterEntry.id)); + + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + + TraceEvent("ConfiguredDataCluster") + .detail("ClusterName", self->clusterName) + .detail("ClusterID", self->clusterEntry.id) 
+ .detail("Capacity", self->clusterEntry.capacity) + .detail("Version", tr->getCommittedVersion()) + .detail("ConnectionString", self->connectionString.toString()); + + return Void(); + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + // Store the cluster entry for the new cluster + ACTOR static Future registerInManagementCluster(RegisterClusterImpl* self, + Reference tr) { + state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); + if (dataClusterMetadata.present() && !dataClusterMetadata.get().matchesConfiguration( + DataClusterMetadata(self->clusterEntry, self->connectionString))) { + throw cluster_already_exists(); + } else if (!dataClusterMetadata.present()) { + self->clusterEntry.allocated = ClusterUsage(); + + if (self->clusterEntry.hasCapacity()) { + ManagementClusterMetadata::clusterCapacityIndex.insert( + tr, Tuple::makeTuple(self->clusterEntry.allocated.numTenantGroups, self->clusterName)); + } + ManagementClusterMetadata::dataClusters().set(tr, self->clusterName, self->clusterEntry); + ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, self->clusterName, self->connectionString); + } + + TraceEvent("RegisteredDataCluster") + .detail("ClusterName", self->clusterName) + .detail("ClusterID", self->clusterEntry.id) + .detail("Capacity", self->clusterEntry.capacity) + .detail("Version", tr->getCommittedVersion()) + .detail("ConnectionString", self->connectionString.toString()); + + return Void(); + } + + ACTOR static Future run(RegisterClusterImpl* self) { + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return registrationPrecheck(self, tr); })); + // Don't use ctx to run this transaction because we have not set up the data cluster metadata on it and we don't + // have a metacluster registration on the data cluster + wait(configureDataCluster(self)); + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return registerInManagementCluster(self, tr); })); + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future registerCluster(Reference db, + ClusterName name, + ClusterConnectionString connectionString, + DataClusterEntry entry) { + state RegisterClusterImpl impl(db, name, connectionString, entry); + wait(impl.run()); + return Void(); +} + +ACTOR template +Future restoreCluster(Reference db, + ClusterName name, + std::string connectionString, + DataClusterEntry entry, + AddNewTenants addNewTenants, + RemoveMissingTenants removeMissingTenants) { + // TODO: add implementation + wait(delay(0.0)); + return Void(); +} + +template +struct RemoveClusterImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + bool forceRemove; + + // Parameters set in markClusterRemoving + Optional lastTenantId; + + RemoveClusterImpl(Reference managementDb, ClusterName clusterName, bool forceRemove) + : ctx(managementDb, clusterName), forceRemove(forceRemove) {} + + // Returns false if the cluster is no longer present, or true if it is present and the removal should proceed. + ACTOR static Future markClusterRemoving(RemoveClusterImpl* self, Reference tr) { + if (!self->forceRemove && self->ctx.dataClusterMetadata.get().entry.allocated.numTenantGroups > 0) { + throw cluster_not_empty(); + } else if (self->ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::REMOVING) { + // Mark the cluster in a removing state while we finish the remaining removal steps. This prevents new + // tenants from being assigned to it. 
+ DataClusterEntry updatedEntry = self->ctx.dataClusterMetadata.get().entry; + updatedEntry.clusterState = DataClusterState::REMOVING; + updatedEntry.capacity.numTenantGroups = 0; + + updateClusterMetadata(tr, + self->ctx.clusterName.get(), + self->ctx.dataClusterMetadata.get(), + Optional(), + updatedEntry); + } + + ManagementClusterMetadata::clusterCapacityIndex.erase( + tr, + Tuple::makeTuple(self->ctx.dataClusterMetadata.get().entry.allocated.numTenantGroups, + self->ctx.clusterName.get())); + + // Get the last allocated tenant ID to be used on the detached data cluster + if (self->forceRemove) { + Optional lastId = wait(ManagementClusterMetadata::tenantMetadata().lastTenantId.get(tr)); + self->lastTenantId = lastId; + } + + TraceEvent("MarkedDataClusterRemoving") + .detail("Name", self->ctx.clusterName.get()) + .detail("Version", tr->getCommittedVersion()); + + return true; + } + + // Delete metacluster metadata from the data cluster + ACTOR static Future updateDataCluster(RemoveClusterImpl* self, Reference tr) { + // Delete metacluster related metadata + MetaclusterMetadata::metaclusterRegistration().clear(tr); + TenantMetadata::tenantTombstones().clear(tr); + TenantMetadata::tombstoneCleanupData().clear(tr); + + // If we are force removing a cluster, then it will potentially contain tenants that have IDs + // larger than the next tenant ID to be allocated on the cluster. To avoid collisions, we advance + // the ID so that it will be the larger of the current one on the data cluster and the management + // cluster. + if (self->lastTenantId.present()) { + Optional lastId = wait(TenantMetadata::lastTenantId().get(tr)); + if (!lastId.present() || lastId.get() < self->lastTenantId.get()) { + TenantMetadata::lastTenantId().set(tr, self->lastTenantId.get()); + } + } + + TraceEvent("ReconfiguredDataCluster") + .detail("Name", self->ctx.clusterName.get()) + .detail("Version", tr->getCommittedVersion()); + + return Void(); + } + + // Returns true if all tenants have been purged + ACTOR static Future purgeTenants(RemoveClusterImpl* self, + Reference tr, + std::pair clusterTupleRange) { + ASSERT(self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING); + + // Get the list of tenants + state Future> tenantEntriesFuture = + ManagementClusterMetadata::clusterTenantIndex.getRange( + tr, clusterTupleRange.first, clusterTupleRange.second, CLIENT_KNOBS->REMOVE_CLUSTER_TENANT_BATCH_SIZE); + + state KeyBackedRangeResult tenantEntries = wait(tenantEntriesFuture); + + // Erase each tenant from the tenant map on the management cluster + for (Tuple entry : tenantEntries.results) { + ASSERT(entry.getString(0) == self->ctx.clusterName.get()); + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, entry.getString(1)); + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, entry.getInt(2)); + } + + // Erase all of the tenants processed in this transaction from the cluster tenant index + if (!tenantEntries.results.empty()) { + ManagementClusterMetadata::clusterTenantIndex.erase( + tr, + clusterTupleRange.first, + Tuple::makeTuple(self->ctx.clusterName.get(), keyAfter(tenantEntries.results.rbegin()->getString(1)))); + } + + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp( + tr, -tenantEntries.results.size(), MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, self->ctx.clusterName.get(), -tenantEntries.results.size(), MutationRef::AddValue); + + return !tenantEntries.more; + } + + // Returns true if all 
tenant groups and the data cluster have been purged + ACTOR static Future purgeTenantGroupsAndDataCluster(RemoveClusterImpl* self, + Reference tr, + std::pair clusterTupleRange) { + ASSERT(self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING); + + // Get the list of tenant groups + state Future> tenantGroupEntriesFuture = + ManagementClusterMetadata::clusterTenantGroupIndex.getRange( + tr, clusterTupleRange.first, clusterTupleRange.second, CLIENT_KNOBS->REMOVE_CLUSTER_TENANT_BATCH_SIZE); + + // Erase each tenant group from the tenant group map and the tenant group tenant index + state KeyBackedRangeResult tenantGroupEntries = wait(tenantGroupEntriesFuture); + for (Tuple entry : tenantGroupEntries.results) { + ASSERT(entry.getString(0) == self->ctx.clusterName.get()); + TenantGroupName tenantGroup = entry.getString(1); + ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.erase( + tr, Tuple::makeTuple(tenantGroup), Tuple::makeTuple(keyAfter(tenantGroup))); + ManagementClusterMetadata::tenantMetadata().tenantGroupMap.erase(tr, tenantGroup); + } + + if (!tenantGroupEntries.results.empty()) { + // Erase all of the tenant groups processed in this transaction from the cluster tenant group index + ManagementClusterMetadata::clusterTenantGroupIndex.erase( + tr, + clusterTupleRange.first, + Tuple::makeTuple(self->ctx.clusterName.get(), + keyAfter(tenantGroupEntries.results.rbegin()->getString(1)))); + } + + // Erase the data cluster record from the management cluster if processing our last batch + if (!tenantGroupEntries.more) { + ManagementClusterMetadata::dataClusters().erase(tr, self->ctx.clusterName.get()); + ManagementClusterMetadata::dataClusterConnectionRecords.erase(tr, self->ctx.clusterName.get()); + ManagementClusterMetadata::clusterTenantCount.erase(tr, self->ctx.clusterName.get()); + } + + return !tenantGroupEntries.more; + } + + // Remove all metadata associated with the data cluster from the management cluster + ACTOR static Future managementClusterPurgeDataCluster(RemoveClusterImpl* self) { + state std::pair clusterTupleRange = std::make_pair( + Tuple::makeTuple(self->ctx.clusterName.get()), Tuple::makeTuple(keyAfter(self->ctx.clusterName.get()))); + + // First remove all tenants associated with the data cluster from the management cluster + loop { + bool clearedAll = wait(self->ctx.runManagementTransaction( + [self = self, clusterTupleRange = clusterTupleRange](Reference tr) { + return purgeTenants(self, tr, clusterTupleRange); + })); + + if (clearedAll) { + break; + } + } + + // Next remove all tenant groups associated with the data cluster from the management cluster + loop { + bool clearedAll = wait(self->ctx.runManagementTransaction( + [self = self, clusterTupleRange = clusterTupleRange](Reference tr) { + return purgeTenantGroupsAndDataCluster(self, tr, clusterTupleRange); + })); + if (clearedAll) { + break; + } + } + + TraceEvent("RemovedDataCluster").detail("Name", self->ctx.clusterName.get()); + return Void(); + } + + ACTOR static Future run(RemoveClusterImpl* self) { + state bool clusterIsPresent; + try { + wait(store(clusterIsPresent, + self->ctx.runManagementTransaction([self = self](Reference tr) { + return markClusterRemoving(self, tr); + }))); + } catch (Error& e) { + // If the transaction retries after success or if we are trying a second time to remove the cluster, it will + // throw an error indicating that the removal has already started + if (e.code() == error_code_cluster_removed) { + clusterIsPresent = true; + } else 
{ + throw; + } + } + + if (clusterIsPresent) { + try { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return updateDataCluster(self, tr); })); + } catch (Error& e) { + // If this transaction gets retried, the metacluster information may have already been erased. + if (e.code() != error_code_invalid_metacluster_operation) { + throw; + } + } + + // This runs multiple transactions, so the run transaction calls are inside the function + try { + wait(managementClusterPurgeDataCluster(self)); + } catch (Error& e) { + // If this transaction gets retried, the cluster may have already been deleted. + if (e.code() != error_code_cluster_not_found) { + throw; + } + } + } + + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future removeCluster(Reference db, ClusterName name, bool forceRemove) { + state RemoveClusterImpl impl(db, name, forceRemove); + wait(impl.run()); + return Void(); +} + +ACTOR template +Future> listClustersTransaction(Transaction tr, + ClusterNameRef begin, + ClusterNameRef end, + int limit) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + state Future tenantModeCheck = TenantAPI::checkTenantMode(tr, ClusterType::METACLUSTER_MANAGEMENT); + + state Future>> clusterEntriesFuture = + ManagementClusterMetadata::dataClusters().getRange(tr, begin, end, limit); + state Future>> connectionStringFuture = + ManagementClusterMetadata::dataClusterConnectionRecords.getRange(tr, begin, end, limit); + + wait(tenantModeCheck); + + state KeyBackedRangeResult> clusterEntries = + wait(safeThreadFutureToFuture(clusterEntriesFuture)); + KeyBackedRangeResult> connectionStrings = + wait(safeThreadFutureToFuture(connectionStringFuture)); + + ASSERT(clusterEntries.results.size() == connectionStrings.results.size()); + + std::map clusters; + for (int i = 0; i < clusterEntries.results.size(); ++i) { + ASSERT(clusterEntries.results[i].first == connectionStrings.results[i].first); + clusters[clusterEntries.results[i].first] = + DataClusterMetadata(clusterEntries.results[i].second, connectionStrings.results[i].second); + } + + return clusters; +} + +ACTOR template +Future> listClusters(Reference db, + ClusterName begin, + ClusterName end, + int limit) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + std::map clusters = wait(listClustersTransaction(tr, begin, end, limit)); + + return clusters; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +template +void managementClusterAddTenantToGroup(Transaction tr, + TenantName tenantName, + TenantMapEntry tenantEntry, + DataClusterMetadata* clusterMetadata, + bool groupAlreadyExists) { + if (tenantEntry.tenantGroup.present()) { + if (tenantEntry.tenantGroup.get().startsWith("\xff"_sr)) { + throw invalid_tenant_group_name(); + } + + if (!groupAlreadyExists) { + ManagementClusterMetadata::tenantMetadata().tenantGroupMap.set( + tr, tenantEntry.tenantGroup.get(), TenantGroupEntry(tenantEntry.assignedCluster)); + ManagementClusterMetadata::clusterTenantGroupIndex.insert( + tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), tenantEntry.tenantGroup.get())); + } + ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.insert( + tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), tenantName)); + } + + if (!groupAlreadyExists) { + ASSERT(clusterMetadata->entry.hasCapacity()); + + DataClusterEntry updatedEntry = clusterMetadata->entry; + ++updatedEntry.allocated.numTenantGroups; + + 
updateClusterMetadata( + tr, tenantEntry.assignedCluster.get(), *clusterMetadata, Optional(), updatedEntry); + + clusterMetadata->entry = updatedEntry; + } +} + +ACTOR template +Future managementClusterRemoveTenantFromGroup(Transaction tr, + TenantName tenantName, + TenantMapEntry tenantEntry, + DataClusterMetadata* clusterMetadata, + bool isRenamePair = false) { + state bool updateClusterCapacity = !tenantEntry.tenantGroup.present() && !isRenamePair; + if (tenantEntry.tenantGroup.present()) { + ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.erase( + tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), tenantName)); + + state KeyBackedSet::RangeResultType result = + wait(ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.getRange( + tr, + Tuple::makeTuple(tenantEntry.tenantGroup.get()), + Tuple::makeTuple(keyAfter(tenantEntry.tenantGroup.get())), + 1)); + + if (result.results.size() == 0) { + ManagementClusterMetadata::clusterTenantGroupIndex.erase( + tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), tenantEntry.tenantGroup.get())); + + ManagementClusterMetadata::tenantMetadata().tenantGroupMap.erase(tr, tenantEntry.tenantGroup.get()); + updateClusterCapacity = true; + } + } + + // Update the tenant group count information for the assigned cluster if this tenant group was erased so we + // can use the freed capacity. + if (updateClusterCapacity) { + DataClusterEntry updatedEntry = clusterMetadata->entry; + --updatedEntry.allocated.numTenantGroups; + updateClusterMetadata( + tr, tenantEntry.assignedCluster.get(), *clusterMetadata, Optional(), updatedEntry); + + clusterMetadata->entry = updatedEntry; + } + + return Void(); +} + +template +struct CreateTenantImpl { + MetaclusterOperationContext ctx; + bool preferAssignedCluster; + + // Initialization parameters + TenantName tenantName; + TenantMapEntry tenantEntry; + + // Parameter set if tenant creation permanently fails on the data cluster + Optional replaceExistingTenantId; + + CreateTenantImpl(Reference managementDb, + bool preferAssignedCluster, + TenantName tenantName, + TenantMapEntry tenantEntry) + : ctx(managementDb), preferAssignedCluster(preferAssignedCluster), tenantName(tenantName), + tenantEntry(tenantEntry) {} + + ACTOR static Future checkClusterAvailability(Reference dataClusterDb, + ClusterName clusterName) { + state Reference tr = dataClusterDb->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->addWriteConflictRange(KeyRangeRef("\xff/metacluster/availability_check"_sr, + "\xff/metacluster/availability_check\x00"_sr)); + wait(safeThreadFutureToFuture(tr->commit())); + return clusterName; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + // Returns true if the tenant is already assigned and can proceed to the next step and false if it needs + // to be created. Throws an error if the tenant already exists and cannot be created. + ACTOR static Future checkForExistingTenant(CreateTenantImpl* self, Reference tr) { + // Check if the tenant already exists. If it's partially created and matches the parameters we + // specified, continue creating it. Otherwise, fail with an error. 
+ state Optional existingEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + if (existingEntry.present()) { + if (!existingEntry.get().matchesConfiguration(self->tenantEntry) || + existingEntry.get().tenantState != TenantState::REGISTERING) { + // The tenant already exists and is either completely created or has a different + // configuration + throw tenant_already_exists(); + } else if (!self->replaceExistingTenantId.present() || + self->replaceExistingTenantId.get() != existingEntry.get().id) { + // The tenant creation has already started, so resume where we left off + ASSERT(existingEntry.get().assignedCluster.present()); + if (self->preferAssignedCluster && + existingEntry.get().assignedCluster.get() != self->tenantEntry.assignedCluster.get()) { + TraceEvent("MetaclusterCreateTenantClusterMismatch") + .detail("Preferred", self->tenantEntry.assignedCluster.get()) + .detail("Actual", existingEntry.get().assignedCluster.get()); + throw invalid_tenant_configuration(); + } + self->tenantEntry = existingEntry.get(); + wait(self->ctx.setCluster(tr, existingEntry.get().assignedCluster.get())); + return true; + } else { + // The previous creation is permanently failed, so cleanup the tenant and create it again from scratch + // We don't need to remove it from the tenant map because we will overwrite the existing entry later in + // this transaction. + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, existingEntry.get().id); + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, existingEntry.get().assignedCluster.get(), -1, MutationRef::AddValue); + + ManagementClusterMetadata::clusterTenantIndex.erase( + tr, + Tuple::makeTuple( + existingEntry.get().assignedCluster.get(), self->tenantName, existingEntry.get().id)); + + state DataClusterMetadata previousAssignedClusterMetadata = + wait(getClusterTransaction(tr, existingEntry.get().assignedCluster.get())); + + wait(managementClusterRemoveTenantFromGroup( + tr, self->tenantName, existingEntry.get(), &previousAssignedClusterMetadata)); + } + } else if (self->replaceExistingTenantId.present()) { + throw tenant_removed(); + } + + return false; + } + + // Returns a pair with the name of the assigned cluster and whether the group was already assigned + ACTOR static Future> assignTenant(CreateTenantImpl* self, + Reference tr) { + // If our tenant group is already assigned, then we just use that assignment + state Optional groupEntry; + if (self->tenantEntry.tenantGroup.present()) { + Optional _groupEntry = + wait(ManagementClusterMetadata::tenantMetadata().tenantGroupMap.get( + tr, self->tenantEntry.tenantGroup.get())); + groupEntry = _groupEntry; + + if (groupEntry.present()) { + ASSERT(groupEntry.get().assignedCluster.present()); + if (self->preferAssignedCluster && + groupEntry.get().assignedCluster.get() != self->tenantEntry.assignedCluster.get()) { + TraceEvent("MetaclusterCreateTenantGroupClusterMismatch") + .detail("TenantGroupCluster", groupEntry.get().assignedCluster.get()) + .detail("SpecifiedCluster", self->tenantEntry.assignedCluster.get()); + throw invalid_tenant_configuration(); + } + return std::make_pair(groupEntry.get().assignedCluster.get(), true); + } + } + + state std::vector>> dataClusterDbs; + state std::vector dataClusterNames; + state std::vector> clusterAvailabilityChecks; + // Get a set of the most full clusters that still have capacity + // If preferred cluster is specified, look for that 
one. + if (self->preferAssignedCluster) { + DataClusterMetadata dataClusterMetadata = + wait(getClusterTransaction(tr, self->tenantEntry.assignedCluster.get())); + if (!dataClusterMetadata.entry.hasCapacity()) { + throw cluster_no_capacity(); + } + dataClusterNames.push_back(self->tenantEntry.assignedCluster.get()); + } else { + state KeyBackedSet::RangeResultType availableClusters = + wait(ManagementClusterMetadata::clusterCapacityIndex.getRange( + tr, + {}, + {}, + CLIENT_KNOBS->METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK, + Snapshot::False, + Reverse::True)); + if (availableClusters.results.empty()) { + throw metacluster_no_capacity(); + } + for (auto clusterTuple : availableClusters.results) { + dataClusterNames.push_back(clusterTuple.getString(1)); + } + } + for (auto dataClusterName : dataClusterNames) { + dataClusterDbs.push_back(getAndOpenDatabase(tr, dataClusterName)); + } + wait(waitForAll(dataClusterDbs)); + // Check the availability of our set of clusters + for (int i = 0; i < dataClusterDbs.size(); ++i) { + clusterAvailabilityChecks.push_back(checkClusterAvailability(dataClusterDbs[i].get(), dataClusterNames[i])); + } + + // Wait for a successful availability check from some cluster. We prefer the most full cluster, but if it + // doesn't return quickly we may choose another. + Optional clusterAvailabilityCheck = wait(timeout( + success(clusterAvailabilityChecks[0]) || (delay(CLIENT_KNOBS->METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY) && + waitForAny(clusterAvailabilityChecks)), + CLIENT_KNOBS->METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT)); + + if (!clusterAvailabilityCheck.present()) { + // If no clusters were available for long enough, then we throw an error and try again + throw transaction_too_old(); + } + + // Get the first cluster that was available + state Optional chosenCluster; + for (auto f : clusterAvailabilityChecks) { + if (f.isReady()) { + chosenCluster = f.get(); + break; + } + } + + ASSERT(chosenCluster.present()); + return std::make_pair(chosenCluster.get(), false); + } + + ACTOR static Future assignTenantAndStoreInManagementCluster(CreateTenantImpl* self, + Reference tr) { + // If the tenant already exists, we either throw an error from this function or move on to the next phase + bool tenantExists = wait(checkForExistingTenant(self, tr)); + if (tenantExists) { + return Void(); + } + + // Choose a cluster for the tenant + state std::pair assignment = wait(assignTenant(self, tr)); + self->tenantEntry.assignedCluster = assignment.first; + + // Update the context with the chosen cluster + state Future setClusterFuture = self->ctx.setCluster(tr, assignment.first); + + // Create a tenant entry in the management cluster + Optional lastId = wait(ManagementClusterMetadata::tenantMetadata().lastTenantId.get(tr)); + self->tenantEntry.setId(lastId.orDefault(-1) + 1); + ManagementClusterMetadata::tenantMetadata().lastTenantId.set(tr, self->tenantEntry.id); + + self->tenantEntry.tenantState = TenantState::REGISTERING; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->tenantEntry); + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantEntry.id, self->tenantName); + + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, self->tenantEntry.assignedCluster.get(), 1, MutationRef::AddValue); + + int64_t clusterTenantCount = wait(ManagementClusterMetadata::clusterTenantCount.getD( + tr, 
self->tenantEntry.assignedCluster.get(), Snapshot::False, 0)); + + if (clusterTenantCount > CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER) { + throw cluster_no_capacity(); + } + + // Updated indexes to include the new tenant + ManagementClusterMetadata::clusterTenantIndex.insert( + tr, Tuple::makeTuple(self->tenantEntry.assignedCluster.get(), self->tenantName, self->tenantEntry.id)); + + wait(setClusterFuture); + + // If we are part of a tenant group that is assigned to a cluster being removed from the metacluster, + // then we fail with an error. + if (self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING) { + throw cluster_removed(); + } + + managementClusterAddTenantToGroup( + tr, self->tenantName, self->tenantEntry, &self->ctx.dataClusterMetadata.get(), assignment.second); + + return Void(); + } + + ACTOR static Future storeTenantInDataCluster(CreateTenantImpl* self, Reference tr) { + std::pair, bool> dataClusterTenant = wait( + TenantAPI::createTenantTransaction(tr, self->tenantName, self->tenantEntry, ClusterType::METACLUSTER_DATA)); + + // If the tenant map entry is empty, then we encountered a tombstone indicating that the tenant was + // simultaneously removed. + if (!dataClusterTenant.first.present()) { + throw tenant_removed(); + } + + return Void(); + } + + ACTOR static Future markTenantReady(CreateTenantImpl* self, Reference tr) { + state Optional managementEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + if (!managementEntry.present()) { + throw tenant_removed(); + } else if (managementEntry.get().id != self->tenantEntry.id) { + throw tenant_already_exists(); + } + + if (managementEntry.get().tenantState == TenantState::REGISTERING) { + TenantMapEntry updatedEntry = managementEntry.get(); + updatedEntry.tenantState = TenantState::READY; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry); + } + + return Void(); + } + + ACTOR static Future run(CreateTenantImpl* self) { + if (self->tenantName.startsWith("\xff"_sr)) { + throw invalid_tenant_name(); + } + + loop { + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return assignTenantAndStoreInManagementCluster(self, tr); + })); + + self->replaceExistingTenantId = {}; + try { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return storeTenantInDataCluster(self, tr); })); + + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markTenantReady(self, tr); })); + + return Void(); + } catch (Error& e) { + if (e.code() == error_code_tenant_creation_permanently_failed) { + // If the data cluster has permanently failed to create the tenant, then we can reassign it in + // the management cluster and start over + self->replaceExistingTenantId = self->tenantEntry.id; + self->ctx.clearCluster(); + } else { + throw; + } + } + } + } + Future run() { return run(this); } +}; + +ACTOR template +Future createTenant(Reference db, TenantName name, TenantMapEntry tenantEntry) { + state CreateTenantImpl impl(db, tenantEntry.assignedCluster.present(), name, tenantEntry); + wait(impl.run()); + return Void(); +} + +template +struct DeleteTenantImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + TenantName tenantName; + + // Parameters set in getAssignedLocation + int64_t tenantId; + + // Parameters set in markTenantInRemovingState + Optional pairName; + + DeleteTenantImpl(Reference managementDb, TenantName tenantName) : ctx(managementDb), tenantName(tenantName) {} + + // 
Loads the cluster details for the cluster where the tenant is assigned. + // Returns true if the deletion is already in progress + ACTOR static Future getAssignedLocation(DeleteTenantImpl* self, Reference tr) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present()) { + throw tenant_not_found(); + } + + // Disallow removing the "new" name of a renamed tenant before it completes + if (tenantEntry.get().tenantState == TenantState::RENAMING_TO) { + throw tenant_not_found(); + } + + if (tenantEntry.get().tenantState == TenantState::REMOVING) { + if (tenantEntry.get().renamePair.present()) { + self->pairName = tenantEntry.get().renamePair.get(); + } + } + + self->tenantId = tenantEntry.get().id; + wait(self->ctx.setCluster(tr, tenantEntry.get().assignedCluster.get())); + return tenantEntry.get().tenantState == TenantState::REMOVING; + } + + // Does an initial check if the tenant is empty. This is an optimization to prevent us marking a tenant + // in the deleted state while it has data, but it is still possible that data gets added to it after this + // point. + // + // SOMEDAY: should this also lock the tenant when locking is supported? + ACTOR static Future checkTenantEmpty(DeleteTenantImpl* self, Reference tr) { + state Optional tenantEntry = wait(TenantAPI::tryGetTenantTransaction(tr, self->tenantName)); + if (!tenantEntry.present() || tenantEntry.get().id != self->tenantId) { + // The tenant must have been removed simultaneously + return Void(); + } + + ThreadFuture rangeFuture = tr->getRange(prefixRange(tenantEntry.get().prefix), 1); + RangeResult result = wait(safeThreadFutureToFuture(rangeFuture)); + if (!result.empty()) { + throw tenant_not_empty(); + } + + return Void(); + } + + // Mark the tenant as being in a removing state on the management cluster + ACTOR static Future markTenantInRemovingState(DeleteTenantImpl* self, + Reference tr) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present() || tenantEntry.get().id != self->tenantId) { + throw tenant_not_found(); + } + + if (tenantEntry.get().tenantState != TenantState::REMOVING) { + // Disallow removing the "new" name of a renamed tenant before it completes + if (tenantEntry.get().tenantState == TenantState::RENAMING_TO) { + throw tenant_not_found(); + } + state TenantMapEntry updatedEntry = tenantEntry.get(); + // Check if we are deleting a tenant in the middle of a rename + if (updatedEntry.renamePair.present()) { + ASSERT(updatedEntry.tenantState == TenantState::RENAMING_FROM); + self->pairName = updatedEntry.renamePair.get(); + } + updatedEntry.tenantState = TenantState::REMOVING; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry); + // If this has a rename pair, also mark the other entry for deletion + if (self->pairName.present()) { + state Optional pairEntry = wait(tryGetTenantTransaction(tr, self->pairName.get())); + TenantMapEntry updatedPairEntry = pairEntry.get(); + // Sanity check that our pair has us named as their partner + ASSERT(updatedPairEntry.renamePair.present()); + ASSERT(updatedPairEntry.renamePair.get() == self->tenantName); + ASSERT(updatedPairEntry.id == self->tenantId); + CODE_PROBE(true, "marking pair tenant in removing state"); + updatedPairEntry.tenantState = TenantState::REMOVING; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->pairName.get(), updatedPairEntry); + } + } + + return Void(); + } + + // Delete the tenant 
and related metadata on the management cluster + ACTOR static Future deleteTenantFromManagementCluster(DeleteTenantImpl* self, + Reference tr, + bool pairDelete = false) { + // If pair is present, and this is not already a pair delete, call this function recursively + state Future pairFuture = Void(); + if (!pairDelete && self->pairName.present()) { + CODE_PROBE(true, "deleting pair tenant from management cluster"); + pairFuture = deleteTenantFromManagementCluster(self, tr, true); + } + state TenantName tenantName = pairDelete ? self->pairName.get() : self->tenantName; + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, tenantName)); + + if (!tenantEntry.present() || tenantEntry.get().id != self->tenantId) { + return Void(); + } + + ASSERT(tenantEntry.get().tenantState == TenantState::REMOVING); + + // Erase the tenant entry itself + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, tenantName); + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, tenantEntry.get().id); + + // This is idempotent because this function is only called if the tenant is in the map + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, tenantEntry.get().assignedCluster.get(), -1, MutationRef::AddValue); + + // Remove the tenant from the cluster -> tenant index + ManagementClusterMetadata::clusterTenantIndex.erase( + tr, Tuple::makeTuple(tenantEntry.get().assignedCluster.get(), tenantName, self->tenantId)); + + // Remove the tenant from its tenant group + wait(managementClusterRemoveTenantFromGroup( + tr, tenantName, tenantEntry.get(), &self->ctx.dataClusterMetadata.get(), pairDelete)); + + wait(pairFuture); + return Void(); + } + + ACTOR static Future run(DeleteTenantImpl* self) { + // Get information about the tenant and where it is assigned + bool deletionInProgress = wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return getAssignedLocation(self, tr); })); + + if (!deletionInProgress) { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return checkTenantEmpty(self, tr); })); + + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return markTenantInRemovingState(self, tr); + })); + } + + // Delete tenant on the data cluster + wait(self->ctx.runDataClusterTransaction([self = self](Reference tr) { + // If the removed tenant is being renamed, attempt to delete both the old and new names. + // At most one should be present with the given ID, and the other will be a no-op. 
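// An equivalent way to read the logic that follows (oldName/newName/id are illustrative
// shorthand for self->pairName.get(), self->tenantName and self->tenantId; this sketch is
// not part of the patch): both deletions are issued in the same transaction and keyed on
// the same tenant ID, and in flow `a && b` yields a future that completes once both have
// completed, so whichever name no longer matches the ID simply resolves as a no-op:
//   Future<Void> oldDelete = TenantAPI::deleteTenantTransaction(tr, oldName, id, ClusterType::METACLUSTER_DATA);
//   Future<Void> newDelete = TenantAPI::deleteTenantTransaction(tr, newName, id, ClusterType::METACLUSTER_DATA);
//   return oldDelete && newDelete;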
+ Future pairDelete = Void(); + if (self->pairName.present()) { + CODE_PROBE(true, "deleting pair tenant from data cluster"); + pairDelete = TenantAPI::deleteTenantTransaction( + tr, self->pairName.get(), self->tenantId, ClusterType::METACLUSTER_DATA); + } + return pairDelete && TenantAPI::deleteTenantTransaction( + tr, self->tenantName, self->tenantId, ClusterType::METACLUSTER_DATA); + })); + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return deleteTenantFromManagementCluster(self, tr); + })); + + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future deleteTenant(Reference db, TenantName name) { + state DeleteTenantImpl impl(db, name); + wait(impl.run()); + return Void(); +} + +ACTOR template +Future>> listTenantsTransaction(Transaction tr, + TenantNameRef begin, + TenantNameRef end, + int limit) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + KeyBackedRangeResult> results = + wait(ManagementClusterMetadata::tenantMetadata().tenantMap.getRange(tr, begin, end, limit)); + + return results.results; +} + +ACTOR template +Future>> listTenants(Reference db, + TenantName begin, + TenantName end, + int limit) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + std::vector> tenants = + wait(listTenantsTransaction(tr, begin, end, limit)); + return tenants; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +template +struct ConfigureTenantImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + TenantName tenantName; + std::map, Optional> configurationParameters; + + // Parameters set in updateManagementCluster + TenantMapEntry updatedEntry; + + ConfigureTenantImpl(Reference managementDb, + TenantName tenantName, + std::map, Optional> configurationParameters) + : ctx(managementDb), tenantName(tenantName), configurationParameters(configurationParameters) {} + + // This verifies that the tenant group can be changed, and if so it updates all of the tenant group data + // structures. It does not update the TenantMapEntry stored in the tenant map. 
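// As a reading aid, the cases handled in this function can be summarized as follows
// (derived from the code below, shown here only for orientation):
//   desired group == current group            -> no-op
//   desired group cleared (no group)          -> allowed if the current cluster has spare group capacity
//   desired group does not exist yet          -> allowed if the current cluster has spare group capacity
//   desired group exists on the same cluster  -> always allowed
//   desired group exists on another cluster   -> invalid_tenant_configuration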
+ ACTOR static Future updateTenantGroup(ConfigureTenantImpl* self, + Reference tr, + TenantMapEntry tenantEntry, + Optional desiredGroup) { + + state TenantMapEntry entryWithUpdatedGroup = tenantEntry; + entryWithUpdatedGroup.tenantGroup = desiredGroup; + + if (tenantEntry.tenantGroup == desiredGroup) { + return Void(); + } + + // Removing a tenant group is only possible if we have capacity for more groups on the current cluster + else if (!desiredGroup.present()) { + if (!self->ctx.dataClusterMetadata.get().entry.hasCapacity()) { + throw cluster_no_capacity(); + } + + wait(managementClusterRemoveTenantFromGroup( + tr, self->tenantName, tenantEntry, &self->ctx.dataClusterMetadata.get())); + managementClusterAddTenantToGroup( + tr, self->tenantName, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), false); + return Void(); + } + + state Optional tenantGroupEntry = + wait(ManagementClusterMetadata::tenantMetadata().tenantGroupMap.get(tr, desiredGroup.get())); + + // If we are creating a new tenant group, we need to have capacity on the current cluster + if (!tenantGroupEntry.present()) { + if (!self->ctx.dataClusterMetadata.get().entry.hasCapacity()) { + throw cluster_no_capacity(); + } + wait(managementClusterRemoveTenantFromGroup( + tr, self->tenantName, tenantEntry, &self->ctx.dataClusterMetadata.get())); + managementClusterAddTenantToGroup( + tr, self->tenantName, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), false); + return Void(); + } + + // Moves between groups in the same cluster are freely allowed + else if (tenantGroupEntry.get().assignedCluster == tenantEntry.assignedCluster) { + wait(managementClusterRemoveTenantFromGroup( + tr, self->tenantName, tenantEntry, &self->ctx.dataClusterMetadata.get())); + managementClusterAddTenantToGroup( + tr, self->tenantName, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), true); + return Void(); + } + + // We don't currently support movement between groups on different clusters + else { + TraceEvent("TenantGroupChangeToDifferentCluster") + .detail("Tenant", self->tenantName) + .detail("OriginalGroup", tenantEntry.tenantGroup) + .detail("DesiredGroup", desiredGroup) + .detail("TenantAssignedCluster", tenantEntry.assignedCluster) + .detail("DesiredGroupAssignedCluster", tenantGroupEntry.get().assignedCluster); + + throw invalid_tenant_configuration(); + } + } + + // Updates the configuration in the management cluster and marks it as being in the UPDATING_CONFIGURATION state + ACTOR static Future updateManagementCluster(ConfigureTenantImpl* self, + Reference tr) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present()) { + throw tenant_not_found(); + } + + if (tenantEntry.get().tenantState != TenantState::READY && + tenantEntry.get().tenantState != TenantState::UPDATING_CONFIGURATION) { + throw invalid_tenant_state(); + } + + wait(self->ctx.setCluster(tr, tenantEntry.get().assignedCluster.get())); + + self->updatedEntry = tenantEntry.get(); + self->updatedEntry.tenantState = TenantState::UPDATING_CONFIGURATION; + + state std::map, Optional>::iterator configItr; + for (configItr = self->configurationParameters.begin(); configItr != self->configurationParameters.end(); + ++configItr) { + if (configItr->first == "tenant_group"_sr) { + wait(updateTenantGroup(self, tr, self->updatedEntry, configItr->second)); + } + self->updatedEntry.configure(configItr->first, configItr->second); + } + + ++self->updatedEntry.configurationSequenceNum; + 
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->updatedEntry); + + return Void(); + } + + // Updates the configuration in the data cluster + ACTOR static Future updateDataCluster(ConfigureTenantImpl* self, Reference tr) { + state Optional tenantEntry = wait(TenantAPI::tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present() || tenantEntry.get().id != self->updatedEntry.id || + tenantEntry.get().configurationSequenceNum >= self->updatedEntry.configurationSequenceNum) { + // If the tenant isn't in the metacluster, it must have been concurrently removed + return Void(); + } + + TenantMapEntry dataClusterEntry = self->updatedEntry; + dataClusterEntry.tenantState = TenantState::READY; + dataClusterEntry.assignedCluster = {}; + + wait(TenantAPI::configureTenantTransaction(tr, self->tenantName, tenantEntry.get(), dataClusterEntry)); + return Void(); + } + + // Updates the tenant state in the management cluster to READY + ACTOR static Future markManagementTenantAsReady(ConfigureTenantImpl* self, + Reference tr) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present() || tenantEntry.get().id != self->updatedEntry.id || + tenantEntry.get().tenantState != TenantState::UPDATING_CONFIGURATION || + tenantEntry.get().configurationSequenceNum > self->updatedEntry.configurationSequenceNum) { + return Void(); + } + + tenantEntry.get().tenantState = TenantState::READY; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, tenantEntry.get()); + return Void(); + } + + ACTOR static Future run(ConfigureTenantImpl* self) { + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return updateManagementCluster(self, tr); })); + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return updateDataCluster(self, tr); })); + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markManagementTenantAsReady(self, tr); })); + + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future configureTenant(Reference db, + TenantName name, + std::map, Optional> configurationParameters) { + state ConfigureTenantImpl impl(db, name, configurationParameters); + wait(impl.run()); + return Void(); +} + +template +struct RenameTenantImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + TenantName oldName; + TenantName newName; + + // Parameters set in markTenantsInRenamingState + int64_t tenantId = -1; + int64_t configurationSequenceNum = -1; + + RenameTenantImpl(Reference managementDb, TenantName oldName, TenantName newName) + : ctx(managementDb), oldName(oldName), newName(newName) {} + + // Delete the tenant and related metadata on the management cluster + ACTOR static Future deleteTenantFromManagementCluster(RenameTenantImpl* self, + Reference tr, + TenantMapEntry tenantEntry) { + // Erase the tenant entry itself + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, self->oldName); + + // Remove old tenant from tenant count + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, tenantEntry.assignedCluster.get(), -1, MutationRef::AddValue); + + // Clean up cluster based tenant indices and remove the old entry from its tenant group + // Remove the tenant from the cluster -> tenant index + ManagementClusterMetadata::clusterTenantIndex.erase( + tr, 
Tuple::makeTuple(tenantEntry.assignedCluster.get(), self->oldName, self->tenantId)); + + // Remove the tenant from its tenant group + wait(managementClusterRemoveTenantFromGroup( + tr, self->oldName, tenantEntry, &self->ctx.dataClusterMetadata.get(), true)); + + return Void(); + } + + ACTOR static Future markTenantsInRenamingState(RenameTenantImpl* self, + Reference tr) { + state TenantMapEntry oldTenantEntry; + state Optional newTenantEntry; + wait(store(oldTenantEntry, getTenantTransaction(tr, self->oldName)) && + store(newTenantEntry, tryGetTenantTransaction(tr, self->newName))); + + if (self->tenantId != -1 && oldTenantEntry.id != self->tenantId) { + // The tenant must have been removed simultaneously + CODE_PROBE(true, "Metacluster rename old tenant ID mismatch"); + throw tenant_removed(); + } + + // If marked for deletion, abort the rename + if (oldTenantEntry.tenantState == TenantState::REMOVING) { + CODE_PROBE(true, "Metacluster rename candidates marked for deletion"); + throw tenant_removed(); + } + + // If the new entry is present, we can only continue if this is a retry of the same rename + // To check this, verify both entries are in the correct state + // and have each other as pairs + if (newTenantEntry.present()) { + if (newTenantEntry.get().tenantState == TenantState::RENAMING_TO && + oldTenantEntry.tenantState == TenantState::RENAMING_FROM && newTenantEntry.get().renamePair.present() && + newTenantEntry.get().renamePair.get() == self->oldName && oldTenantEntry.renamePair.present() && + oldTenantEntry.renamePair.get() == self->newName) { + wait(self->ctx.setCluster(tr, oldTenantEntry.assignedCluster.get())); + self->tenantId = newTenantEntry.get().id; + self->configurationSequenceNum = newTenantEntry.get().configurationSequenceNum; + CODE_PROBE(true, "Metacluster rename retry in progress"); + return Void(); + } else { + CODE_PROBE(true, "Metacluster rename new name already exists"); + throw tenant_already_exists(); + }; + } else { + if (self->tenantId == -1) { + self->tenantId = oldTenantEntry.id; + } + ++oldTenantEntry.configurationSequenceNum; + self->configurationSequenceNum = oldTenantEntry.configurationSequenceNum; + wait(self->ctx.setCluster(tr, oldTenantEntry.assignedCluster.get())); + if (oldTenantEntry.tenantState != TenantState::READY) { + CODE_PROBE(true, "Metacluster unable to proceed with rename operation"); + throw invalid_tenant_state(); + } + } + + // Check cluster capacity. If we would exceed the amount due to temporary extra tenants + // then we deny the rename request altogether. 
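// A rename briefly counts as two tenants (the old and the new name) until it completes,
// so the check below reserves room for one extra entry. For example, assuming a purely
// illustrative MAX_TENANTS_PER_CLUSTER of 1000: a cluster already holding 1000 tenants
// fails here with cluster_no_capacity, while one holding 999 can proceed.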
+ int64_t clusterTenantCount = wait(ManagementClusterMetadata::clusterTenantCount.getD( + tr, oldTenantEntry.assignedCluster.get(), Snapshot::False, 0)); + + if (clusterTenantCount + 1 > CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER) { + throw cluster_no_capacity(); + } + + TenantMapEntry updatedOldEntry = oldTenantEntry; + TenantMapEntry updatedNewEntry(updatedOldEntry); + ASSERT(updatedOldEntry.configurationSequenceNum == self->configurationSequenceNum); + ASSERT(updatedNewEntry.configurationSequenceNum == self->configurationSequenceNum); + updatedOldEntry.tenantState = TenantState::RENAMING_FROM; + updatedNewEntry.tenantState = TenantState::RENAMING_TO; + updatedOldEntry.renamePair = self->newName; + updatedNewEntry.renamePair = self->oldName; + + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->oldName, updatedOldEntry); + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry); + + // Add temporary tenant to tenantCount to prevent exceeding capacity during a rename + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, updatedNewEntry.assignedCluster.get(), 1, MutationRef::AddValue); + + // Updated indexes to include the new tenant + ManagementClusterMetadata::clusterTenantIndex.insert( + tr, Tuple::makeTuple(updatedNewEntry.assignedCluster.get(), self->newName, self->tenantId)); + + // Add new name to tenant group. It should already exist since the old name was part of it. + managementClusterAddTenantToGroup( + tr, self->newName, updatedNewEntry, &self->ctx.dataClusterMetadata.get(), true); + return Void(); + } + + ACTOR static Future updateDataCluster(RenameTenantImpl* self, Reference tr) { + ASSERT(self->tenantId != -1); + ASSERT(self->configurationSequenceNum != -1); + wait(TenantAPI::renameTenantTransaction(tr, + self->oldName, + self->newName, + self->tenantId, + ClusterType::METACLUSTER_DATA, + self->configurationSequenceNum)); + return Void(); + } + + ACTOR static Future finishRenameFromManagementCluster(RenameTenantImpl* self, + Reference tr) { + state Optional oldTenantEntry; + state Optional newTenantEntry; + wait(store(oldTenantEntry, tryGetTenantTransaction(tr, self->oldName)) && + store(newTenantEntry, tryGetTenantTransaction(tr, self->newName))); + + // Another (or several other) operations have already removed/changed the old entry + // Possible for the new entry to also have been tampered with, + // so it may or may not be present with or without the same id, which are all + // legal states. 
Assume the rename completed properly in this case + if (!oldTenantEntry.present() || oldTenantEntry.get().id != self->tenantId || + oldTenantEntry.get().configurationSequenceNum > self->configurationSequenceNum) { + CODE_PROBE(true, + "Metacluster finished rename with missing entries, mismatched id, and/or mismatched " + "configuration sequence."); + return Void(); + } + if (oldTenantEntry.get().tenantState == TenantState::REMOVING) { + ASSERT(newTenantEntry.get().tenantState == TenantState::REMOVING); + throw tenant_removed(); + } + ASSERT(newTenantEntry.present()); + ASSERT(newTenantEntry.get().id == self->tenantId); + + TenantMapEntry updatedOldEntry = oldTenantEntry.get(); + TenantMapEntry updatedNewEntry = newTenantEntry.get(); + + // Only update if in the expected state + if (updatedNewEntry.tenantState == TenantState::RENAMING_TO) { + updatedNewEntry.tenantState = TenantState::READY; + updatedNewEntry.renamePair.reset(); + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry); + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantId, self->newName); + } + + // We will remove the old entry from the management cluster + // This should still be the same old entry since the tenantId matches from the check above. + wait(deleteTenantFromManagementCluster(self, tr, updatedOldEntry)); + return Void(); + } + + ACTOR static Future run(RenameTenantImpl* self) { + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markTenantsInRenamingState(self, tr); })); + + // Rename tenant on the data cluster + try { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return updateDataCluster(self, tr); })); + } catch (Error& e) { + // Since we track the tenant entries on the management cluster, these error codes should only appear + // on a retry of the transaction, typically caused by commit_unknown_result. + // Operating on the assumption that the first transaction completed successfully, we keep going + // so we can finish the rename on the management cluster. 
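// Concretely: if the first attempt actually committed but the client only observed
// commit_unknown_result, the retried data-cluster transaction will find the old name
// already gone (tenant_not_found) or the new name already present (tenant_already_exists).
// Both outcomes are treated as evidence that the rename was applied, so the error is
// swallowed below rather than rethrown.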
+ if (e.code() == error_code_tenant_not_found || e.code() == error_code_tenant_already_exists) { + CODE_PROBE(true, "Metacluster rename ran into commit_unknown_result"); + } else { + throw e; + } + } + + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return finishRenameFromManagementCluster(self, tr); + })); + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future renameTenant(Reference db, TenantName oldName, TenantName newName) { + state RenameTenantImpl impl(db, oldName, newName); + wait(impl.run()); + return Void(); +} + +template +Future> tryGetTenantGroupTransaction(Transaction tr, TenantGroupName name) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + return ManagementClusterMetadata::tenantMetadata().tenantGroupMap.get(tr, name); +} + +ACTOR template +Future> tryGetTenantGroup(Reference db, TenantGroupName name) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + Optional entry = wait(tryGetTenantGroupTransaction(tr, name)); + return entry; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +ACTOR template +Future>> listTenantGroupsTransaction(Transaction tr, + TenantGroupName begin, + TenantGroupName end, + int limit) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + KeyBackedRangeResult> results = + wait(ManagementClusterMetadata::tenantMetadata().tenantGroupMap.getRange(tr, begin, end, limit)); + + return results.results; +} + +ACTOR template +Future>> listTenantGroups(Reference db, + TenantGroupName begin, + TenantGroupName end, + int limit) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + std::vector> tenantGroups = + wait(listTenantGroupsTransaction(tr, begin, end, limit)); + return tenantGroups; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +} // namespace MetaclusterAPI + +#include "flow/unactorcompiler.h" +#endif \ No newline at end of file diff --git a/fdbclient/include/fdbclient/MultiVersionAssignmentVars.h b/fdbclient/include/fdbclient/MultiVersionAssignmentVars.h index 265ce0b3fe..888c8ec01a 100644 --- a/fdbclient/include/fdbclient/MultiVersionAssignmentVars.h +++ b/fdbclient/include/fdbclient/MultiVersionAssignmentVars.h @@ -28,16 +28,24 @@ template class AbortableSingleAssignmentVar final : public ThreadSingleAssignmentVar, public ThreadCallback { public: AbortableSingleAssignmentVar(ThreadFuture future, ThreadFuture abortSignal) - : future(future), abortSignal(abortSignal), hasBeenSet(false), callbacksCleared(false) { + : future(future), abortSignal(abortSignal), hasBeenSet(false), callbacksCleared(true) { int userParam; ThreadSingleAssignmentVar::addref(); ThreadSingleAssignmentVar::addref(); - // abortSignal comes first, because otherwise future could immediately call fire/error and attempt to remove - // this callback from abortSignal prematurely abortSignal.callOrSetAsCallback(this, userParam, 0); future.callOrSetAsCallback(this, userParam, 0); + + // One of the signals could be already fired + // Make sure that the other is cancelled, and the references removed + lock.enter(); + callbacksCleared = false; + bool hasBeenSet_ = hasBeenSet; + lock.leave(); + if (hasBeenSet_) { + cancelCallbacks(); + } } void cancel() override { @@ -104,12 +112,28 @@ 
private: callbacksCleared = true; lock.leave(); - future.getPtr()->addref(); // Cancel will delref our future, but we don't want to destroy it until this - // callback gets destroyed - future.getPtr()->cancel(); + bool notificationRequired = true; + + if (future.clearCallback(this)) { + ThreadSingleAssignmentVar::delref(); + } else { + notificationRequired = false; + future.getPtr()->addref(); // Cancel will delref our future, but we don't want to destroy it until this + // callback gets destroyed + future.getPtr()->cancel(); + } if (abortSignal.clearCallback(this)) { ThreadSingleAssignmentVar::delref(); + } else { + notificationRequired = false; + } + + if (notificationRequired) { + // The future has been cancelled before any of the signals were + // fired. Notify the futures about the cancellation + ASSERT(!hasBeenSet); + ThreadSingleAssignmentVar::sendError(operation_cancelled()); } } else { lock.leave(); diff --git a/fdbclient/include/fdbclient/MultiVersionTransaction.h b/fdbclient/include/fdbclient/MultiVersionTransaction.h index 4a59872c23..f74026d0ac 100644 --- a/fdbclient/include/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/include/fdbclient/MultiVersionTransaction.h @@ -26,6 +26,7 @@ #include "fdbclient/FDBOptions.g.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/IClientApi.h" +#include "flow/ApiVersion.h" #include "flow/ProtocolVersion.h" #include "flow/ThreadHelper.actor.h" @@ -89,6 +90,14 @@ struct FdbCApi : public ThreadSafeReferenceCounted { const void* endKey; int endKeyLength; } FDBKeyRange; + + typedef struct granulesummary { + FDBKeyRange key_range; + int64_t snapshot_version; + int64_t snapshot_size; + int64_t delta_version; + int64_t delta_size; + } FDBGranuleSummary; #pragma pack(pop) typedef struct readgranulecontext { @@ -122,6 +131,8 @@ struct FdbCApi : public ThreadSafeReferenceCounted { // Network fdb_error_t (*selectApiVersion)(int runtimeVersion, int headerVersion); const char* (*getClientVersion)(); + void (*useFutureProtocolVersion)(); + fdb_error_t (*setNetworkOption)(FDBNetworkOption option, uint8_t const* value, int valueLength); fdb_error_t (*setupNetwork)(); fdb_error_t (*runNetwork)(); @@ -169,6 +180,32 @@ struct FdbCApi : public ThreadSafeReferenceCounted { uint8_t const* purge_key_name, int purge_key_name_length); + FDBFuture* (*databaseBlobbifyRange)(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length); + + FDBFuture* (*databaseUnblobbifyRange)(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length); + + FDBFuture* (*databaseListBlobbifiedRanges)(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int rangeLimit); + + FDBFuture* (*databaseVerifyBlobRange)(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t version); + // Tenant fdb_error_t (*tenantCreateTransaction)(FDBTenant* tenant, FDBTransaction** outTransaction); @@ -270,20 +307,47 @@ struct FdbCApi : public ThreadSafeReferenceCounted { int end_key_name_length, int64_t chunkSize); - FDBFuture* (*transactionGetBlobGranuleRanges)(FDBTransaction* db, + FDBFuture* (*transactionGetBlobGranuleRanges)(FDBTransaction* tr, uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, - int end_key_name_length); + int 
end_key_name_length, + int rangeLimit); - FDBResult* (*transactionReadBlobGranules)(FDBTransaction* db, + FDBResult* (*transactionReadBlobGranules)(FDBTransaction* tr, uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, int end_key_name_length, int64_t beginVersion, - int64_t readVersion, - FDBReadBlobGranuleContext granule_context); + int64_t readVersion); + + FDBFuture* (*transactionReadBlobGranulesStart)(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + int64_t* readVersionOut); + + FDBResult* (*transactionReadBlobGranulesFinish)(FDBTransaction* tr, + FDBFuture* startFuture, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + FDBReadBlobGranuleContext* granule_context); + + FDBFuture* (*transactionSummarizeBlobGranules)(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t summaryVersion, + int rangeLimit); FDBFuture* (*transactionCommit)(FDBTransaction* tr); fdb_error_t (*transactionGetCommittedVersion)(FDBTransaction* tr, int64_t* outVersion); @@ -304,7 +368,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted { fdb_error_t (*futureGetDatabase)(FDBFuture* f, FDBDatabase** outDb); fdb_error_t (*futureGetInt64)(FDBFuture* f, int64_t* outValue); fdb_error_t (*futureGetUInt64)(FDBFuture* f, uint64_t* outValue); - fdb_error_t (*futureGetBool)(FDBFuture* f, bool* outValue); + fdb_error_t (*futureGetBool)(FDBFuture* f, fdb_bool_t* outValue); fdb_error_t (*futureGetError)(FDBFuture* f); fdb_error_t (*futureGetKey)(FDBFuture* f, uint8_t const** outKey, int* outKeyLength); fdb_error_t (*futureGetValue)(FDBFuture* f, fdb_bool_t* outPresent, uint8_t const** outValue, int* outValueLength); @@ -316,6 +380,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted { FDBMappedKeyValue const** outKVM, int* outCount, fdb_bool_t* outMore); + fdb_error_t (*futureGetGranuleSummaryArray)(FDBFuture* f, const FDBGranuleSummary** out_summaries, int* outCount); fdb_error_t (*futureGetSharedState)(FDBFuture* f, DatabaseSharedState** outPtr); fdb_error_t (*futureSetCallback)(FDBFuture* f, FDBCallback callback, void* callback_parameter); void (*futureCancel)(FDBFuture* f); @@ -374,13 +439,30 @@ public: ThreadFuture getEstimatedRangeSizeBytes(const KeyRangeRef& keys) override; ThreadFuture>> getRangeSplitPoints(const KeyRangeRef& range, int64_t chunkSize) override; - ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange) override; + ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; ThreadResult readBlobGranules(const KeyRangeRef& keyRange, Version beginVersion, Optional readVersion, ReadBlobGranuleContext granule_context) override; + ThreadFuture>> readBlobGranulesStart(const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) override; + + ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) override; + + ThreadFuture>> summarizeBlobGranules(const KeyRangeRef& keyRange, + Optional summaryVersion, + int rangeLimit) override; + void addReadConflictRange(const KeyRangeRef& keys) override; void atomicOp(const 
KeyRef& key, const ValueRef& value, uint32_t operationType) override; @@ -474,6 +556,12 @@ public: ThreadFuture purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override; ThreadFuture waitPurgeGranulesComplete(const KeyRef& purgeKey) override; + ThreadFuture blobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture unblobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture>> listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; + ThreadFuture verifyBlobRange(const KeyRangeRef& keyRange, Optional version) override; + ThreadFuture createSharedState() override; void setSharedState(DatabaseSharedState* p) override; @@ -492,6 +580,7 @@ public: void selectApiVersion(int apiVersion) override; const char* getClientVersion() override; + void useFutureProtocolVersion() override; void setNetworkOption(FDBNetworkOptions::Option option, Optional value = Optional()) override; void setupNetwork() override; @@ -571,13 +660,30 @@ public: ThreadFuture>> getRangeSplitPoints(const KeyRangeRef& range, int64_t chunkSize) override; - ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange) override; + ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; ThreadResult readBlobGranules(const KeyRangeRef& keyRange, Version beginVersion, Optional readVersion, ReadBlobGranuleContext granule_context) override; + ThreadFuture>> readBlobGranulesStart(const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) override; + + ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) override; + + ThreadFuture>> summarizeBlobGranules(const KeyRangeRef& keyRange, + Optional summaryVersion, + int rangeLimit) override; + void atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) override; void set(const KeyRef& key, const ValueRef& value) override; void clear(const KeyRef& begin, const KeyRef& end) override; @@ -643,6 +749,9 @@ private: template ThreadResult abortableTimeoutResult(ThreadFuture abortSignal); + template + ThreadResult abortableResult(ThreadResult result, ThreadFuture abortSignal); + TransactionInfo transaction; TransactionInfo getTransaction(); @@ -650,15 +759,15 @@ private: void setDefaultOptions(UniqueOrderedOptionList options); std::vector>>> persistentOptions; - - const Optional tenantName; }; struct ClientDesc { std::string const libPath; bool const external; + bool const useFutureVersion; - ClientDesc(std::string libPath, bool external) : libPath(libPath), external(external) {} + ClientDesc(std::string libPath, bool external, bool useFutureVersion) + : libPath(libPath), external(external), useFutureVersion(useFutureVersion) {} }; struct ClientInfo : ClientDesc, ThreadSafeReferenceCounted { @@ -667,17 +776,22 @@ struct ClientInfo : ClientDesc, ThreadSafeReferenceCounted { IClientApi* api; bool failed; std::atomic_bool initialized; + int threadIndex; std::vector> threadCompletionHooks; ClientInfo() - : ClientDesc(std::string(), false), protocolVersion(0), api(nullptr), failed(true), initialized(false) {} + : ClientDesc(std::string(), false, false), protocolVersion(0), api(nullptr), failed(true), initialized(false), + threadIndex(0) {} ClientInfo(IClientApi* api) - : ClientDesc("internal", false), protocolVersion(0), api(api), failed(false), initialized(false) {} - ClientInfo(IClientApi* api, 
std::string libPath) - : ClientDesc(libPath, true), protocolVersion(0), api(api), failed(false), initialized(false) {} + : ClientDesc("internal", false, false), protocolVersion(0), api(api), failed(false), initialized(false), + threadIndex(0) {} + ClientInfo(IClientApi* api, std::string libPath, bool useFutureVersion, int threadIndex) + : ClientDesc(libPath, true, useFutureVersion), protocolVersion(0), api(api), failed(false), initialized(false), + threadIndex(threadIndex) {} void loadVersion(); bool canReplace(Reference other) const; + std::string getTraceFileIdentifier(const std::string& baseIdentifier); }; class MultiVersionApi; @@ -814,6 +928,12 @@ public: ThreadFuture purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override; ThreadFuture waitPurgeGranulesComplete(const KeyRef& purgeKey) override; + ThreadFuture blobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture unblobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture>> listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; + ThreadFuture verifyBlobRange(const KeyRangeRef& keyRange, Optional version) override; + ThreadFuture createSharedState() override; void setSharedState(DatabaseSharedState* p) override; @@ -921,6 +1041,7 @@ class MultiVersionApi : public IClientApi { public: void selectApiVersion(int apiVersion) override; const char* getClientVersion() override; + void useFutureProtocolVersion() override; void setNetworkOption(FDBNetworkOptions::Option option, Optional value = Optional()) override; void setupNetwork() override; @@ -958,7 +1079,7 @@ public: }; std::map clusterSharedStateMap; - static bool apiVersionAtLeast(int minVersion); + ApiVersion getApiVersion() { return apiVersion; } private: MultiVersionApi(); @@ -967,7 +1088,7 @@ private: void disableMultiVersionClientApi(); void setCallbacksOnExternalThreads(); - void addExternalLibrary(std::string path); + void addExternalLibrary(std::string path, bool useFutureVersion); void addExternalLibraryDirectory(std::string path); // Return a vector of (pathname, unlink_on_close) pairs. Makes threadCount - 1 copies of the library stored in // path, and returns a vector of length threadCount. 
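// A minimal, self-contained sketch of the copy-per-thread idea described above, using
// hypothetical names (the real member function and its error handling live in
// MultiVersionApi and are not shown in this header). Each network thread gets its own
// copy of the external client library so it can load an independent instance:
#include <filesystem>
#include <string>
#include <utility>
#include <vector>

std::vector<std::pair<std::string, bool>> copyLibraryPerThread(const std::string& path, int threadCount) {
    namespace fs = std::filesystem;
    std::vector<std::pair<std::string, bool>> copies;
    copies.emplace_back(path, false); // thread 0 uses the original; never unlinked
    for (int i = 1; i < threadCount; ++i) {
        fs::path target = fs::temp_directory_path() / (fs::path(path).filename().string() + "." + std::to_string(i));
        fs::copy_file(path, target, fs::copy_options::overwrite_existing);
        copies.emplace_back(target.string(), true); // temporary copy; unlinked when the client shuts down
    }
    return copies;
}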
@@ -983,13 +1104,16 @@ private: bool networkStartSetup; volatile bool networkSetup; + bool disableBypass; volatile bool bypassMultiClientApi; volatile bool externalClient; - int apiVersion; + ApiVersion apiVersion; int nextThread = 0; int threadCount; std::string tmpDir; + bool traceShareBaseNameAmongThreads; + std::string traceFileIdentifier; Mutex lock; std::vector>>> options; diff --git a/fdbclient/include/fdbclient/NativeAPI.actor.h b/fdbclient/include/fdbclient/NativeAPI.actor.h index 3abab222bb..3931182ab0 100644 --- a/fdbclient/include/fdbclient/NativeAPI.actor.h +++ b/fdbclient/include/fdbclient/NativeAPI.actor.h @@ -82,8 +82,6 @@ struct NetworkOptions { class Database { public: - enum { API_VERSION_LATEST = -1 }; - // Creates a database object that represents a connection to a cluster // This constructor uses a preallocated DatabaseContext that may have been created // on another thread @@ -98,6 +96,9 @@ public: IsInternal internal = IsInternal::True, LocalityData const& clientLocality = LocalityData()); + static Database createSimulatedExtraDatabase(std::string connectionString, + Optional defaultTenant = Optional()); + Database() {} // an uninitialized database can be destructed or reassigned safely; that's it void operator=(Database const& rhs) { db = rhs.db; } Database(Database const& rhs) : db(rhs.db) {} @@ -235,18 +236,27 @@ struct Watch : public ReferenceCounted, NonCopyable { void setWatch(Future watchFuture); }; +FDB_DECLARE_BOOLEAN_PARAM(AllowInvalidTenantID); + struct TransactionState : ReferenceCounted { Database cx; - int64_t tenantId = TenantInfo::INVALID_TENANT; + Optional> authToken; Reference trLogInfo; TransactionOptions options; + Optional readOptions; - Optional debugID; TaskPriority taskID; SpanContext spanContext; UseProvisionalProxies useProvisionalProxies = UseProvisionalProxies::False; bool readVersionObtainedFromGrvProxy; + // Special flag to skip prepending tenant prefix to mutations and conflict ranges + // when a dummy, internal transaction gets commited. The sole purpose of commitDummyTransaction() is to + // resolve the state of earlier transaction that returned commit_unknown_result or request_maybe_delivered. + // Therefore, the dummy transaction can simply reuse one conflict range of the earlier commit, if it already has + // been prefixed. 
+ bool skipApplyTenantPrefix = false; + int numErrors = 0; double startTime = 0; Promise> versionstampPromise; @@ -270,13 +280,23 @@ struct TransactionState : ReferenceCounted { Reference trLogInfo); Reference cloneAndReset(Reference newTrLogInfo, bool generateNewSpan) const; - TenantInfo getTenantInfo(); + TenantInfo getTenantInfo(AllowInvalidTenantID allowInvalidId = AllowInvalidTenantID::False); Optional const& tenant(); bool hasTenant() const; + int64_t tenantId() const { return tenantId_; } + void trySetTenantId(int64_t tenantId) { + if (tenantId_ == TenantInfo::INVALID_TENANT) { + tenantId_ = tenantId; + } + } + + Future handleUnknownTenant(); + private: Optional tenant_; + int64_t tenantId_ = TenantInfo::INVALID_TENANT; bool tenantSet; }; @@ -346,19 +366,19 @@ private: public: // A method for streaming data from the storage server that is more efficient than getRange when reading large // amounts of data - [[nodiscard]] Future getRangeStream(const PromiseStream>& results, + [[nodiscard]] Future getRangeStream(PromiseStream>& results, const KeySelector& begin, const KeySelector& end, int limit, Snapshot = Snapshot::False, Reverse = Reverse::False); - [[nodiscard]] Future getRangeStream(const PromiseStream>& results, + [[nodiscard]] Future getRangeStream(PromiseStream>& results, const KeySelector& begin, const KeySelector& end, GetRangeLimits limits, Snapshot = Snapshot::False, Reverse = Reverse::False); - [[nodiscard]] Future getRangeStream(const PromiseStream>& results, + [[nodiscard]] Future getRangeStream(PromiseStream>& results, const KeyRange& keys, int limit, Snapshot snapshot = Snapshot::False, @@ -370,7 +390,7 @@ public: snapshot, reverse); } - [[nodiscard]] Future getRangeStream(const PromiseStream>& results, + [[nodiscard]] Future getRangeStream(PromiseStream>& results, const KeyRange& keys, GetRangeLimits limits, Snapshot snapshot = Snapshot::False, @@ -396,12 +416,18 @@ public: // The returned list would still be in form of [keys.begin, splitPoint1, splitPoint2, ... 
, keys.end] Future>> getRangeSplitPoints(KeyRange const& keys, int64_t chunkSize); - Future>> getBlobGranuleRanges(const KeyRange& range); + Future>> getBlobGranuleRanges(const KeyRange& range, int rangeLimit); Future>> readBlobGranules(const KeyRange& range, Version begin, Optional readVersion, Version* readVersionOut = nullptr); + Future>> summarizeBlobGranules(const KeyRange& range, + Optional summaryVersion, + int rangeLimit); + + void addGranuleMaterializeStats(const GranuleMaterializeStats& stats); + // If checkWriteConflictRanges is true, existing write conflict ranges will be searched for this key void set(const KeyRef& key, const ValueRef& value, AddConflictRange = AddConflictRange::True); void atomicOp(const KeyRef& key, @@ -436,7 +462,13 @@ public: void fullReset(); double getBackoff(int errCode); - void debugTransaction(UID dID) { trState->debugID = dID; } + void debugTransaction(UID dID) { + if (trState->readOptions.present()) { + trState->readOptions.get().debugID = dID; + } else { + trState->readOptions = ReadOptions(dID); + } + } VersionVector getVersionVector() const; SpanContext getSpanContext() const { return trState->spanContext; } @@ -531,8 +563,9 @@ ACTOR Future> getCheckpointMetaData(Database cx, // Checks with Data Distributor that it is safe to mark all servers in exclusions as failed ACTOR Future checkSafeExclusions(Database cx, std::vector exclusions); +// Round up to the nearest page size inline uint64_t getWriteOperationCost(uint64_t bytes) { - return bytes / std::max(1, CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR) + 1; + return (bytes - 1) / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR + 1; } // Create a transaction to set the value of system key \xff/conf/perpetual_storage_wiggle. If enable == true, the value diff --git a/fdbclient/include/fdbclient/PImpl.h b/fdbclient/include/fdbclient/PImpl.h index 424761efe5..992753fa06 100644 --- a/fdbclient/include/fdbclient/PImpl.h +++ b/fdbclient/include/fdbclient/PImpl.h @@ -39,4 +39,6 @@ public: T const& operator*() const { return *impl; } T* operator->() { return impl.get(); } T const* operator->() const { return impl.get(); } + T* get() { return impl.get(); } + T const* get() const { return impl.get(); } }; diff --git a/fdbclient/include/fdbclient/RESTClient.h b/fdbclient/include/fdbclient/RESTClient.h index ba9719a9cc..cb8754ad9d 100644 --- a/fdbclient/include/fdbclient/RESTClient.h +++ b/fdbclient/include/fdbclient/RESTClient.h @@ -25,7 +25,7 @@ #pragma once #include "fdbclient/JSONDoc.h" -#include "fdbclient/HTTP.h" +#include "fdbrpc/HTTP.h" #include "fdbclient/RESTUtils.h" #include "flow/Arena.h" #include "flow/FastRef.h" diff --git a/fdbclient/include/fdbclient/ReadYourWrites.h b/fdbclient/include/fdbclient/ReadYourWrites.h index 89de979bc1..659f7768ae 100644 --- a/fdbclient/include/fdbclient/ReadYourWrites.h +++ b/fdbclient/include/fdbclient/ReadYourWrites.h @@ -20,6 +20,7 @@ #ifndef FDBCLIENT_READYOURWRITES_H #define FDBCLIENT_READYOURWRITES_H +#include "fdbclient/Status.h" #pragma once #include "fdbclient/NativeAPI.actor.h" @@ -120,12 +121,17 @@ public: Future>> getRangeSplitPoints(const KeyRange& range, int64_t chunkSize) override; Future getEstimatedRangeSizeBytes(const KeyRange& keys) override; - Future>> getBlobGranuleRanges(const KeyRange& range) override; + Future>> getBlobGranuleRanges(const KeyRange& range, int rangeLimit) override; Future>> readBlobGranules(const KeyRange& range, Version begin, Optional readVersion, Version* readVersionOut) override; + Future>> summarizeBlobGranules(const KeyRange& range, + 
Optional summaryVersion, + int rangeLimit) override; + void addGranuleMaterializeStats(const GranuleMaterializeStats& stats) override; + void addReadConflictRange(KeyRangeRef const& keys) override; void makeSelfConflicting() override { tr.makeSelfConflicting(); } @@ -192,7 +198,17 @@ public: KeyRangeMap>>& getSpecialKeySpaceWriteMap() { return specialKeySpaceWriteMap; } bool readYourWritesDisabled() const { return options.readYourWritesDisabled; } const Optional& getSpecialKeySpaceErrorMsg() { return specialKeySpaceErrorMsg; } - void setSpecialKeySpaceErrorMsg(const std::string& msg) { specialKeySpaceErrorMsg = msg; } + void setSpecialKeySpaceErrorMsg(const std::string& msg) { + if (g_network && g_network->isSimulated()) { + try { + readJSONStrictly(msg); + } catch (Error& e) { + TraceEvent(SevError, "InvalidSpecialKeySpaceErrorMessage").error(e).detail("Message", msg); + ASSERT(false); + } + } + specialKeySpaceErrorMsg = msg; + } Transaction& getTransaction() { return tr; } Optional getTenant() { return tr.getTenant(); } diff --git a/fdbclient/include/fdbclient/S3BlobStore.h b/fdbclient/include/fdbclient/S3BlobStore.h index 8649060b09..a4eba100c9 100644 --- a/fdbclient/include/fdbclient/S3BlobStore.h +++ b/fdbclient/include/fdbclient/S3BlobStore.h @@ -26,7 +26,7 @@ #include "flow/Net2Packet.h" #include "fdbclient/Knobs.h" #include "flow/IRateControl.h" -#include "fdbclient/HTTP.h" +#include "fdbrpc/HTTP.h" #include "fdbclient/JSONDoc.h" // Representation of all the things you need to connect to a blob store instance with some credentials. diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index 1f4f6ee190..f4fc882ea3 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -50,7 +50,6 @@ public: bool PEEK_USING_STREAMING; double TLOG_TIMEOUT; // tlog OR commit proxy failure - master's reaction time double TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS; // Warns if a tlog takes too long to rejoin - double RECOVERY_TLOG_SMART_QUORUM_DELAY; // smaller might be better for bug amplification double TLOG_STORAGE_MIN_UPDATE_INTERVAL; double BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL; int DESIRED_TOTAL_BYTES; @@ -58,10 +57,6 @@ public: double UPDATE_DELAY; int MAXIMUM_PEEK_BYTES; int APPLY_MUTATION_BYTES; - int RECOVERY_DATA_BYTE_LIMIT; - int BUGGIFY_RECOVERY_DATA_LIMIT; - double LONG_TLOG_COMMIT_TIME; - int64_t LARGE_TLOG_COMMIT_BYTES; double BUGGIFY_RECOVER_MEMORY_LIMIT; double BUGGIFY_WORKER_REMOVED_MAX_LAG; int64_t UPDATE_STORAGE_BYTE_LIMIT; @@ -123,16 +118,16 @@ public: double BG_REBALANCE_POLLING_INTERVAL; double BG_REBALANCE_SWITCH_CHECK_INTERVAL; double DD_QUEUE_LOGGING_INTERVAL; + double DD_QUEUE_COUNTER_REFRESH_INTERVAL; + double DD_QUEUE_COUNTER_MAX_LOG; // max number of servers for which trace events will be generated in each round of + // DD_QUEUE_COUNTER_REFRESH_INTERVAL duration + bool DD_QUEUE_COUNTER_SUMMARIZE; // Enable summary of remaining servers when the number of servers with ongoing + // relocations in the last minute exceeds DD_QUEUE_COUNTER_MAX_LOG double RELOCATION_PARALLELISM_PER_SOURCE_SERVER; double RELOCATION_PARALLELISM_PER_DEST_SERVER; int DD_QUEUE_MAX_KEY_SERVERS; int DD_REBALANCE_PARALLELISM; int DD_REBALANCE_RESET_AMOUNT; - double BG_DD_MAX_WAIT; - double BG_DD_MIN_WAIT; - double BG_DD_INCREASE_RATE; - double BG_DD_DECREASE_RATE; - double BG_DD_SATURATION_DELAY; double INFLIGHT_PENALTY_HEALTHY; double INFLIGHT_PENALTY_REDUNDANT; double INFLIGHT_PENALTY_UNHEALTHY; @@ -161,8 +156,15 
@@ public: int PRIORITY_TEAM_FAILED; // Priority when a server in the team is excluded as failed int PRIORITY_TEAM_0_LEFT; int PRIORITY_SPLIT_SHARD; + int PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD; // Priority when a physical shard is oversize or anonymous // Data distribution + bool SHARD_ENCODE_LOCATION_METADATA; // If true, location metadata will contain shard ID. + bool ENABLE_DD_PHYSICAL_SHARD; // EXPERIMENTAL; If true, SHARD_ENCODE_LOCATION_METADATA must be true. + int64_t MAX_PHYSICAL_SHARD_BYTES; + double PHYSICAL_SHARD_METRICS_DELAY; + double ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME; + double READ_REBALANCE_CPU_THRESHOLD; // read rebalance only happens if the source servers' CPU > threshold int READ_REBALANCE_SRC_PARALLELISM; // the max count a server become a source server within a certain interval int READ_REBALANCE_SHARD_TOPK; // top k shards from which to select randomly for read-rebalance @@ -193,7 +195,6 @@ public: double SERVER_LIST_DELAY; double RECRUITMENT_IDLE_DELAY; double STORAGE_RECRUITMENT_DELAY; - double BLOB_WORKER_RECRUITMENT_DELAY; bool TSS_HACK_IDENTITY_MAPPING; double TSS_RECRUITMENT_TIMEOUT; double TSS_DD_CHECK_INTERVAL; @@ -232,6 +233,8 @@ public: int DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY; int DD_STORAGE_WIGGLE_PAUSE_THRESHOLD; // How many unhealthy relocations are ongoing will pause storage wiggle int DD_STORAGE_WIGGLE_STUCK_THRESHOLD; // How many times bestTeamStuck accumulate will pause storage wiggle + int64_t + DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled bool DD_TENANT_AWARENESS_ENABLED; int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed @@ -253,9 +256,8 @@ public: // Run storage enginee on a child process on the same machine with storage process bool REMOTE_KV_STORE; - // A delay to avoid race on file resources if the new kv store process started immediately after the previous kv - // store process died - double REMOTE_KV_STORE_INIT_DELAY; + // A delay to avoid race on file resources after seeing lock_file_failure + double REBOOT_KV_STORE_DELAY; // max waiting time for the remote kv store to initialize double REMOTE_KV_STORE_MAX_INIT_DURATION; @@ -300,9 +302,13 @@ public: int64_t REPLACE_CONTENTS_BYTES; // KeyValueStoreRocksDB + bool ROCKSDB_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES; + int ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE; + int ROCKSDB_READ_RANGE_ROW_LIMIT; int ROCKSDB_BACKGROUND_PARALLELISM; int ROCKSDB_READ_PARALLELISM; int64_t ROCKSDB_MEMTABLE_BYTES; + bool ROCKSDB_LEVEL_STYLE_COMPACTION; bool ROCKSDB_UNSAFE_AUTO_FSYNC; int64_t ROCKSDB_PERIODIC_COMPACTION_SECONDS; int ROCKSDB_PREFIX_LEN; @@ -324,8 +330,10 @@ public: int64_t ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC; bool ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE; std::string DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY; + bool ROCKSDB_DISABLE_AUTO_COMPACTIONS; bool ROCKSDB_PERFCONTEXT_ENABLE; // Enable rocks perf context metrics. 
May cause performance overhead double ROCKSDB_PERFCONTEXT_SAMPLE_RATE; + double ROCKSDB_METRICS_SAMPLE_INTERVAL; int ROCKSDB_MAX_SUBCOMPACTIONS; int64_t ROCKSDB_SOFT_PENDING_COMPACT_BYTES_LIMIT; int64_t ROCKSDB_HARD_PENDING_COMPACT_BYTES_LIMIT; @@ -335,6 +343,12 @@ public: int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE; int64_t ROCKSDB_BLOCK_SIZE; bool ENABLE_SHARDED_ROCKSDB; + int64_t ROCKSDB_WRITE_BUFFER_SIZE; + int64_t ROCKSDB_CF_WRITE_BUFFER_SIZE; + int64_t ROCKSDB_MAX_TOTAL_WAL_SIZE; + int64_t ROCKSDB_MAX_BACKGROUND_JOBS; + int64_t ROCKSDB_DELETE_OBSOLETE_FILE_PERIOD; + double ROCKSDB_PHYSICAL_SHARD_CLEAN_UP_DELAY; // Leader election int MAX_NOTIFICATIONS; @@ -399,6 +413,10 @@ public: double RESET_MASTER_DELAY; double RESET_RESOLVER_DELAY; + double GLOBAL_CONFIG_MIGRATE_TIMEOUT; + double GLOBAL_CONFIG_REFRESH_INTERVAL; + double GLOBAL_CONFIG_REFRESH_TIMEOUT; + // Master Server double COMMIT_SLEEP_TIME; double MIN_BALANCE_TIME; @@ -443,6 +461,7 @@ public: double ATTEMPT_RECRUITMENT_DELAY; double WAIT_FOR_DISTRIBUTOR_JOIN_DELAY; double WAIT_FOR_RATEKEEPER_JOIN_DELAY; + double WAIT_FOR_CONSISTENCYSCAN_JOIN_DELAY; double WAIT_FOR_BLOB_MANAGER_JOIN_DELAY; double WAIT_FOR_ENCRYPT_KEY_PROXY_JOIN_DELAY; double WORKER_FAILURE_TIME; @@ -456,6 +475,7 @@ public: double CHECK_REMOTE_HEALTH_INTERVAL; // Remote DC health refresh interval. double FORCE_RECOVERY_CHECK_DELAY; double RATEKEEPER_FAILURE_TIME; + double CONSISTENCYSCAN_FAILURE_TIME; double BLOB_MANAGER_FAILURE_TIME; double REPLACE_INTERFACE_DELAY; double REPLACE_INTERFACE_CHECK_DELAY; @@ -542,6 +562,10 @@ public: double RATEKEEPER_DEFAULT_LIMIT; double RATEKEEPER_LIMIT_REASON_SAMPLE_RATE; bool RATEKEEPER_PRINT_LIMIT_REASON; + double RATEKEEPER_MIN_RATE; + double RATEKEEPER_MAX_RATE; + double RATEKEEPER_BATCH_MIN_RATE; + double RATEKEEPER_BATCH_MAX_RATE; int64_t TARGET_BYTES_PER_STORAGE_SERVER; int64_t SPRING_BYTES_STORAGE_SERVER; @@ -550,9 +574,12 @@ public: int64_t SPRING_BYTES_STORAGE_SERVER_BATCH; int64_t STORAGE_HARD_LIMIT_BYTES; int64_t STORAGE_HARD_LIMIT_BYTES_OVERAGE; + int64_t STORAGE_HARD_LIMIT_BYTES_SPEED_UP_SIM; + int64_t STORAGE_HARD_LIMIT_BYTES_OVERAGE_SPEED_UP_SIM; int64_t STORAGE_HARD_LIMIT_VERSION_OVERAGE; int64_t STORAGE_DURABILITY_LAG_HARD_MAX; int64_t STORAGE_DURABILITY_LAG_SOFT_MAX; + bool STORAGE_INCLUDE_FEED_STORAGE_QUEUE; int64_t LOW_PRIORITY_STORAGE_QUEUE_BYTES; int64_t LOW_PRIORITY_DURABILITY_LAG; @@ -585,11 +612,14 @@ public: // Use global tag throttling strategy. i.e. throttle based on the cluster-wide // throughput for tags and their associated quotas. 
bool GLOBAL_TAG_THROTTLING; + // Enforce tag throttling on proxies rather than on clients + bool ENFORCE_TAG_THROTTLING_ON_PROXIES; // Minimum number of transactions per second that the global tag throttler must allow for each tag double GLOBAL_TAG_THROTTLING_MIN_RATE; // Used by global tag throttling counters double GLOBAL_TAG_THROTTLING_FOLDING_TIME; - double GLOBAL_TAG_THROTTLING_TRACE_INTERVAL; + // Cost multiplier for writes (because write operations are more expensive than reads) + double GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO; double MAX_TRANSACTIONS_PER_BYTE; @@ -612,8 +642,19 @@ public: double INITIAL_DURABILITY_LAG_MULTIPLIER; double DURABILITY_LAG_REDUCTION_RATE; double DURABILITY_LAG_INCREASE_RATE; - double STORAGE_SERVER_LIST_FETCH_TIMEOUT; + bool BW_THROTTLING_ENABLED; + double TARGET_BW_LAG; + double TARGET_BW_LAG_BATCH; + double TARGET_BW_LAG_UPDATE; + int MIN_BW_HISTORY; + double BW_ESTIMATION_INTERVAL; + double BW_LAG_INCREASE_AMOUNT; + double BW_LAG_DECREASE_AMOUNT; + double BW_FETCH_WORKERS_INTERVAL; + double BW_RW_LOGGING_INTERVAL; + double BW_MAX_BLOCKED_INTERVAL; + double BW_RK_SIM_QUIESCE_DELAY; // disk snapshot int64_t MAX_FORKED_PROCESS_OUTPUT; @@ -652,12 +693,14 @@ public: int STORAGE_LIMIT_BYTES; int BUGGIFY_LIMIT_BYTES; bool FETCH_USING_STREAMING; + bool FETCH_USING_BLOB; int FETCH_BLOCK_BYTES; int FETCH_KEYS_PARALLELISM_BYTES; int FETCH_KEYS_PARALLELISM; + int FETCH_KEYS_PARALLELISM_FULL; int FETCH_KEYS_LOWER_PRIORITY; - int FETCH_CHANGEFEED_PARALLELISM; int SERVE_FETCH_CHECKPOINT_PARALLELISM; + int CHANGE_FEED_DISK_READS_PARALLELISM; int BUGGIFY_BLOCK_BYTES; int64_t STORAGE_RECOVERY_VERSION_LAG_LIMIT; double STORAGE_DURABILITY_LAG_REJECT_THRESHOLD; @@ -665,7 +708,6 @@ public: int STORAGE_COMMIT_BYTES; int STORAGE_FETCH_BYTES; double STORAGE_COMMIT_INTERVAL; - double UPDATE_SHARD_VERSION_INTERVAL; int BYTE_SAMPLING_FACTOR; int BYTE_SAMPLING_OVERHEAD; int MAX_STORAGE_SERVER_WATCH_BYTES; @@ -674,7 +716,6 @@ public: int BYTE_SAMPLE_LOAD_PARALLELISM; double BYTE_SAMPLE_LOAD_DELAY; double BYTE_SAMPLE_START_DELAY; - double UPDATE_STORAGE_PROCESS_STATS_INTERVAL; double BEHIND_CHECK_DELAY; int BEHIND_CHECK_COUNT; int64_t BEHIND_CHECK_VERSIONS; @@ -698,6 +739,7 @@ public: int CHECKPOINT_TRANSFER_BLOCK_BYTES; int QUICK_GET_KEY_VALUES_LIMIT; int QUICK_GET_KEY_VALUES_LIMIT_BYTES; + int STORAGE_FEED_QUERY_HARD_LIMIT; // Wait Failure int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS; @@ -731,6 +773,12 @@ public: bool WORKER_HEALTH_REPORT_RECENT_DESTROYED_PEER; // When enabled, the worker's health monitor also report any recent // destroyed peers who are part of the transaction system to // cluster controller. + bool STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT; // When enabled, storage server's worker will crash on io_timeout error; + // this allows fdbmonitor to restart the worker and recreate the same SS. + // When SS can be temporarily throttled by infrastructure, e.g, k8s, + // Enabling this can reduce toil of manually restarting the SS. + // Enable with caution: If io_timeout is caused by disk failure, we won't + // want to restart the SS, which increases risk of data corruption. 
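The comment block above spells out the intent of `STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT`: crash the worker on `io_timeout` so fdbmonitor restarts it. As a hedged sketch of the decision the knob implies (the helper name is hypothetical; the actual handling lives in the worker code, which this diff does not touch), the check reduces to a single predicate:

```cpp
// Sketch only: should an error take the storage worker down so fdbmonitor restarts it?
bool shouldRebootStorageServer(const Error& e) {
    return e.code() == error_code_io_timeout && SERVER_KNOBS->STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT;
}
```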
// Test harness double WORKER_POLL_DELAY; @@ -744,7 +792,7 @@ public: // Dynamic Knobs (implementation) double COMPACTION_INTERVAL; - double UPDATE_NODE_TIMEOUT; + double BROADCASTER_SELF_UPDATE_DELAY; double GET_COMMITTED_VERSION_TIMEOUT; double GET_SNAPSHOT_AND_CHANGES_TIMEOUT; double FETCH_CHANGES_TIMEOUT; @@ -760,14 +808,6 @@ public: bool DISABLE_DUPLICATE_LOG_WARNING; double HISTOGRAM_REPORT_INTERVAL; - // IPager - int PAGER_RESERVED_PAGES; - - // IndirectShadowPager - int FREE_PAGE_VACUUM_THRESHOLD; - int VACUUM_QUEUE_SIZE; - int VACUUM_BYTES_PER_SECOND; - // Timekeeper int64_t TIME_KEEPER_DELAY; int64_t TIME_KEEPER_MAX_ENTRIES; @@ -790,11 +830,9 @@ public: int64_t FASTRESTORE_ROLE_LOGGING_DELAY; int64_t FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL; // How quickly to update process metrics for restore int64_t FASTRESTORE_ATOMICOP_WEIGHT; // workload amplication factor for atomic op - int64_t FASTRESTORE_APPLYING_PARALLELISM; // number of outstanding txns writing to dest. DB int64_t FASTRESTORE_MONITOR_LEADER_DELAY; int64_t FASTRESTORE_STRAGGLER_THRESHOLD_SECONDS; bool FASTRESTORE_TRACK_REQUEST_LATENCY; // true to track reply latency of each request in a request batch - bool FASTRESTORE_TRACK_LOADER_SEND_REQUESTS; // track requests of load send mutations to appliers? int64_t FASTRESTORE_MEMORY_THRESHOLD_MB_SOFT; // threshold when pipelined actors should be delayed int64_t FASTRESTORE_WAIT_FOR_MEMORY_LATENCY; int64_t FASTRESTORE_HEARTBEAT_DELAY; // interval for master to ping loaders and appliers @@ -863,6 +901,7 @@ public: int SIM_KMS_MAX_KEYS; int ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH; bool ENABLE_TLOG_ENCRYPTION; + bool ENABLE_STORAGE_SERVER_ENCRYPTION; // Currently only Redwood engine supports encryption bool ENABLE_BLOB_GRANULE_ENCRYPTION; // Compression @@ -876,14 +915,13 @@ public: // FIXME: configure url with database configuration instead of knob eventually std::string BG_URL; - // whether to use blobRangeKeys or tenants for blob granule range sources - std::string BG_RANGE_SOURCE; // Whether to use knobs or EKP for blob metadata and credentials std::string BG_METADATA_SOURCE; int BG_SNAPSHOT_FILE_TARGET_BYTES; - int BG_SNAPSHOT_FILE_TARGET_CHUNKS; + int BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES; int BG_DELTA_FILE_TARGET_BYTES; + int BG_DELTA_FILE_TARGET_CHUNK_BYTES; int BG_DELTA_BYTES_BEFORE_COMPACT; int BG_MAX_SPLIT_FANOUT; int BG_MAX_MERGE_FANIN; @@ -892,18 +930,29 @@ public: int BG_CONSISTENCY_CHECK_TARGET_SPEED_KB; bool BG_ENABLE_MERGING; int BG_MERGE_CANDIDATE_THRESHOLD_SECONDS; + int BG_MERGE_CANDIDATE_DELAY_SECONDS; + int BG_KEY_TUPLE_TRUNCATE_OFFSET; int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM; + int BLOB_WORKER_RESNAPSHOT_PARALLELISM; + int BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM; + double BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure double BLOB_WORKER_REQUEST_TIMEOUT; // Blob Worker's server-side request timeout double BLOB_WORKERLIST_FETCH_INTERVAL; double BLOB_WORKER_BATCH_GRV_INTERVAL; + bool BLOB_WORKER_DO_REJECT_WHEN_FULL; + double BLOB_WORKER_REJECT_WHEN_FULL_THRESHOLD; + double BLOB_WORKER_FORCE_FLUSH_CLEANUP_DELAY; double BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN; double BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX; double BLOB_MANAGER_STATUS_EXP_BACKOFF_EXPONENT; + int BLOB_MANAGER_CONCURRENT_MERGE_CHECKS; double BGCC_TIMEOUT; double BGCC_MIN_INTERVAL; + bool BLOB_MANIFEST_BACKUP; + bool BLOB_FULL_RESTORE_MODE; // Blob metadata int64_t BLOB_METADATA_CACHE_TTL; diff --git a/fdbclient/include/fdbclient/SpecialKeySpace.actor.h 
b/fdbclient/include/fdbclient/SpecialKeySpace.actor.h index 3a2c7f6b83..d2ce7f5cf9 100644 --- a/fdbclient/include/fdbclient/SpecialKeySpace.actor.h +++ b/fdbclient/include/fdbclient/SpecialKeySpace.actor.h @@ -60,6 +60,8 @@ public: // TODO : give this function a more descriptive name virtual bool isAsync() const { return false; } + virtual bool supportsTenants() const { return false; } + virtual ~SpecialKeyRangeReadImpl() {} protected: @@ -125,7 +127,7 @@ public: Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint, - Optional* cache) const { + KeyRangeMap>* cache) const { return getRangeAsyncActor(this, ryw, kr, limitsHint, cache); } @@ -135,17 +137,18 @@ public: ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limits, - Optional* cache) { + KeyRangeMap>* cache) { ASSERT(skrAyncImpl->getKeyRange().contains(kr)); ASSERT(cache != nullptr); - if (!cache->present()) { + ASSERT(cache->rangeContaining(kr.begin) == cache->rangeContainingKeyBefore(kr.end)); + if (!(*cache)[kr.begin].present()) { // For simplicity, every time we need to cache, we read the whole range // Although sometimes the range can be narrowed, // there is not a general way to do it in complicated scenarios RangeResult result_ = wait(skrAyncImpl->getRange(ryw, skrAyncImpl->getKeyRange(), limits)); - *cache = result_; + cache->insert(skrAyncImpl->getKeyRange(), result_); } - const auto& allResults = cache->get(); + const auto& allResults = (*cache)[kr.begin].get(); int start = 0, end = allResults.size(); while (start < allResults.size() && allResults[start].key < kr.begin) ++start; @@ -271,15 +274,23 @@ private: }; // Used for SpecialKeySpaceCorrectnessWorkload -class SKSCTestImpl : public SpecialKeyRangeRWImpl { +class SKSCTestRWImpl : public SpecialKeyRangeRWImpl { public: - explicit SKSCTestImpl(KeyRangeRef kr); + explicit SKSCTestRWImpl(KeyRangeRef kr); Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const override; Future> commit(ReadYourWritesTransaction* ryw) override; }; +class SKSCTestAsyncReadImpl : public SpecialKeyRangeAsyncImpl { +public: + explicit SKSCTestAsyncReadImpl(KeyRangeRef kr); + Future getRange(ReadYourWritesTransaction* ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const override; +}; + // Use special key prefix "\xff\xff/transaction/conflicting_keys/", // to retrieve keys which caused latest not_committed(conflicting with another transaction) error. 
// The returned key value pairs are interpreted as : @@ -292,6 +303,7 @@ public: Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const override; + bool supportsTenants() const override { return true; }; }; class ReadConflictRangeImpl : public SpecialKeyRangeReadImpl { @@ -300,6 +312,7 @@ public: Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const override; + bool supportsTenants() const override { return true; }; }; class WriteConflictRangeImpl : public SpecialKeyRangeReadImpl { @@ -308,6 +321,7 @@ public: Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const override; + bool supportsTenants() const override { return true; }; }; class DDStatsRangeImpl : public SpecialKeyRangeAsyncImpl { @@ -539,5 +553,24 @@ public: Future> commit(ReadYourWritesTransaction* ryw) override; }; +class WorkerInterfacesSpecialKeyImpl : public SpecialKeyRangeReadImpl { +public: + explicit WorkerInterfacesSpecialKeyImpl(KeyRangeRef kr); + + Future getRange(ReadYourWritesTransaction* ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const override; +}; + +// If the underlying set of key-value pairs of a key space is not changing, then we expect repeating a read to give the +// same result. Additionally, we can generate the expected result of any read if that read is reading a subrange. This +// actor performs a read of an arbitrary subrange of [begin, end) and validates the results. +ACTOR Future validateSpecialSubrangeRead(ReadYourWritesTransaction* ryw, + KeySelector begin, + KeySelector end, + GetRangeLimits limits, + Reverse reverse, + RangeResult result); + #include "flow/unactorcompiler.h" #endif diff --git a/fdbclient/include/fdbclient/StorageServerInterface.h b/fdbclient/include/fdbclient/StorageServerInterface.h index febbc1311b..6b3e64d0aa 100644 --- a/fdbclient/include/fdbclient/StorageServerInterface.h +++ b/fdbclient/include/fdbclient/StorageServerInterface.h @@ -22,15 +22,16 @@ #define FDBCLIENT_STORAGESERVERINTERFACE_H #pragma once -#include #include "fdbclient/FDBTypes.h" #include "fdbclient/StorageCheckpoint.h" +#include "fdbclient/StorageServerShard.h" #include "fdbrpc/Locality.h" #include "fdbrpc/QueueModel.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/LoadBalance.actor.h" #include "fdbrpc/Stats.h" #include "fdbrpc/TimedRequest.h" +#include "fdbrpc/TenantInfo.h" #include "fdbrpc/TSSComparison.h" #include "fdbclient/CommitTransaction.h" #include "fdbclient/TagThrottle.actor.h" @@ -53,6 +54,34 @@ struct VersionReply { } }; +// This struct is used by RK to forward the commit cost to SS, see discussion in #7258 +struct UpdateCommitCostRequest { + constexpr static FileIdentifier file_identifier = 4159439; + + // Ratekeeper ID, it is only reasonable to compare postTime from the same Ratekeeper + UID ratekeeperID; + + // The time the request being posted + double postTime; + + double elapsed; + TransactionTag busiestTag; + + // Properties that are defined in TransactionCommitCostEstimation + int opsSum; + uint64_t costSum; + + uint64_t totalWriteCosts; + bool reported; + + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, ratekeeperID, postTime, elapsed, busiestTag, opsSum, costSum, totalWriteCosts, reported, reply); + } +}; + struct StorageServerInterface { constexpr static FileIdentifier file_identifier = 15302073; enum { BUSY_ALLOWED = 0, BUSY_FORCE = 1, BUSY_LOCAL = 2 }; @@ -84,13 +113,15 @@ struct StorageServerInterface { RequestStream 
getReadHotRanges; RequestStream getRangeSplitPoints; PublicRequestStream getKeyValuesStream; - PublicRequestStream changeFeedStream; - PublicRequestStream overlappingChangeFeeds; - PublicRequestStream changeFeedPop; - PublicRequestStream changeFeedVersionUpdate; - PublicRequestStream checkpoint; - PublicRequestStream fetchCheckpoint; - PublicRequestStream fetchCheckpointKeyValues; + RequestStream changeFeedStream; + RequestStream overlappingChangeFeeds; + RequestStream changeFeedPop; + RequestStream changeFeedVersionUpdate; + RequestStream checkpoint; + RequestStream fetchCheckpoint; + RequestStream fetchCheckpointKeyValues; + + RequestStream updateCommitCostRequest; private: bool acceptingRequests; @@ -149,19 +180,20 @@ public: getMappedKeyValues = PublicRequestStream( getValue.getEndpoint().getAdjustedEndpoint(14)); changeFeedStream = - PublicRequestStream(getValue.getEndpoint().getAdjustedEndpoint(15)); - overlappingChangeFeeds = PublicRequestStream( - getValue.getEndpoint().getAdjustedEndpoint(16)); + RequestStream(getValue.getEndpoint().getAdjustedEndpoint(15)); + overlappingChangeFeeds = + RequestStream(getValue.getEndpoint().getAdjustedEndpoint(16)); changeFeedPop = - PublicRequestStream(getValue.getEndpoint().getAdjustedEndpoint(17)); - changeFeedVersionUpdate = PublicRequestStream( + RequestStream(getValue.getEndpoint().getAdjustedEndpoint(17)); + changeFeedVersionUpdate = RequestStream( getValue.getEndpoint().getAdjustedEndpoint(18)); - checkpoint = - PublicRequestStream(getValue.getEndpoint().getAdjustedEndpoint(19)); + checkpoint = RequestStream(getValue.getEndpoint().getAdjustedEndpoint(19)); fetchCheckpoint = - PublicRequestStream(getValue.getEndpoint().getAdjustedEndpoint(20)); - fetchCheckpointKeyValues = PublicRequestStream( + RequestStream(getValue.getEndpoint().getAdjustedEndpoint(20)); + fetchCheckpointKeyValues = RequestStream( getValue.getEndpoint().getAdjustedEndpoint(21)); + updateCommitCostRequest = + RequestStream(getValue.getEndpoint().getAdjustedEndpoint(22)); } } else { ASSERT(Ar::isDeserializing); @@ -212,6 +244,7 @@ public: streams.push_back(checkpoint.getReceiver()); streams.push_back(fetchCheckpoint.getReceiver()); streams.push_back(fetchCheckpointKeyValues.getReceiver()); + streams.push_back(updateCommitCostRequest.getReceiver()); FlowTransport::transport().addEndpoints(streams); } }; @@ -241,21 +274,6 @@ struct ServerCacheInfo { } }; -struct TenantInfo { - static const int64_t INVALID_TENANT = -1; - - Optional name; - int64_t tenantId; - - TenantInfo() : tenantId(INVALID_TENANT) {} - TenantInfo(TenantName name, int64_t tenantId) : name(name), tenantId(tenantId) {} - - template - void serialize(Ar& ar) { - serializer(ar, name, tenantId); - } -}; - struct GetValueReply : public LoadBalancedReply { constexpr static FileIdentifier file_identifier = 1378929; Optional value; @@ -277,26 +295,28 @@ struct GetValueRequest : TimedRequest { Key key; Version version; Optional tags; - Optional debugID; ReplyPromise reply; + Optional options; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key - GetValueRequest() {} + + bool verify() const { return tenantInfo.isAuthorized(); } + GetValueRequest(SpanContext spanContext, const TenantInfo& tenantInfo, const Key& key, Version ver, Optional tags, - Optional debugID, + Optional options, VersionVector latestCommitVersions) - : spanContext(spanContext), tenantInfo(tenantInfo), key(key), version(ver), tags(tags), 
debugID(debugID), + : spanContext(spanContext), tenantInfo(tenantInfo), key(key), version(ver), tags(tags), options(options), ssLatestCommitVersions(latestCommitVersions) {} template void serialize(Ar& ar) { - serializer(ar, key, version, tags, debugID, reply, spanContext, tenantInfo, ssLatestCommitVersions); + serializer(ar, key, version, tags, reply, spanContext, tenantInfo, options, ssLatestCommitVersions); } }; @@ -337,6 +357,8 @@ struct WatchValueRequest { : spanContext(spanContext), tenantInfo(tenantInfo), key(key), value(value), version(ver), tags(tags), debugID(debugID) {} + bool verify() const { return tenantInfo.isAuthorized(); } + template void serialize(Ar& ar) { serializer(ar, key, value, version, tags, debugID, reply, spanContext, tenantInfo); @@ -370,15 +392,16 @@ struct GetKeyValuesRequest : TimedRequest { KeyRef mapper = KeyRef(); Version version; // or latestVersion int limit, limitBytes; - bool isFetchKeys; Optional tags; - Optional debugID; + Optional options; ReplyPromise reply; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key - GetKeyValuesRequest() : isFetchKeys(false) {} + GetKeyValuesRequest() {} + + bool verify() const { return tenantInfo.isAuthorized(); } template void serialize(Ar& ar) { @@ -388,14 +411,13 @@ struct GetKeyValuesRequest : TimedRequest { version, limit, limitBytes, - isFetchKeys, tags, - debugID, reply, spanContext, tenantInfo, - arena, - ssLatestCommitVersions); + options, + ssLatestCommitVersions, + arena); } }; @@ -427,15 +449,17 @@ struct GetMappedKeyValuesRequest : TimedRequest { Version version; // or latestVersion int limit, limitBytes; int matchIndex; - bool isFetchKeys; Optional tags; - Optional debugID; + Optional options; ReplyPromise reply; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key range - GetMappedKeyValuesRequest() : isFetchKeys(false) {} + GetMappedKeyValuesRequest() {} + + bool verify() const { return tenantInfo.isAuthorized(); } + template void serialize(Ar& ar) { serializer(ar, @@ -445,15 +469,14 @@ struct GetMappedKeyValuesRequest : TimedRequest { version, limit, limitBytes, - isFetchKeys, tags, - debugID, reply, spanContext, tenantInfo, - arena, + options, ssLatestCommitVersions, - matchIndex); + matchIndex, + arena); } }; @@ -492,15 +515,16 @@ struct GetKeyValuesStreamRequest { KeySelectorRef begin, end; Version version; // or latestVersion int limit, limitBytes; - bool isFetchKeys; Optional tags; - Optional debugID; + Optional options; ReplyPromiseStream reply; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key range - GetKeyValuesStreamRequest() : isFetchKeys(false) {} + GetKeyValuesStreamRequest() {} + + bool verify() const { return tenantInfo.isAuthorized(); } template void serialize(Ar& ar) { @@ -510,14 +534,13 @@ struct GetKeyValuesStreamRequest { version, limit, limitBytes, - isFetchKeys, tags, - debugID, reply, spanContext, tenantInfo, - arena, - ssLatestCommitVersions); + options, + ssLatestCommitVersions, + arena); } }; @@ -543,27 +566,29 @@ struct GetKeyRequest : TimedRequest { KeySelectorRef sel; Version version; // or latestVersion Optional tags; - Optional debugID; ReplyPromise reply; + Optional options; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as 
known // to this client, of all storage replicas that // serve the given key GetKeyRequest() {} + bool verify() const { return tenantInfo.isAuthorized(); } + GetKeyRequest(SpanContext spanContext, TenantInfo tenantInfo, KeySelectorRef const& sel, Version version, Optional tags, - Optional debugID, + Optional options, VersionVector latestCommitVersions) - : spanContext(spanContext), tenantInfo(tenantInfo), sel(sel), version(version), debugID(debugID), + : spanContext(spanContext), tenantInfo(tenantInfo), sel(sel), version(version), tags(tags), options(options), ssLatestCommitVersions(latestCommitVersions) {} template void serialize(Ar& ar) { - serializer(ar, sel, version, tags, debugID, reply, spanContext, tenantInfo, arena, ssLatestCommitVersions); + serializer(ar, sel, version, tags, reply, spanContext, tenantInfo, options, ssLatestCommitVersions, arena); } }; @@ -572,12 +597,13 @@ struct GetShardStateReply { Version first; Version second; + std::vector shards; GetShardStateReply() = default; GetShardStateReply(Version first, Version second) : first(first), second(second) {} template void serialize(Ar& ar) { - serializer(ar, first, second); + serializer(ar, first, second, shards); } }; @@ -587,13 +613,16 @@ struct GetShardStateRequest { KeyRange keys; int32_t mode; + bool includePhysicalShard; ReplyPromise reply; - GetShardStateRequest() {} - GetShardStateRequest(KeyRange const& keys, waitMode mode) : keys(keys), mode(mode) {} + GetShardStateRequest() = default; + GetShardStateRequest(KeyRange const& keys, waitMode mode, bool includePhysicalShard) + : keys(keys), mode(mode), includePhysicalShard(includePhysicalShard) {} + GetShardStateRequest(KeyRange const& keys, waitMode mode) : keys(keys), mode(mode), includePhysicalShard(false) {} template void serialize(Ar& ar) { - serializer(ar, keys, mode, reply); + serializer(ar, keys, mode, reply, includePhysicalShard); } }; @@ -605,7 +634,6 @@ struct StorageMetrics { int64_t bytesPerKSecond = 0; // network bandwidth (average over 10s) int64_t iosPerKSecond = 0; int64_t bytesReadPerKSecond = 0; - Optional keys; // this metric belongs to which range static const int64_t infinity = 1LL << 60; @@ -730,7 +758,7 @@ struct SplitMetricsRequest { template void serialize(Ar& ar) { - serializer(ar, keys, limits, used, estimated, isLastShard, reply, arena, minSplitBytes); + serializer(ar, keys, limits, used, estimated, isLastShard, reply, minSplitBytes, arena); } }; @@ -966,39 +994,51 @@ struct FetchCheckpointKeyValuesRequest { }; struct OverlappingChangeFeedEntry { - Key rangeId; - KeyRange range; + KeyRef feedId; + KeyRangeRef range; Version emptyVersion; Version stopVersion; + Version feedMetadataVersion; bool operator==(const OverlappingChangeFeedEntry& r) const { - return rangeId == r.rangeId && range == r.range && emptyVersion == r.emptyVersion && - stopVersion == r.stopVersion; + return feedId == r.feedId && range == r.range && emptyVersion == r.emptyVersion && + stopVersion == r.stopVersion && feedMetadataVersion == r.feedMetadataVersion; } OverlappingChangeFeedEntry() {} - OverlappingChangeFeedEntry(Key const& rangeId, KeyRange const& range, Version emptyVersion, Version stopVersion) - : rangeId(rangeId), range(range), emptyVersion(emptyVersion), stopVersion(stopVersion) {} + OverlappingChangeFeedEntry(KeyRef const& feedId, + KeyRangeRef const& range, + Version emptyVersion, + Version stopVersion, + Version feedMetadataVersion) + : feedId(feedId), range(range), emptyVersion(emptyVersion), stopVersion(stopVersion), + 
feedMetadataVersion(feedMetadataVersion) {} + + OverlappingChangeFeedEntry(Arena& arena, const OverlappingChangeFeedEntry& rhs) + : feedId(arena, rhs.feedId), range(arena, rhs.range), emptyVersion(rhs.emptyVersion), + stopVersion(rhs.stopVersion), feedMetadataVersion(rhs.feedMetadataVersion) {} template void serialize(Ar& ar) { - serializer(ar, rangeId, range, emptyVersion, stopVersion); + serializer(ar, feedId, range, emptyVersion, stopVersion, feedMetadataVersion); } }; struct OverlappingChangeFeedsReply { constexpr static FileIdentifier file_identifier = 11815134; - std::vector rangeIds; + VectorRef feeds; bool cached; Arena arena; + Version feedMetadataVersion; - OverlappingChangeFeedsReply() : cached(false) {} - explicit OverlappingChangeFeedsReply(std::vector const& rangeIds) - : rangeIds(rangeIds), cached(false) {} + OverlappingChangeFeedsReply() : cached(false), feedMetadataVersion(invalidVersion) {} + explicit OverlappingChangeFeedsReply(VectorRef const& feeds, + Version feedMetadataVersion) + : feeds(feeds), cached(false), feedMetadataVersion(feedMetadataVersion) {} template void serialize(Ar& ar) { - serializer(ar, rangeIds, arena); + serializer(ar, feeds, feedMetadataVersion, arena); } }; diff --git a/fdbclient/include/fdbclient/StorageServerShard.h b/fdbclient/include/fdbclient/StorageServerShard.h new file mode 100644 index 0000000000..dce0d5a6fa --- /dev/null +++ b/fdbclient/include/fdbclient/StorageServerShard.h @@ -0,0 +1,88 @@ +/* + * StorageServerShard.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FDBCLIENT_STORAGESERVERSHARD_H +#define FDBCLIENT_STORAGESERVERSHARD_H +#pragma once + +#include "fdbclient/FDBTypes.h" +#include "flow/flow.h" + +// Represents a data shard on a storage server hosting a continuous keyrange. 
+struct StorageServerShard { + constexpr static FileIdentifier file_identifier = 4028358; + + enum ShardState { + NotAssigned = 0, + MovingIn = 1, + ReadWritePending = 2, + ReadWrite = 3, + }; + + StorageServerShard() = default; + StorageServerShard(KeyRange range, + Version version, + const uint64_t id, + const uint64_t desiredId, + ShardState shardState) + : range(range), version(version), id(id), desiredId(desiredId), shardState(shardState) {} + + static StorageServerShard notAssigned(KeyRange range, Version version = 0) { + return StorageServerShard(range, version, 0, 0, NotAssigned); + } + + ShardState getShardState() const { return static_cast(this->shardState); }; + + void setShardState(const ShardState shardState) { this->shardState = static_cast(shardState); } + + std::string getShardStateString() const { + const ShardState ss = getShardState(); + switch (ss) { + case NotAssigned: + return "NotAssigned"; + case MovingIn: + return "MovingIn"; + case ReadWritePending: + return "ReadWritePending"; + case ReadWrite: + return "ReadWrite"; + } + return "InvalidState"; + } + + std::string toString() const { + return "StorageServerShard: [Range]: " + Traceable::toString(range) + + " [Shard ID]: " + format("%016llx", this->id) + " [Version]: " + std::to_string(version) + + " [State]: " + getShardStateString() + " [Desired Shard ID]: " + format("%016llx", this->desiredId); + } + + template + void serialize(Ar& ar) { + serializer(ar, range, version, id, desiredId, shardState); + } + + KeyRange range; + Version version; // Shard creation version. + uint64_t id; // The actual shard ID. + uint64_t desiredId; // The intended shard ID. + int8_t shardState; +}; + +#endif diff --git a/fdbclient/include/fdbclient/Subspace.h b/fdbclient/include/fdbclient/Subspace.h index ef88fc3855..0cc85089d1 100644 --- a/fdbclient/include/fdbclient/Subspace.h +++ b/fdbclient/include/fdbclient/Subspace.h @@ -42,9 +42,7 @@ public: template Key pack(T const& item) const { - Tuple t; - t.append(item); - return pack(t); + return pack(Tuple::makeTuple(item)); } Key pack(StringRef const& item, bool utf8 = false) const { @@ -58,9 +56,7 @@ public: template Subspace get(T const& item) const { - Tuple t; - t.append(item); - return get(t); + return get(Tuple::makeTuple(item)); } Subspace get(StringRef const& item, bool utf8 = false) const { diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index 370c1e1f29..068d1d9d37 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -28,7 +28,7 @@ #include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/BlobWorkerInterface.h" // TODO move the functions that depend on this out of here and into BlobWorkerInterface.h to remove this depdendency #include "fdbclient/StorageServerInterface.h" -#include "Tenant.h" +#include "fdbclient/Tenant.h" // Don't warn on constants being defined in this file. #pragma clang diagnostic push @@ -163,6 +163,9 @@ extern const KeyRef cacheChangePrefix; const Key cacheChangeKeyFor(uint16_t idx); uint16_t cacheChangeKeyDecodeIndex(const KeyRef& key); +// For persisting the consistency scan configuration and metrics +extern const KeyRef consistencyScanInfoKey; + // "\xff/tss/[[serverId]]" := "[[tssId]]" extern const KeyRangeRef tssMappingKeys; @@ -273,6 +276,9 @@ extern const KeyRef perpetualStorageWiggleStatsPrefix; // Change the value of this key to anything and that will trigger detailed data distribution team info log. 
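Stepping back to the new `StorageServerShard` struct introduced above: its full public surface (`notAssigned()`, `setShardState()`, `getShardState()`, `getShardStateString()`, `toString()`) is visible in this diff, so a small usage sketch needs no assumptions beyond the illustrative key range and version:

```cpp
// Sketch: track a shard that is being moved onto a storage server.
StorageServerShard shard = StorageServerShard::notAssigned(KeyRangeRef("a"_sr, "b"_sr), /*version*/ 100);
shard.setShardState(StorageServerShard::MovingIn);
ASSERT(shard.getShardState() == StorageServerShard::MovingIn);
TraceEvent("ShardStateChanged").detail("Shard", shard.toString());
```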
extern const KeyRef triggerDDTeamInfoPrintKey; +// Encryption data at-rest config key +extern const KeyRef encryptionAtRestModeConfKey; + // The differences between excluded and failed can be found in "command-line-interface.rst" // and in the help message of the fdbcli command "exclude". @@ -374,6 +380,12 @@ std::vector> decodeBackupStartedValue(const ValueRef& va // 1 = Send a signal to pause/already paused. extern const KeyRef backupPausedKey; +// "\xff/previousCoordinators" = "[[ClusterConnectionString]]" +// Set to the encoded structure of the cluster's previous set of coordinators. +// Changed when performing quorumChange. +// See "CoordinationInterface.h" struct ClusterConnectionString for more details +extern const KeyRef previousCoordinatorsKey; + // "\xff/coordinators" = "[[ClusterConnectionString]]" // Set to the encoded structure of the cluster's current set of coordinators. // Changed when performing quorumChange. @@ -594,6 +606,8 @@ const Value blobManagerEpochValueFor(int64_t epoch); int64_t decodeBlobManagerEpochValue(ValueRef const& value); // blob granule keys +extern const StringRef blobRangeActive; +extern const StringRef blobRangeInactive; extern const uint8_t BG_FILE_TYPE_DELTA; extern const uint8_t BG_FILE_TYPE_SNAPSHOT; @@ -613,12 +627,16 @@ extern const KeyRangeRef blobGranuleSplitKeys; // \xff\x02/bgmerge/mergeGranuleId = [[BlobGranuleMergeState]] extern const KeyRangeRef blobGranuleMergeKeys; +// \xff\x02/bgmergebounds/beginkey = [[BlobGranuleMergeBoundary]] +extern const KeyRangeRef blobGranuleMergeBoundaryKeys; + // \xff\x02/bgh/(beginKey,endKey,startVersion) = { granuleUID, [parentGranuleHistoryKeys] } extern const KeyRangeRef blobGranuleHistoryKeys; // \xff\x02/bgp/(start,end) = (version, force) extern const KeyRangeRef blobGranulePurgeKeys; -extern const KeyRangeRef blobGranuleVersionKeys; +// \xff\x02/bgpforce/(start) = {1|0} (key range map) +extern const KeyRangeRef blobGranuleForcePurgedKeys; extern const KeyRef blobGranulePurgeChangeKey; const Key blobGranuleFileKeyFor(UID granuleID, Version fileVersion, uint8_t fileType); @@ -658,11 +676,16 @@ std::pair decodeBlobGranuleSplitValue(ValueRef c const Value blobGranuleMergeValueFor(KeyRange mergeKeyRange, std::vector parentGranuleIDs, - std::vector parentGranuleRanges, + std::vector parentGranuleRanges, std::vector parentGranuleStartVersions); // FIXME: probably just define object type for this? -std::tuple, std::vector, std::vector> -decodeBlobGranuleMergeValue(ValueRef const& value); +std::tuple, std::vector, std::vector> decodeBlobGranuleMergeValue( + ValueRef const& value); + +// BlobGranuleMergeBoundary. 
+const Key blobGranuleMergeBoundaryKeyFor(const KeyRef& key); +const Value blobGranuleMergeBoundaryValueFor(BlobGranuleMergeBoundary const& boundary); +Standalone decodeBlobGranuleMergeBoundaryValue(const ValueRef& value); const Key blobGranuleHistoryKeyFor(KeyRangeRef const& range, Version version); std::pair decodeBlobGranuleHistoryKey(KeyRef const& key); @@ -679,12 +702,11 @@ UID decodeBlobWorkerListKey(KeyRef const& key); const Value blobWorkerListValue(BlobWorkerInterface const& interface); BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value); -// State for the tenant map -extern const KeyRangeRef tenantMapKeys; -extern const KeyRef tenantMapPrefix; -extern const KeyRef tenantMapPrivatePrefix; -extern const KeyRef tenantLastIdKey; -extern const KeyRef tenantDataPrefixKey; +// Storage quota per tenant +// "\xff/storageQuota/[[tenantName]]" := "[[quota]]" +extern const KeyRangeRef storageQuotaKeys; +extern const KeyRef storageQuotaPrefix; +Key storageQuotaKey(StringRef tenantName); #pragma clang diagnostic pop diff --git a/fdbclient/include/fdbclient/TagThrottle.actor.h b/fdbclient/include/fdbclient/TagThrottle.actor.h index 020fcea568..cf4ef8fd16 100644 --- a/fdbclient/include/fdbclient/TagThrottle.actor.h +++ b/fdbclient/include/fdbclient/TagThrottle.actor.h @@ -264,9 +264,9 @@ Future getValidAutoEnabled(Reference tr) { tr->reset(); wait(delay(CLIENT_KNOBS->DEFAULT_BACKOFF)); continue; - } else if (value.get() == LiteralStringRef("1")) { + } else if (value.get() == "1"_sr) { result = true; - } else if (value.get() == LiteralStringRef("0")) { + } else if (value.get() == "0"_sr) { result = false; } else { TraceEvent(SevWarnAlways, "InvalidAutoTagThrottlingValue").detail("Value", value.get()); @@ -331,8 +331,7 @@ getThrottledTags(Reference db, int limit, ContainsRecommended containsRecomm template void signalThrottleChange(Reference tr) { - tr->atomicOp( - tagThrottleSignalKey, LiteralStringRef("XXXXXXXXXX\x00\x00\x00\x00"), MutationRef::SetVersionstampedValue); + tr->atomicOp(tagThrottleSignalKey, "XXXXXXXXXX\x00\x00\x00\x00"_sr, MutationRef::SetVersionstampedValue); } ACTOR template @@ -583,9 +582,8 @@ Future enableAuto(Reference db, bool enabled) { state typename DB::TransactionT::template FutureT> valueF = tr->get(tagThrottleAutoEnabledKey); Optional value = wait(safeThreadFutureToFuture(valueF)); - if (!value.present() || (enabled && value.get() != LiteralStringRef("1")) || - (!enabled && value.get() != LiteralStringRef("0"))) { - tr->set(tagThrottleAutoEnabledKey, LiteralStringRef(enabled ? "1" : "0")); + if (!value.present() || (enabled && value.get() != "1"_sr) || (!enabled && value.get() != "0"_sr)) { + tr->set(tagThrottleAutoEnabledKey, enabled ? 
"1"_sr : "0"_sr); signalThrottleChange(tr); wait(safeThreadFutureToFuture(tr->commit())); @@ -599,10 +597,8 @@ Future enableAuto(Reference db, bool enabled) { class TagQuotaValue { public: - double reservedReadQuota{ 0.0 }; - double totalReadQuota{ 0.0 }; - double reservedWriteQuota{ 0.0 }; - double totalWriteQuota{ 0.0 }; + double reservedQuota{ 0.0 }; + double totalQuota{ 0.0 }; bool isValid() const; Value toValue() const; static TagQuotaValue fromValue(ValueRef); @@ -611,17 +607,10 @@ public: Key getTagQuotaKey(TransactionTagRef); template -void setTagQuota(Reference tr, - TransactionTagRef tag, - double reservedReadQuota, - double totalReadQuota, - double reservedWriteQuota, - double totalWriteQuota) { +void setTagQuota(Reference tr, TransactionTagRef tag, double reservedQuota, double totalQuota) { TagQuotaValue tagQuotaValue; - tagQuotaValue.reservedReadQuota = reservedReadQuota; - tagQuotaValue.totalReadQuota = totalReadQuota; - tagQuotaValue.reservedWriteQuota = reservedWriteQuota; - tagQuotaValue.totalWriteQuota = totalWriteQuota; + tagQuotaValue.reservedQuota = reservedQuota; + tagQuotaValue.totalQuota = totalQuota; if (!tagQuotaValue.isValid()) { throw invalid_throttle_quota_value(); } diff --git a/fdbclient/include/fdbclient/TaskBucket.h b/fdbclient/include/fdbclient/TaskBucket.h index b7e6091d0f..b0b7a2bc51 100644 --- a/fdbclient/include/fdbclient/TaskBucket.h +++ b/fdbclient/include/fdbclient/TaskBucket.h @@ -115,7 +115,7 @@ public: }; struct ReservedTaskParams { - static TaskParam scheduledVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam scheduledVersion() { return __FUNCTION__sr; } }; class FutureBucket; @@ -480,7 +480,8 @@ struct TaskFuncBase : IDispatched, std::func }; #define REGISTER_TASKFUNC(TaskFunc) REGISTER_FACTORY(TaskFuncBase, TaskFunc, name) #define REGISTER_TASKFUNC_ALIAS(TaskFunc, Alias) \ - REGISTER_DISPATCHED_ALIAS(TaskFunc, Alias, TaskFunc::name, LiteralStringRef(#Alias)) + REGISTER_DISPATCHED_ALIAS( \ + TaskFunc, Alias, TaskFunc::name, StringRef(reinterpret_cast(#Alias), sizeof(#Alias) - 1)) struct TaskCompletionKey { Future get(Reference tr, Reference taskBucket); diff --git a/fdbclient/include/fdbclient/Tenant.h b/fdbclient/include/fdbclient/Tenant.h index baac9f4543..47d27a0f72 100644 --- a/fdbclient/include/fdbclient/Tenant.h +++ b/fdbclient/include/fdbclient/Tenant.h @@ -23,58 +23,191 @@ #pragma once #include "fdbclient/FDBTypes.h" +#include "fdbclient/KeyBackedTypes.h" #include "fdbclient/VersionedMap.h" +#include "fdbclient/KeyBackedTypes.h" +#include "fdbrpc/TenantInfo.h" +#include "flow/BooleanParam.h" #include "flow/flat_buffers.h" typedef StringRef TenantNameRef; typedef Standalone TenantName; +typedef StringRef TenantGroupNameRef; +typedef Standalone TenantGroupName; + +// Represents the various states that a tenant could be in. +// In a standalone cluster, a tenant should only ever be in the READY state. +// In a metacluster, a tenant on the management cluster could be in the other states while changes are applied to the +// data cluster. 
+// +// REGISTERING - the tenant has been created on the management cluster and is being created on the data cluster +// READY - the tenant has been created on both clusters, is active, and is consistent between the two clusters +// REMOVING - the tenant has been marked for removal and is being removed on the data cluster +// UPDATING_CONFIGURATION - the tenant configuration has changed on the management cluster and is being applied to the +// data cluster +// RENAMING_FROM - the tenant is being renamed to a new name and is awaiting the rename to complete on the data cluster +// RENAMING_TO - the tenant is being created as a rename from an existing tenant and is awaiting the rename to complete +// on the data cluster +// ERROR - the tenant is in an error state +// +// A tenant in any configuration is allowed to be removed. Only tenants in the READY or UPDATING_CONFIGURATION phases +// can have their configuration updated. A tenant must not exist or be in the REGISTERING phase to be created. To be +// renamed, a tenant must be in the READY or RENAMING_FROM state. In the latter case, the rename destination must match +// the original rename attempt. +// +// If an operation fails and the tenant is left in a non-ready state, re-running the same operation is legal. If +// successful, the tenant will return to the READY state. +enum class TenantState { REGISTERING, READY, REMOVING, UPDATING_CONFIGURATION, RENAMING_FROM, RENAMING_TO, ERROR }; + +// Represents the lock state the tenant could be in. +// Can be used in conjunction with the other tenant states above. +enum class TenantLockState { UNLOCKED, READ_ONLY, LOCKED }; + +constexpr int TENANT_PREFIX_SIZE = sizeof(int64_t); + +FDB_DECLARE_BOOLEAN_PARAM(EnforceValidTenantId); struct TenantMapEntry { constexpr static FileIdentifier file_identifier = 12247338; static Key idToPrefix(int64_t id); - static int64_t prefixToId(KeyRef prefix); + static int64_t prefixToId(KeyRef prefix, EnforceValidTenantId enforceTenantId = EnforceValidTenantId::True); - int64_t id; + static std::string tenantStateToString(TenantState tenantState); + static TenantState stringToTenantState(std::string stateStr); + + static std::string tenantLockStateToString(TenantLockState tenantState); + static TenantLockState stringToTenantLockState(std::string stateStr); + + int64_t id = -1; Key prefix; + TenantState tenantState = TenantState::READY; + TenantLockState tenantLockState = TenantLockState::UNLOCKED; + Optional tenantGroup; + bool encrypted = false; + Optional assignedCluster; + int64_t configurationSequenceNum = 0; + Optional renamePair; - constexpr static int ROOT_PREFIX_SIZE = sizeof(id); + // Can be set to an error string if the tenant is in the ERROR state + std::string error; -private: - void initPrefix(KeyRef subspace); + constexpr static int PREFIX_SIZE = sizeof(id); -public: TenantMapEntry(); - TenantMapEntry(int64_t id, KeyRef subspace); + TenantMapEntry(int64_t id, TenantState tenantState, bool encrypted); + TenantMapEntry(int64_t id, TenantState tenantState, Optional tenantGroup, bool encrypted); - Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion(ProtocolVersion::withTenants())); } + void setId(int64_t id); + std::string toJson() const; + bool matchesConfiguration(TenantMapEntry const& other) const; + void configure(Standalone parameter, Optional value); + + Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion()); } static TenantMapEntry decode(ValueRef const& value) { - TenantMapEntry entry; - ObjectReader 
reader(value.begin(), IncludeVersion(ProtocolVersion::withTenants())); - reader.deserialize(entry); - return entry; + return ObjectReader::fromStringRef(value, IncludeVersion()); } template void serialize(Ar& ar) { - KeyRef subspace; - if (ar.isDeserializing) { - serializer(ar, id, subspace); + serializer(ar, + id, + tenantState, + tenantLockState, + tenantGroup, + encrypted, + assignedCluster, + configurationSequenceNum, + renamePair, + error); + if constexpr (Ar::isDeserializing) { if (id >= 0) { - initPrefix(subspace); + prefix = idToPrefix(id); } - } else { - ASSERT(prefix.size() >= 8 || (prefix.empty() && id == -1)); - if (!prefix.empty()) { - subspace = prefix.substr(0, prefix.size() - 8); - } - serializer(ar, id, subspace); + ASSERT(tenantState >= TenantState::REGISTERING && tenantState <= TenantState::ERROR); } } }; -typedef VersionedMap TenantMap; -typedef VersionedMap TenantPrefixIndex; +struct TenantGroupEntry { + constexpr static FileIdentifier file_identifier = 10764222; -#endif \ No newline at end of file + Optional assignedCluster; + + TenantGroupEntry() = default; + TenantGroupEntry(Optional assignedCluster) : assignedCluster(assignedCluster) {} + + json_spirit::mObject toJson() const; + + Value encode() { return ObjectWriter::toValue(*this, IncludeVersion()); } + static TenantGroupEntry decode(ValueRef const& value) { + return ObjectReader::fromStringRef(value, IncludeVersion()); + } + + template + void serialize(Ar& ar) { + serializer(ar, assignedCluster); + } +}; + +struct TenantTombstoneCleanupData { + constexpr static FileIdentifier file_identifier = 3291339; + + // All tombstones have been erased up to and including this id. + // We should not generate new tombstones at IDs equal to or older than this. + int64_t tombstonesErasedThrough = -1; + + // The version at which we will next erase tombstones. + Version nextTombstoneEraseVersion = invalidVersion; + + // When we reach the nextTombstoneEraseVersion, we will erase tombstones up through this ID. 
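Because `TenantGroupEntry` above ships with both `encode()` and `decode()` shown in full, a round-trip check is easy to sketch; nothing beyond the entry itself and standard flow utilities is assumed:

```cpp
// Round-trip sketch for the new TenantGroupEntry.
TenantGroupEntry entry; // no assigned cluster
Value encoded = entry.encode();
TenantGroupEntry decoded = TenantGroupEntry::decode(encoded);
ASSERT(!decoded.assignedCluster.present());
```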
+ int64_t nextTombstoneEraseId = -1; + + template + void serialize(Ar& ar) { + serializer(ar, tombstonesErasedThrough, nextTombstoneEraseVersion, nextTombstoneEraseId); + } +}; + +struct TenantMetadataSpecification { + Key subspace; + + KeyBackedObjectMap tenantMap; + KeyBackedMap tenantIdIndex; + KeyBackedProperty lastTenantId; + KeyBackedBinaryValue tenantCount; + KeyBackedSet tenantTombstones; + KeyBackedObjectProperty tombstoneCleanupData; + KeyBackedSet tenantGroupTenantIndex; + KeyBackedObjectMap tenantGroupMap; + + TenantMetadataSpecification(KeyRef prefix) + : subspace(prefix.withSuffix("tenant/"_sr)), tenantMap(subspace.withSuffix("map/"_sr), IncludeVersion()), + tenantIdIndex(subspace.withSuffix("idIndex/"_sr)), lastTenantId(subspace.withSuffix("lastId"_sr)), + tenantCount(subspace.withSuffix("count"_sr)), tenantTombstones(subspace.withSuffix("tombstones/"_sr)), + tombstoneCleanupData(subspace.withSuffix("tombstoneCleanup"_sr), IncludeVersion()), + tenantGroupTenantIndex(subspace.withSuffix("tenantGroup/tenantIndex/"_sr)), + tenantGroupMap(subspace.withSuffix("tenantGroup/map/"_sr), IncludeVersion()) {} +}; + +struct TenantMetadata { + static TenantMetadataSpecification& instance(); + + static inline auto& subspace() { return instance().subspace; } + static inline auto& tenantMap() { return instance().tenantMap; } + static inline auto& tenantIdIndex() { return instance().tenantIdIndex; } + static inline auto& lastTenantId() { return instance().lastTenantId; } + static inline auto& tenantCount() { return instance().tenantCount; } + static inline auto& tenantTombstones() { return instance().tenantTombstones; } + static inline auto& tombstoneCleanupData() { return instance().tombstoneCleanupData; } + static inline auto& tenantGroupTenantIndex() { return instance().tenantGroupTenantIndex; } + static inline auto& tenantGroupMap() { return instance().tenantGroupMap; } + + static Key tenantMapPrivatePrefix(); +}; + +typedef VersionedMap TenantMap; +class TenantPrefixIndex : public VersionedMap, public ReferenceCounted {}; + +#endif diff --git a/fdbclient/include/fdbclient/TenantEntryCache.actor.h b/fdbclient/include/fdbclient/TenantEntryCache.actor.h new file mode 100644 index 0000000000..cd35c5a985 --- /dev/null +++ b/fdbclient/include/fdbclient/TenantEntryCache.actor.h @@ -0,0 +1,390 @@ +/* + * TenantEntryCache.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_TENANTENTRYCACHE_ACTOR_G_H) +#define FDBCLIENT_TENANTENTRYCACHE_ACTOR_G_H +#include "fdbclient/TenantEntryCache.actor.g.h" +#elif !defined(FDBCLIENT_TENANTENTRYCACHE_ACTOR_H) +#define FDBCLIENT_TENANTENTRYCACHE_ACTOR_H + +#pragma once + +#include "fdbclient/DatabaseContext.h" +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/FDBTypes.h" +#include "fdbclient/RunTransaction.actor.h" +#include "fdbclient/Tenant.h" +#include "fdbclient/TenantManagement.actor.h" +#include "fdbclient/Knobs.h" +#include "fdbrpc/TenantName.h" +#include "flow/IndexedSet.h" + +#include +#include + +#include "flow/actorcompiler.h" // has to be last include + +using TenantNameEntryPair = std::pair; +using TenantNameEntryPairVec = std::vector; + +enum class TenantEntryCacheRefreshReason { INIT = 1, PERIODIC_TASK = 2, CACHE_MISS = 3, REMOVE_ENTRY = 4 }; +enum class TenantEntryCacheRefreshMode { PERIODIC_TASK = 1, NONE = 2 }; + +template +struct TenantEntryCachePayload { + TenantName name; + TenantMapEntry entry; + // Custom client payload + T payload; +}; + +template +using TenantEntryCachePayloadFunc = std::function(const TenantName&, const TenantMapEntry&)>; + +// In-memory cache for TenantMapEntry objects. It supports three indices: +// 1. Lookup by 'TenantId' +// 2. Lookup by 'TenantPrefix' +// 3. Lookup by 'TenantName' +// +// TODO: +// ---- +// The cache allows the user to construct the 'cached object' by supplying a callback. The cache implements a periodic +// refresh mechanism, polling the underlying database for updates (add/remove tenants); in the future we might want to implement +// a database range-watch to monitor such updates. + +template +class TenantEntryCache : public ReferenceCounted>, NonCopyable { +private: + UID uid; + Database db; + TenantEntryCachePayloadFunc createPayloadFunc; + TenantEntryCacheRefreshMode refreshMode; + + Future refresher; + Map> mapByTenantId; + Map> mapByTenantName; + + CounterCollection metrics; + Counter hits; + Counter misses; + Counter refreshByCacheInit; + Counter refreshByCacheMiss; + Counter numRefreshes; + + ACTOR static Future getTenantList(Reference tr) { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + + KeyBackedRangeResult> tenantList = + wait(TenantMetadata::tenantMap().getRange( + tr, Optional(), Optional(), CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)); + ASSERT(tenantList.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER && !tenantList.more); + + TraceEvent(SevDebug, "TenantEntryCacheGetTenantList").detail("Count", tenantList.results.size()); + + return tenantList.results; + } + + static void updateCacheRefreshMetrics(TenantEntryCache* cache, TenantEntryCacheRefreshReason reason) { + if (reason == TenantEntryCacheRefreshReason::INIT) { + cache->refreshByCacheInit += 1; + } else if (reason == TenantEntryCacheRefreshReason::CACHE_MISS) { + cache->refreshByCacheMiss += 1; + } + + cache->numRefreshes += 1; + } + + ACTOR static Future refreshImpl(TenantEntryCache* cache, TenantEntryCacheRefreshReason reason) { + TraceEvent(SevDebug, "TenantEntryCacheRefreshStart", cache->id()).detail("Reason", static_cast(reason)); + + state Reference tr = cache->getDatabase()->createTransaction(); + loop { + try { + state TenantNameEntryPairVec tenantList = wait(getTenantList(tr)); + + // Refresh cache entries reflecting the latest database state + cache->clear(); + for (auto& tenant : tenantList) { + cache->put(tenant); + } + +
updateCacheRefreshMetrics(cache, reason); + break; + } catch (Error& e) { + if (e.code() != error_code_actor_cancelled) { + TraceEvent(SevInfo, "TenantEntryCacheRefreshError", cache->id()) + .errorUnsuppressed(e) + .suppressFor(1.0); + } + wait(tr->onError(e)); + } + } + + TraceEvent(SevDebug, "TenantEntryCacheRefreshEnd", cache->id()).detail("Reason", static_cast(reason)); + + return Void(); + } + + ACTOR static Future>> getByIdImpl(TenantEntryCache* cache, + int64_t tenantId) { + Optional> ret = cache->lookupById(tenantId); + if (ret.present()) { + cache->hits += 1; + return ret; + } + + TraceEvent(SevInfo, "TenantEntryCacheGetByIdRefresh").detail("TenantId", tenantId); + + // Entry not found. Refresh cacheEntries by scanning underlying KeyRange. + // TODO: Cache will implement a "KeyRange" watch, monitoring notification when a new entry gets added or any + // existing entry gets updated within the KeyRange of interest. Hence, misses would be very rare + wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS)); + + cache->misses += 1; + return cache->lookupById(tenantId); + } + + ACTOR static Future>> getByNameImpl(TenantEntryCache* cache, + TenantName name) { + Optional> ret = cache->lookupByName(name); + if (ret.present()) { + cache->hits += 1; + return ret; + } + + TraceEvent("TenantEntryCacheGetByNameRefresh").detail("TenantName", name); + + // Entry not found. Refresh cacheEntries by scanning underlying KeyRange. + // TODO: Cache will implement a "KeyRange" watch, monitoring notification when a new entry gets added or any + // existing entry gets updated within the KeyRange of interest. Hence, misses would be very rare + wait(refreshImpl(cache, TenantEntryCacheRefreshReason::CACHE_MISS)); + + cache->misses += 1; + return cache->lookupByName(name); + } + + Optional> lookupById(int64_t tenantId) { + Optional> ret; + auto itr = mapByTenantId.find(tenantId); + if (itr == mapByTenantId.end()) { + return ret; + } + + return itr->value; + } + + Optional> lookupByName(TenantName name) { + Optional> ret; + auto itr = mapByTenantName.find(name); + if (itr == mapByTenantName.end()) { + return ret; + } + + return itr->value; + } + + Future refresh(TenantEntryCacheRefreshReason reason) { return refreshImpl(this, reason); } + + static TenantEntryCachePayload defaultCreatePayload(const TenantName& name, const TenantMapEntry& entry) { + TenantEntryCachePayload payload; + payload.name = name; + payload.entry = entry; + + return payload; + } + + Future removeEntryInt(Optional tenantId, + Optional tenantPrefix, + Optional tenantName, + bool refreshCache) { + typename Map>::iterator itrId; + typename Map>::iterator itrName; + + if (tenantId.present() || tenantPrefix.present()) { + // Ensure either tenantId OR tenantPrefix is valid (but not both) + ASSERT(tenantId.present() != tenantPrefix.present()); + ASSERT(!tenantName.present()); + + int64_t tId = tenantId.present() ? 
tenantId.get() : TenantMapEntry::prefixToId(tenantPrefix.get()); + TraceEvent("TenantEntryCacheRemoveEntry").detail("Id", tId); + itrId = mapByTenantId.find(tId); + if (itrId == mapByTenantId.end()) { + return Void(); + } + // Ensure byId and byName cache are in-sync + itrName = mapByTenantName.find(itrId->value.name); + ASSERT(itrName != mapByTenantName.end()); + } else if (tenantName.present()) { + ASSERT(!tenantId.present() && !tenantPrefix.present()); + + TraceEvent("TenantEntryCacheRemoveEntry").detail("Name", tenantName.get()); + itrName = mapByTenantName.find(tenantName.get()); + if (itrName == mapByTenantName.end()) { + return Void(); + } + // Ensure byId and byName cache are in-sync + itrId = mapByTenantId.find(itrName->value.entry.id); + ASSERT(itrId != mapByTenantId.end()); + } else { + // Invalid input, one of: tenantId, tenantPrefix or tenantName needs to be valid. + throw operation_failed(); + } + + ASSERT(itrId != mapByTenantId.end() && itrName != mapByTenantName.end()); + + TraceEvent("TenantEntryCacheRemoveEntry") + .detail("Id", itrId->key) + .detail("Prefix", itrId->value.entry.prefix) + .detail("Name", itrName->key); + + mapByTenantId.erase(itrId); + mapByTenantName.erase(itrName); + + if (refreshCache) { + return refreshImpl(this, TenantEntryCacheRefreshReason::REMOVE_ENTRY); + } + + return Void(); + } + +public: + TenantEntryCache(Database db) + : uid(deterministicRandom()->randomUniqueID()), db(db), createPayloadFunc(defaultCreatePayload), + refreshMode(TenantEntryCacheRefreshMode::PERIODIC_TASK), metrics("TenantEntryCacheMetrics", uid.toString()), + hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics), + refreshByCacheInit("TenantEntryCacheRefreshInit", metrics), + refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics), + numRefreshes("TenantEntryCacheNumRefreshes", metrics) { + TraceEvent("TenantEntryCacheCreatedDefaultFunc", uid); + } + + TenantEntryCache(Database db, TenantEntryCachePayloadFunc fn) + : uid(deterministicRandom()->randomUniqueID()), db(db), createPayloadFunc(fn), + refreshMode(TenantEntryCacheRefreshMode::PERIODIC_TASK), metrics("TenantEntryCacheMetrics", uid.toString()), + hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics), + refreshByCacheInit("TenantEntryCacheRefreshInit", metrics), + refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics), + numRefreshes("TenantEntryCacheNumRefreshes", metrics) { + TraceEvent("TenantEntryCacheCreated", uid); + } + + TenantEntryCache(Database db, UID id, TenantEntryCachePayloadFunc fn) + : uid(id), db(db), createPayloadFunc(fn), refreshMode(TenantEntryCacheRefreshMode::PERIODIC_TASK), + metrics("TenantEntryCacheMetrics", uid.toString()), hits("TenantEntryCacheHits", metrics), + misses("TenantEntryCacheMisses", metrics), refreshByCacheInit("TenantEntryCacheRefreshInit", metrics), + refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics), + numRefreshes("TenantEntryCacheNumRefreshes", metrics) { + TraceEvent("TenantEntryCacheCreated", uid); + } + + TenantEntryCache(Database db, UID id, TenantEntryCachePayloadFunc fn, TenantEntryCacheRefreshMode mode) + : uid(id), db(db), createPayloadFunc(fn), refreshMode(mode), metrics("TenantEntryCacheMetrics", uid.toString()), + hits("TenantEntryCacheHits", metrics), misses("TenantEntryCacheMisses", metrics), + refreshByCacheInit("TenantEntryCacheRefreshInit", metrics), + refreshByCacheMiss("TenantEntryCacheRefreshMiss", metrics), + numRefreshes("TenantEntryCacheNumRefreshes", metrics) { + 
TraceEvent("TenantEntryCacheCreated", uid); + } + + Future init() { + TraceEvent("TenantEntryCacheInit", uid); + + Future f = refreshImpl(this, TenantEntryCacheRefreshReason::INIT); + + // Launch reaper task to periodically refresh cache by scanning database KeyRange + TenantEntryCacheRefreshReason reason = TenantEntryCacheRefreshReason::PERIODIC_TASK; + if (refreshMode == TenantEntryCacheRefreshMode::PERIODIC_TASK) { + refresher = recurringAsync([&, reason]() { return refresh(reason); }, + CLIENT_KNOBS->TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, /* interval */ + true, /* absoluteIntervalDelay */ + CLIENT_KNOBS->TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, /* intialDelay */ + TaskPriority::Worker); + } + + return f; + } + + Database getDatabase() const { return db; } + UID id() const { return uid; } + + void clear() { + mapByTenantId.clear(); + mapByTenantName.clear(); + } + + Future removeEntryById(int64_t tenantId, bool refreshCache = false) { + return removeEntryInt(tenantId, Optional(), Optional(), refreshCache); + } + Future removeEntryByPrefix(KeyRef tenantPrefix, bool refreshCache = false) { + return removeEntryInt(Optional(), tenantPrefix, Optional(), refreshCache); + } + Future removeEntryByName(TenantName tenantName, bool refreshCache = false) { + return removeEntryInt(Optional(), Optional(), tenantName, refreshCache); + } + + void put(const TenantNameEntryPair& pair) { + TenantEntryCachePayload payload = createPayloadFunc(pair.first, pair.second); + auto idItr = mapByTenantId.find(pair.second.id); + auto nameItr = mapByTenantName.find(pair.first); + + Optional existingName; + Optional existingId; + if (nameItr != mapByTenantName.end()) { + existingId = nameItr->value.entry.id; + mapByTenantId.erase(nameItr->value.entry.id); + } + if (idItr != mapByTenantId.end()) { + existingName = idItr->value.name; + mapByTenantName.erase(idItr->value.name); + } + + mapByTenantId[pair.second.id] = payload; + mapByTenantName[pair.first] = payload; + + TraceEvent("TenantEntryCachePut") + .detail("TenantName", pair.first) + .detail("TenantNameExisting", existingName) + .detail("TenantID", pair.second.id) + .detail("TenantIDExisting", existingId) + .detail("TenantPrefix", pair.second.prefix); + + CODE_PROBE(idItr == mapByTenantId.end() && nameItr == mapByTenantName.end(), "TenantCache new entry"); + CODE_PROBE(idItr != mapByTenantId.end() && nameItr == mapByTenantName.end(), "TenantCache entry name updated"); + CODE_PROBE(idItr == mapByTenantId.end() && nameItr != mapByTenantName.end(), "TenantCache entry id updated"); + CODE_PROBE(idItr != mapByTenantId.end() && nameItr != mapByTenantName.end(), + "TenantCache entry id and name updated"); + } + + Future>> getById(int64_t tenantId) { return getByIdImpl(this, tenantId); } + Future>> getByPrefix(KeyRef prefix) { + int64_t id = TenantMapEntry::prefixToId(prefix); + return getByIdImpl(this, id); + } + Future>> getByName(TenantName name) { return getByNameImpl(this, name); } + + // Counter access APIs + Counter::Value numCacheRefreshes() const { return numRefreshes.getValue(); } + Counter::Value numRefreshByMisses() const { return refreshByCacheMiss.getValue(); } + Counter::Value numRefreshByInit() const { return refreshByCacheInit.getValue(); } +}; + +#include "flow/unactorcompiler.h" +#endif // FDBCLIENT_TENANTENTRYCACHE_ACTOR_H \ No newline at end of file diff --git a/fdbclient/include/fdbclient/TenantManagement.actor.h b/fdbclient/include/fdbclient/TenantManagement.actor.h index 680edccc05..6e91c8fb90 100644 --- 
a/fdbclient/include/fdbclient/TenantManagement.actor.h +++ b/fdbclient/include/fdbclient/TenantManagement.actor.h @@ -19,7 +19,9 @@ */ #pragma once +#include "fdbclient/ClientBooleanParams.h" #include "flow/IRandom.h" +#include "flow/ThreadHelper.actor.h" #if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_TENANT_MANAGEMENT_ACTOR_G_H) #define FDBCLIENT_TENANT_MANAGEMENT_ACTOR_G_H #include "fdbclient/TenantManagement.actor.g.h" @@ -29,19 +31,16 @@ #include #include #include "fdbclient/GenericTransactionHelper.h" +#include "fdbclient/Metacluster.h" #include "fdbclient/SystemData.h" #include "flow/actorcompiler.h" // has to be last include namespace TenantAPI { -ACTOR template + +template Future> tryGetTenantTransaction(Transaction tr, TenantName name) { - state Key tenantMapKey = name.withPrefix(tenantMapPrefix); - tr->setOption(FDBTransactionOptions::RAW_ACCESS); - - state typename transaction_future_type>::type tenantFuture = tr->get(tenantMapKey); - Optional val = wait(safeThreadFutureToFuture(tenantFuture)); - return val.map([](Optional v) { return TenantMapEntry::decode(v.get()); }); + return TenantMetadata::tenantMap().get(tr, name); } ACTOR template @@ -80,67 +79,133 @@ Future getTenant(Reference db, TenantName name) { return entry.get(); } -// Creates a tenant with the given name. If the tenant already exists, an empty optional will be returned. -// The caller must enforce that the tenant ID be unique from all current and past tenants, and it must also be unique -// from all other tenants created in the same transaction. ACTOR template -Future> createTenantTransaction(Transaction tr, TenantNameRef name, int64_t tenantId) { - state Key tenantMapKey = name.withPrefix(tenantMapPrefix); +Future getClusterType(Transaction tr) { + Optional metaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + + return metaclusterRegistration.present() ? metaclusterRegistration.get().clusterType : ClusterType::STANDALONE; +} + +ACTOR template +Future checkTenantMode(Transaction tr, ClusterType expectedClusterType) { + state typename transaction_future_type>::type tenantModeFuture = + tr->get(configKeysPrefix.withSuffix("tenant_mode"_sr)); + + state ClusterType actualClusterType = wait(getClusterType(tr)); + Optional tenantModeValue = wait(safeThreadFutureToFuture(tenantModeFuture)); + + TenantMode tenantMode = TenantMode::fromValue(tenantModeValue.castTo()); + if (actualClusterType != expectedClusterType) { + throw invalid_metacluster_operation(); + } else if (actualClusterType == ClusterType::STANDALONE && tenantMode == TenantMode::DISABLED) { + throw tenants_disabled(); + } + + return Void(); +} + +TenantMode tenantModeForClusterType(ClusterType clusterType, TenantMode tenantMode); + +// Returns true if the specified ID has already been deleted and false if not. If the ID is old enough +// that we no longer keep tombstones for it, an error is thrown. +ACTOR template +Future checkTombstone(Transaction tr, int64_t id) { + state Future tombstoneFuture = TenantMetadata::tenantTombstones().exists(tr, id); + + // If we are trying to create a tenant older than the oldest tombstones we still maintain, then we fail it + // with an error. 
+ Optional tombstoneCleanupData = wait(TenantMetadata::tombstoneCleanupData().get(tr)); + if (tombstoneCleanupData.present() && tombstoneCleanupData.get().tombstonesErasedThrough >= id) { + throw tenant_creation_permanently_failed(); + } + + state bool hasTombstone = wait(tombstoneFuture); + return hasTombstone; +} + +// Creates a tenant with the given name. If the tenant already exists, the boolean return parameter will be false +// and the existing entry will be returned. If the tenant cannot be created, then the optional will be empty. +ACTOR template +Future, bool>> createTenantTransaction( + Transaction tr, + TenantNameRef name, + TenantMapEntry tenantEntry, + ClusterType clusterType = ClusterType::STANDALONE) { + + ASSERT(clusterType != ClusterType::METACLUSTER_MANAGEMENT); + ASSERT(tenantEntry.id >= 0); if (name.startsWith("\xff"_sr)) { throw invalid_tenant_name(); } + if (tenantEntry.tenantGroup.present() && tenantEntry.tenantGroup.get().startsWith("\xff"_sr)) { + throw invalid_tenant_group_name(); + } tr->setOption(FDBTransactionOptions::RAW_ACCESS); - state Future> tenantEntryFuture = tryGetTenantTransaction(tr, name); - state typename transaction_future_type>::type tenantDataPrefixFuture = - tr->get(tenantDataPrefixKey); - state typename transaction_future_type>::type tenantModeFuture = - tr->get(configKeysPrefix.withSuffix("tenant_mode"_sr)); - - Optional tenantMode = wait(safeThreadFutureToFuture(tenantModeFuture)); - - if (!tenantMode.present() || tenantMode.get() == StringRef(format("%d", TenantMode::DISABLED))) { - throw tenants_disabled(); + state Future> existingEntryFuture = tryGetTenantTransaction(tr, name); + state Future tenantModeCheck = checkTenantMode(tr, clusterType); + state Future tombstoneFuture = + (clusterType == ClusterType::STANDALONE) ? false : checkTombstone(tr, tenantEntry.id); + state Future> existingTenantGroupEntryFuture; + if (tenantEntry.tenantGroup.present()) { + existingTenantGroupEntryFuture = TenantMetadata::tenantGroupMap().get(tr, tenantEntry.tenantGroup.get()); } - Optional tenantEntry = wait(tenantEntryFuture); - if (tenantEntry.present()) { - return std::make_pair(tenantEntry.get(), false); + wait(tenantModeCheck); + Optional existingEntry = wait(existingEntryFuture); + if (existingEntry.present()) { + return std::make_pair(existingEntry.get(), false); } - Optional tenantDataPrefix = wait(safeThreadFutureToFuture(tenantDataPrefixFuture)); - if (tenantDataPrefix.present() && - tenantDataPrefix.get().size() + TenantMapEntry::ROOT_PREFIX_SIZE > CLIENT_KNOBS->TENANT_PREFIX_SIZE_LIMIT) { - TraceEvent(SevWarnAlways, "TenantPrefixTooLarge") - .detail("TenantSubspace", tenantDataPrefix.get()) - .detail("TenantSubspaceLength", tenantDataPrefix.get().size()) - .detail("RootPrefixLength", TenantMapEntry::ROOT_PREFIX_SIZE) - .detail("MaxTenantPrefixSize", CLIENT_KNOBS->TENANT_PREFIX_SIZE_LIMIT); - - throw client_invalid_operation(); + state bool hasTombstone = wait(tombstoneFuture); + if (hasTombstone) { + return std::make_pair(Optional(), false); } - state TenantMapEntry newTenant(tenantId, tenantDataPrefix.present() ? 
(KeyRef)tenantDataPrefix.get() : ""_sr); - state typename transaction_future_type::type prefixRangeFuture = - tr->getRange(prefixRange(newTenant.prefix), 1); + tr->getRange(prefixRange(tenantEntry.prefix), 1); + RangeResult contents = wait(safeThreadFutureToFuture(prefixRangeFuture)); if (!contents.empty()) { throw tenant_prefix_allocator_conflict(); } - tr->set(tenantMapKey, newTenant.encode()); + tenantEntry.tenantState = TenantState::READY; + tenantEntry.assignedCluster = Optional(); - return std::make_pair(newTenant, true); + TenantMetadata::tenantMap().set(tr, name, tenantEntry); + TenantMetadata::tenantIdIndex().set(tr, tenantEntry.id, name); + + if (tenantEntry.tenantGroup.present()) { + TenantMetadata::tenantGroupTenantIndex().insert(tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), name)); + + // Create the tenant group associated with this tenant if it doesn't already exist + Optional existingTenantGroup = wait(existingTenantGroupEntryFuture); + if (!existingTenantGroup.present()) { + TenantMetadata::tenantGroupMap().set(tr, tenantEntry.tenantGroup.get(), TenantGroupEntry()); + } + } + + // This is idempotent because we only add an entry to the tenant map if it isn't already there + TenantMetadata::tenantCount().atomicOp(tr, 1, MutationRef::AddValue); + + // Read the tenant count after incrementing the counter so that simultaneous attempts to create + // tenants in the same transaction are properly reflected. + int64_t tenantCount = wait(TenantMetadata::tenantCount().getD(tr, Snapshot::False, 0)); + if (tenantCount > CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER) { + throw cluster_no_capacity(); + } + + return std::make_pair(tenantEntry, true); } ACTOR template Future getNextTenantId(Transaction tr) { - state typename transaction_future_type>::type lastIdFuture = tr->get(tenantLastIdKey); - Optional lastIdVal = wait(safeThreadFutureToFuture(lastIdFuture)); - int64_t tenantId = lastIdVal.present() ? 
TenantMapEntry::prefixToId(lastIdVal.get()) + 1 : 0; + Optional lastId = wait(TenantMetadata::lastTenantId().get(tr)); + int64_t tenantId = lastId.orDefault(-1) + 1; if (BUGGIFY) { tenantId += deterministicRandom()->randomSkewedUInt32(1, 1e9); } @@ -148,37 +213,56 @@ Future getNextTenantId(Transaction tr) { } ACTOR template -Future createTenant(Reference db, TenantName name) { +Future> createTenant(Reference db, + TenantName name, + TenantMapEntry tenantEntry = TenantMapEntry(), + ClusterType clusterType = ClusterType::STANDALONE) { state Reference tr = db->createTransaction(); - state bool firstTry = true; + state bool checkExistence = clusterType != ClusterType::METACLUSTER_DATA; + state bool generateTenantId = tenantEntry.id < 0; + + ASSERT(clusterType == ClusterType::STANDALONE || !generateTenantId); + loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - state Future tenantIdFuture = getNextTenantId(tr); + state Future tenantIdFuture; + if (generateTenantId) { + tenantIdFuture = getNextTenantId(tr); + } - if (firstTry) { + if (checkExistence) { Optional entry = wait(tryGetTenantTransaction(tr, name)); if (entry.present()) { throw tenant_already_exists(); } - firstTry = false; + checkExistence = false; } - int64_t tenantId = wait(tenantIdFuture); - tr->set(tenantLastIdKey, TenantMapEntry::idToPrefix(tenantId)); - state std::pair newTenant = wait(createTenantTransaction(tr, name, tenantId)); + if (generateTenantId) { + int64_t tenantId = wait(tenantIdFuture); + tenantEntry.setId(tenantId); + TenantMetadata::lastTenantId().set(tr, tenantId); + } - wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + state std::pair, bool> newTenant = + wait(createTenantTransaction(tr, name, tenantEntry, clusterType)); - TraceEvent("CreatedTenant") - .detail("Tenant", name) - .detail("TenantId", newTenant.first.id) - .detail("Prefix", newTenant.first.prefix) - .detail("Version", tr->getCommittedVersion()); + if (newTenant.second) { + ASSERT(newTenant.first.present()); + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + + TraceEvent("CreatedTenant") + .detail("Tenant", name) + .detail("TenantId", newTenant.first.get().id) + .detail("Prefix", newTenant.first.get().prefix) + .detail("TenantGroup", tenantEntry.tenantGroup) + .detail("Version", tr->getCommittedVersion()); + } return newTenant.first; } catch (Error& e) { @@ -188,48 +272,132 @@ Future createTenant(Reference db, TenantName name) { } ACTOR template -Future deleteTenantTransaction(Transaction tr, TenantNameRef name) { - state Key tenantMapKey = name.withPrefix(tenantMapPrefix); +Future markTenantTombstones(Transaction tr, int64_t tenantId) { + // In data clusters, we store a tombstone + state Future> latestTombstoneFuture = + TenantMetadata::tenantTombstones().getRange(tr, {}, {}, 1, Snapshot::False, Reverse::True); + state Optional cleanupData = wait(TenantMetadata::tombstoneCleanupData().get(tr)); + state Version transactionReadVersion = wait(safeThreadFutureToFuture(tr->getReadVersion())); + + // If it has been long enough since we last cleaned up the tenant tombstones, we do that first + if (!cleanupData.present() || cleanupData.get().nextTombstoneEraseVersion <= transactionReadVersion) { + state int64_t deleteThroughId = cleanupData.present() ? 
cleanupData.get().nextTombstoneEraseId : -1; + // Delete all tombstones up through the one currently marked in the cleanup data + if (deleteThroughId >= 0) { + TenantMetadata::tenantTombstones().erase(tr, 0, deleteThroughId + 1); + } + + KeyBackedRangeResult latestTombstone = wait(latestTombstoneFuture); + int64_t nextDeleteThroughId = std::max(deleteThroughId, tenantId); + if (!latestTombstone.results.empty()) { + nextDeleteThroughId = std::max(nextDeleteThroughId, latestTombstone.results[0]); + } + + // The next cleanup will happen at or after TENANT_TOMBSTONE_CLEANUP_INTERVAL seconds have elapsed and + // will clean up tombstones through the most recently allocated ID. + TenantTombstoneCleanupData updatedCleanupData; + updatedCleanupData.tombstonesErasedThrough = deleteThroughId; + updatedCleanupData.nextTombstoneEraseId = nextDeleteThroughId; + updatedCleanupData.nextTombstoneEraseVersion = + transactionReadVersion + + CLIENT_KNOBS->TENANT_TOMBSTONE_CLEANUP_INTERVAL * CLIENT_KNOBS->VERSIONS_PER_SECOND; + + TenantMetadata::tombstoneCleanupData().set(tr, updatedCleanupData); + + // If the tenant being deleted is within the tombstone window, record the tombstone + if (tenantId > updatedCleanupData.tombstonesErasedThrough) { + TenantMetadata::tenantTombstones().insert(tr, tenantId); + } + } else if (tenantId > cleanupData.get().tombstonesErasedThrough) { + // If the tenant being deleted is within the tombstone window, record the tombstone + TenantMetadata::tenantTombstones().insert(tr, tenantId); + } + return Void(); +} + +// Deletes the tenant with the given name. If tenantId is specified, the tenant being deleted must also have the same +// ID. If no matching tenant is found, this function returns without deleting anything. This behavior allows the +// function to be used idempotently: if the transaction is retried after having succeeded, it will see that the tenant +// is absent (or optionally created with a new ID) and do nothing. 
+ACTOR template +Future deleteTenantTransaction(Transaction tr, + TenantNameRef name, + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE) { + ASSERT(clusterType == ClusterType::STANDALONE || tenantId.present()); + ASSERT(clusterType != ClusterType::METACLUSTER_MANAGEMENT); tr->setOption(FDBTransactionOptions::RAW_ACCESS); - state Optional tenantEntry = wait(tryGetTenantTransaction(tr, name)); - if (!tenantEntry.present()) { - return Void(); + state Future> tenantEntryFuture = tryGetTenantTransaction(tr, name); + wait(checkTenantMode(tr, clusterType)); + + state Optional tenantEntry = wait(tenantEntryFuture); + if (tenantEntry.present() && (!tenantId.present() || tenantEntry.get().id == tenantId.get())) { + state typename transaction_future_type::type prefixRangeFuture = + tr->getRange(prefixRange(tenantEntry.get().prefix), 1); + + RangeResult contents = wait(safeThreadFutureToFuture(prefixRangeFuture)); + if (!contents.empty()) { + throw tenant_not_empty(); + } + + // This is idempotent because we only erase an entry from the tenant map if it is present + TenantMetadata::tenantMap().erase(tr, name); + TenantMetadata::tenantIdIndex().erase(tr, tenantEntry.get().id); + TenantMetadata::tenantCount().atomicOp(tr, -1, MutationRef::AddValue); + + if (tenantEntry.get().tenantGroup.present()) { + TenantMetadata::tenantGroupTenantIndex().erase(tr, + Tuple::makeTuple(tenantEntry.get().tenantGroup.get(), name)); + KeyBackedSet::RangeResultType tenantsInGroup = + wait(TenantMetadata::tenantGroupTenantIndex().getRange( + tr, + Tuple::makeTuple(tenantEntry.get().tenantGroup.get()), + Tuple::makeTuple(keyAfter(tenantEntry.get().tenantGroup.get())), + 2)); + if (tenantsInGroup.results.empty() || + (tenantsInGroup.results.size() == 1 && tenantsInGroup.results[0].getString(1) == name)) { + TenantMetadata::tenantGroupMap().erase(tr, tenantEntry.get().tenantGroup.get()); + } + } } - state typename transaction_future_type::type prefixRangeFuture = - tr->getRange(prefixRange(tenantEntry.get().prefix), 1); - RangeResult contents = wait(safeThreadFutureToFuture(prefixRangeFuture)); - if (!contents.empty()) { - throw tenant_not_empty(); + if (clusterType == ClusterType::METACLUSTER_DATA) { + wait(markTenantTombstones(tr, tenantId.get())); } - tr->clear(tenantMapKey); - return Void(); } +// Deletes the tenant with the given name. If tenantId is specified, the tenant being deleted must also have the same +// ID. ACTOR template -Future deleteTenant(Reference db, TenantName name) { +Future deleteTenant(Reference db, + TenantName name, + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE) { state Reference tr = db->createTransaction(); - state bool firstTry = true; + state bool checkExistence = clusterType == ClusterType::STANDALONE; loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - if (firstTry) { - Optional entry = wait(tryGetTenantTransaction(tr, name)); - if (!entry.present()) { - throw tenant_not_found(); + if (checkExistence) { + TenantMapEntry entry = wait(getTenantTransaction(tr, name)); + + // If an ID wasn't specified, use the current ID. This way we cannot inadvertently delete + // multiple tenants if this transaction retries. 
+ if (!tenantId.present()) { + tenantId = entry.id; } - firstTry = false; + checkExistence = false; } - wait(deleteTenantTransaction(tr, name)); + wait(deleteTenantTransaction(tr, name, tenantId, clusterType)); wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); TraceEvent("DeletedTenant").detail("Tenant", name).detail("Version", tr->getCommittedVersion()); @@ -240,39 +408,84 @@ Future deleteTenant(Reference db, TenantName name) { } } +// This should only be called from a transaction that has already confirmed that the tenant entry +// is present. The tenantEntry should start with the existing entry and modify only those fields that need +// to be changed. This must only be called on a non-management cluster. ACTOR template -Future> listTenantsTransaction(Transaction tr, - TenantNameRef begin, - TenantNameRef end, - int limit) { - state KeyRange range = KeyRangeRef(begin, end).withPrefix(tenantMapPrefix); +Future configureTenantTransaction(Transaction tr, + TenantNameRef tenantName, + TenantMapEntry originalEntry, + TenantMapEntry updatedTenantEntry) { + ASSERT(updatedTenantEntry.id == originalEntry.id); tr->setOption(FDBTransactionOptions::RAW_ACCESS); + TenantMetadata::tenantMap().set(tr, tenantName, updatedTenantEntry); - state typename transaction_future_type::type listFuture = - tr->getRange(firstGreaterOrEqual(range.begin), firstGreaterOrEqual(range.end), limit); - RangeResult results = wait(safeThreadFutureToFuture(listFuture)); + // If the tenant group was changed, we need to update the tenant group metadata structures + if (originalEntry.tenantGroup != updatedTenantEntry.tenantGroup) { + if (updatedTenantEntry.tenantGroup.present() && updatedTenantEntry.tenantGroup.get().startsWith("\xff"_sr)) { + throw invalid_tenant_group_name(); + } + if (originalEntry.tenantGroup.present()) { + // Remove this tenant from the original tenant group index + TenantMetadata::tenantGroupTenantIndex().erase( + tr, Tuple::makeTuple(originalEntry.tenantGroup.get(), tenantName)); - std::map tenants; - for (auto kv : results) { - tenants[kv.key.removePrefix(tenantMapPrefix)] = TenantMapEntry::decode(kv.value); + // Check if the original tenant group is now empty. If so, remove the tenant group. 
+ KeyBackedSet::RangeResultType tenants = wait(TenantMetadata::tenantGroupTenantIndex().getRange( + tr, + Tuple::makeTuple(originalEntry.tenantGroup.get()), + Tuple::makeTuple(keyAfter(originalEntry.tenantGroup.get())), + 2)); + + if (tenants.results.empty() || + (tenants.results.size() == 1 && tenants.results[0].getString(1) == tenantName)) { + TenantMetadata::tenantGroupMap().erase(tr, originalEntry.tenantGroup.get()); + } + } + if (updatedTenantEntry.tenantGroup.present()) { + // If this is creating a new tenant group, add it to the tenant group map + Optional entry = + wait(TenantMetadata::tenantGroupMap().get(tr, updatedTenantEntry.tenantGroup.get())); + if (!entry.present()) { + TenantMetadata::tenantGroupMap().set(tr, updatedTenantEntry.tenantGroup.get(), TenantGroupEntry()); + } + + // Insert this tenant in the tenant group index + TenantMetadata::tenantGroupTenantIndex().insert( + tr, Tuple::makeTuple(updatedTenantEntry.tenantGroup.get(), tenantName)); + } } - return tenants; + return Void(); +} + +ACTOR template +Future>> listTenantsTransaction(Transaction tr, + TenantName begin, + TenantName end, + int limit) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + KeyBackedRangeResult> results = + wait(TenantMetadata::tenantMap().getRange(tr, begin, end, limit)); + + return results.results; } ACTOR template -Future> listTenants(Reference db, - TenantName begin, - TenantName end, - int limit) { +Future>> listTenants(Reference db, + TenantName begin, + TenantName end, + int limit) { state Reference tr = db->createTransaction(); loop { try { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); - std::map tenants = wait(listTenantsTransaction(tr, begin, end, limit)); + std::vector> tenants = + wait(listTenantsTransaction(tr, begin, end, limit)); return tenants; } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); @@ -280,12 +493,60 @@ Future> listTenants(Reference db, } } -ACTOR template -Future renameTenant(Reference db, TenantName oldName, TenantName newName) { - state Reference tr = db->createTransaction(); +ACTOR template +Future renameTenantTransaction(Transaction tr, + TenantName oldName, + TenantName newName, + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE, + Optional configureSequenceNum = Optional()) { + ASSERT(clusterType == ClusterType::STANDALONE || (tenantId.present() && configureSequenceNum.present())); + ASSERT(clusterType != ClusterType::METACLUSTER_MANAGEMENT); + wait(checkTenantMode(tr, clusterType)); + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + state Optional oldEntry; + state Optional newEntry; + wait(store(oldEntry, tryGetTenantTransaction(tr, oldName)) && + store(newEntry, tryGetTenantTransaction(tr, newName))); + if (!oldEntry.present() || (tenantId.present() && tenantId.get() != oldEntry.get().id)) { + throw tenant_not_found(); + } + if (newEntry.present()) { + throw tenant_already_exists(); + } + if (configureSequenceNum.present()) { + if (oldEntry.get().configurationSequenceNum >= configureSequenceNum.get()) { + return Void(); + } + oldEntry.get().configurationSequenceNum = configureSequenceNum.get(); + } + TenantMetadata::tenantMap().erase(tr, oldName); + TenantMetadata::tenantMap().set(tr, newName, oldEntry.get()); + TenantMetadata::tenantIdIndex().set(tr, oldEntry.get().id, newName); + + // Update the tenant group index to reflect the new tenant name + if (oldEntry.get().tenantGroup.present()) { + 
TenantMetadata::tenantGroupTenantIndex().erase(tr, Tuple::makeTuple(oldEntry.get().tenantGroup.get(), oldName)); + TenantMetadata::tenantGroupTenantIndex().insert(tr, + Tuple::makeTuple(oldEntry.get().tenantGroup.get(), newName)); + } + + if (clusterType == ClusterType::METACLUSTER_DATA) { + wait(markTenantTombstones(tr, tenantId.get())); + } + + return Void(); +} + +ACTOR template +Future renameTenant(Reference db, + TenantName oldName, + TenantName newName, + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE) { + state Reference tr = db->createTransaction(); + ASSERT(clusterType == ClusterType::STANDALONE || tenantId.present()); - state Key oldNameKey = oldName.withPrefix(tenantMapPrefix); - state Key newNameKey = newName.withPrefix(tenantMapPrefix); state bool firstTry = true; state int64_t id; loop { @@ -328,9 +589,8 @@ Future renameTenant(Reference db, TenantName oldName, TenantName newNa throw tenant_not_found(); } } - tr->clear(oldNameKey); - tr->set(newNameKey, oldEntry.get().encode()); - wait(safeThreadFutureToFuture(tr->commit())); + wait(renameTenantTransaction(tr, oldName, newName, tenantId, clusterType)); + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); TraceEvent("RenameTenantSuccess").detail("OldName", oldName).detail("NewName", newName); return Void(); } catch (Error& e) { @@ -338,6 +598,62 @@ Future renameTenant(Reference db, TenantName oldName, TenantName newNa } } } + +template +Future> tryGetTenantGroupTransaction(Transaction tr, TenantGroupName name) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + return TenantMetadata::tenantGroupMap().get(tr, name); +} + +ACTOR template +Future> tryGetTenantGroup(Reference db, TenantGroupName name) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + Optional entry = wait(tryGetTenantGroupTransaction(tr, name)); + return entry; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +ACTOR template +Future>> listTenantGroupsTransaction(Transaction tr, + TenantGroupName begin, + TenantGroupName end, + int limit) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + KeyBackedRangeResult> results = + wait(TenantMetadata::tenantGroupMap().getRange(tr, begin, end, limit)); + + return results.results; +} + +ACTOR template +Future>> listTenantGroups(Reference db, + TenantGroupName begin, + TenantGroupName end, + int limit) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + std::vector> tenantGroups = + wait(listTenantGroupsTransaction(tr, begin, end, limit)); + return tenantGroups; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + } // namespace TenantAPI #include "flow/unactorcompiler.h" diff --git a/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h b/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h index 73362deb45..cf190fc77d 100644 --- a/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h +++ b/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h @@ -31,12 +31,11 @@ #include "fdbclient/DatabaseContext.h" #include "fdbclient/SpecialKeySpace.actor.h" #include "fdbclient/TenantManagement.actor.h" -#include "fdbclient/libb64/encode.h" +#include "fdbclient/Tuple.h" #include "flow/Arena.h" #include "flow/UnitTest.h" #include "flow/actorcompiler.h" // 
This must be the last #include. -template class TenantRangeImpl : public SpecialKeyRangeRWImpl { private: static KeyRangeRef removePrefix(KeyRangeRef range, KeyRef prefix, KeyRef defaultEnd) { @@ -53,15 +52,14 @@ private: static KeyRef withTenantMapPrefix(KeyRef key, Arena& ar) { int keySize = SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin.size() + - TenantRangeImpl::submoduleRange.begin.size() + TenantRangeImpl::mapSubRange.begin.size() + - key.size(); + submoduleRange.begin.size() + mapSubRange.begin.size() + key.size(); KeyRef prefixedKey = makeString(keySize, ar); uint8_t* mutableKey = mutateString(prefixedKey); mutableKey = SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin.copyTo(mutableKey); - mutableKey = TenantRangeImpl::submoduleRange.begin.copyTo(mutableKey); - mutableKey = TenantRangeImpl::mapSubRange.begin.copyTo(mutableKey); + mutableKey = submoduleRange.begin.copyTo(mutableKey); + mutableKey = mapSubRange.begin.copyTo(mutableKey); key.copyTo(mutableKey); return prefixedKey; @@ -71,27 +69,12 @@ private: KeyRangeRef kr, RangeResult* results, GetRangeLimits limitsHint) { - std::map tenants = + std::vector> tenants = wait(TenantAPI::listTenantsTransaction(&ryw->getTransaction(), kr.begin, kr.end, limitsHint.rows)); for (auto tenant : tenants) { - json_spirit::mObject tenantEntry; - tenantEntry["id"] = tenant.second.id; - if (ryw->getDatabase()->apiVersionAtLeast(720)) { - json_spirit::mObject prefixObject; - std::string encodedPrefix = base64::encoder::from_string(tenant.second.prefix.toString()); - // Remove trailing newline - encodedPrefix.resize(encodedPrefix.size() - 1); - - prefixObject["base64"] = encodedPrefix; - prefixObject["printable"] = printable(tenant.second.prefix); - tenantEntry["prefix"] = prefixObject; - } else { - // This is not a standard encoding in JSON, and some libraries may not be able to easily decode it - tenantEntry["prefix"] = tenant.second.prefix.toString(); - } - std::string tenantEntryString = json_spirit::write_string(json_spirit::mValue(tenantEntry)); - ValueRef tenantEntryBytes(results->arena(), tenantEntryString); + std::string jsonString = tenant.second.toJson(); + ValueRef tenantEntryBytes(results->arena(), jsonString); results->push_back(results->arena(), KeyValueRef(withTenantMapPrefix(tenant.first, results->arena()), tenantEntryBytes)); } @@ -120,25 +103,108 @@ private: return results; } - ACTOR static Future createTenants(ReadYourWritesTransaction* ryw, std::vector tenants) { - int64_t _nextId = wait(TenantAPI::getNextTenantId(&ryw->getTransaction())); - int64_t nextId = _nextId; + // Returns true if the tenant was created, false if it already existed + ACTOR static Future createTenant( + ReadYourWritesTransaction* ryw, + TenantNameRef tenantName, + std::vector, Optional>> configMutations, + int64_t tenantId, + std::map* tenantGroupNetTenantDelta) { + state TenantMapEntry tenantEntry; + tenantEntry.setId(tenantId); + tenantEntry.encrypted = ryw->getTransactionState()->cx->clientInfo->get().isEncryptionEnabled; - std::vector> createFutures; - for (auto tenant : tenants) { - createFutures.push_back( - success(TenantAPI::createTenantTransaction(&ryw->getTransaction(), tenant, nextId++))); + for (auto const& [name, value] : configMutations) { + tenantEntry.configure(name, value); } - ryw->getTransaction().set(tenantLastIdKey, TenantMapEntry::idToPrefix(nextId - 1)); + if (tenantEntry.tenantGroup.present()) { + (*tenantGroupNetTenantDelta)[tenantEntry.tenantGroup.get()]++; + } + + std::pair, 
bool> entry = + wait(TenantAPI::createTenantTransaction(&ryw->getTransaction(), tenantName, tenantEntry)); + + return entry.second; + } + + ACTOR static Future createTenants( + ReadYourWritesTransaction* ryw, + std::map, Optional>>> tenants, + std::map* tenantGroupNetTenantDelta) { + state Future tenantCountFuture = + TenantMetadata::tenantCount().getD(&ryw->getTransaction(), Snapshot::False, 0); + int64_t _nextId = wait(TenantAPI::getNextTenantId(&ryw->getTransaction())); + state int64_t nextId = _nextId; + + state std::vector> createFutures; + for (auto const& [tenant, config] : tenants) { + createFutures.push_back(createTenant(ryw, tenant, config, nextId++, tenantGroupNetTenantDelta)); + } + + TenantMetadata::lastTenantId().set(&ryw->getTransaction(), nextId - 1); wait(waitForAll(createFutures)); + + state int numCreatedTenants = 0; + for (auto f : createFutures) { + if (f.get()) { + ++numCreatedTenants; + } + } + + // Check the tenant count here rather than rely on the createTenantTransaction check because we don't have RYW + int64_t tenantCount = wait(tenantCountFuture); + if (tenantCount + numCreatedTenants > CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER) { + throw cluster_no_capacity(); + } + + return Void(); + } + + ACTOR static Future changeTenantConfig( + ReadYourWritesTransaction* ryw, + TenantName tenantName, + std::vector, Optional>> configEntries, + std::map* tenantGroupNetTenantDelta) { + TenantMapEntry originalEntry = wait(TenantAPI::getTenantTransaction(&ryw->getTransaction(), tenantName)); + TenantMapEntry updatedEntry = originalEntry; + for (auto const& [name, value] : configEntries) { + updatedEntry.configure(name, value); + } + + if (originalEntry.tenantGroup != updatedEntry.tenantGroup) { + if (originalEntry.tenantGroup.present()) { + (*tenantGroupNetTenantDelta)[originalEntry.tenantGroup.get()]--; + } + if (updatedEntry.tenantGroup.present()) { + (*tenantGroupNetTenantDelta)[updatedEntry.tenantGroup.get()]++; + } + } + + wait(TenantAPI::configureTenantTransaction(&ryw->getTransaction(), tenantName, originalEntry, updatedEntry)); + return Void(); + } + + ACTOR static Future deleteSingleTenant(ReadYourWritesTransaction* ryw, + TenantName tenantName, + std::map* tenantGroupNetTenantDelta) { + state Optional tenantEntry = + wait(TenantAPI::tryGetTenantTransaction(&ryw->getTransaction(), tenantName)); + if (tenantEntry.present()) { + wait(TenantAPI::deleteTenantTransaction(&ryw->getTransaction(), tenantName)); + if (tenantEntry.get().tenantGroup.present()) { + (*tenantGroupNetTenantDelta)[tenantEntry.get().tenantGroup.get()]--; + } + } + return Void(); } ACTOR static Future deleteTenantRange(ReadYourWritesTransaction* ryw, TenantName beginTenant, - TenantName endTenant) { - state std::map tenants = wait( + TenantName endTenant, + std::map* tenantGroupNetTenantDelta) { + state std::vector> tenants = wait( TenantAPI::listTenantsTransaction(&ryw->getTransaction(), beginTenant, endTenant, CLIENT_KNOBS->TOO_MANY)); if (tenants.size() == CLIENT_KNOBS->TOO_MANY) { @@ -153,15 +219,41 @@ private: std::vector> deleteFutures; for (auto tenant : tenants) { deleteFutures.push_back(TenantAPI::deleteTenantTransaction(&ryw->getTransaction(), tenant.first)); + if (tenant.second.tenantGroup.present()) { + (*tenantGroupNetTenantDelta)[tenant.second.tenantGroup.get()]--; + } } + wait(waitForAll(deleteFutures)); + return Void(); + } + + // Check if the number of tenants in the tenant group is equal to the net reduction in the number of tenants. + // If it is, then we can delete the tenant group. 
+ ACTOR static Future checkAndRemoveTenantGroup(ReadYourWritesTransaction* ryw, + TenantGroupName tenantGroup, + int tenantDelta) { + ASSERT(tenantDelta < 0); + state int removedTenants = -tenantDelta; + KeyBackedSet::RangeResultType tenantsInGroup = + wait(TenantMetadata::tenantGroupTenantIndex().getRange(&ryw->getTransaction(), + Tuple::makeTuple(tenantGroup), + Tuple::makeTuple(keyAfter(tenantGroup)), + removedTenants + 1)); + + ASSERT(tenantsInGroup.results.size() >= removedTenants); + if (tenantsInGroup.results.size() == removedTenants) { + TenantMetadata::tenantGroupMap().erase(&ryw->getTransaction(), tenantGroup); + } return Void(); } public: - const static KeyRangeRef submoduleRange; - const static KeyRangeRef mapSubRange; + const inline static KeyRangeRef submoduleRange = KeyRangeRef("tenant/"_sr, "tenant0"_sr); + const inline static KeyRangeRef mapSubRange = KeyRangeRef("map/"_sr, "map0"_sr); + const inline static KeyRangeRef configureSubRange = KeyRangeRef("configure/"_sr, "configure0"_sr); + const inline static KeyRangeRef renameSubRange = KeyRangeRef("rename/"_sr, "rename0"_sr); explicit TenantRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {} @@ -171,18 +263,33 @@ public: return getTenantRange(ryw, kr, limitsHint); } - Future> commit(ReadYourWritesTransaction* ryw) override { - auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(range); - std::vector> tenantManagementFutures; + ACTOR static Future> commitImpl(TenantRangeImpl* self, ReadYourWritesTransaction* ryw) { + state std::vector> tenantManagementFutures; - std::vector>> mapMutations; + // This map is an ugly workaround to the fact that we cannot use RYW in these transactions. + // It tracks the net change to the number of tenants in a tenant group, and at the end we can compare + // that with how many tenants the tenant group started with. If we removed all of the tenants, then we + // delete the tenant group. + // + // SOMEDAY: enable RYW support in special keys and remove this complexity. 
+ state std::map tenantGroupNetTenantDelta; + + state KeyRangeMap>>::Ranges ranges = + ryw->getSpecialKeySpaceWriteMap().containedRanges(self->range); + + state std::vector>> mapMutations; + state std::map, Optional>>> configMutations; + state std::set renameSet; + state std::vector> renameMutations; + + tenantManagementFutures.push_back(TenantAPI::checkTenantMode(&ryw->getTransaction(), ClusterType::STANDALONE)); for (auto range : ranges) { if (!range.value().first) { continue; } - KeyRangeRef adjustedRange = + state KeyRangeRef adjustedRange = range.range() .removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) .removePrefix(submoduleRange.begin); @@ -191,31 +298,104 @@ public: adjustedRange = mapSubRange & adjustedRange; adjustedRange = removePrefix(adjustedRange, mapSubRange.begin, "\xff"_sr); mapMutations.push_back(std::make_pair(adjustedRange, range.value().second)); + } else if (configureSubRange.intersects(adjustedRange) && adjustedRange.singleKeyRange()) { + StringRef configTupleStr = adjustedRange.begin.removePrefix(configureSubRange.begin); + try { + Tuple tuple = Tuple::unpack(configTupleStr); + if (tuple.size() != 2) { + throw invalid_tuple_index(); + } + configMutations[tuple.getString(0)].push_back( + std::make_pair(tuple.getString(1), range.value().second)); + } catch (Error& e) { + TraceEvent(SevWarn, "InvalidTenantConfigurationKey").error(e).detail("Key", adjustedRange.begin); + ryw->setSpecialKeySpaceErrorMsg(ManagementAPIError::toJsonString( + false, "configure tenant", "invalid tenant configuration key")); + throw special_keys_api_failure(); + } + } else if (renameSubRange.intersects(adjustedRange)) { + StringRef oldName = adjustedRange.begin.removePrefix(renameSubRange.begin); + StringRef newName = range.value().second.get(); + // Do not allow overlapping renames in the same commit + // e.g. 
A->B + B->C, D->D + if (renameSet.count(oldName) || renameSet.count(newName) || oldName == newName) { + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "rename tenant", "tenant rename conflict")); + throw special_keys_api_failure(); + } + renameSet.insert(oldName); + renameSet.insert(newName); + renameMutations.push_back(std::make_pair(oldName, newName)); } } - std::vector tenantsToCreate; + std::map, Optional>>> tenantsToCreate; for (auto mapMutation : mapMutations) { TenantNameRef tenantName = mapMutation.first.begin; + auto set_iter = renameSet.lower_bound(tenantName); + if (set_iter != renameSet.end() && mapMutation.first.contains(*set_iter)) { + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "rename tenant", "tenant rename conflict")); + throw special_keys_api_failure(); + } if (mapMutation.second.present()) { - tenantsToCreate.push_back(tenantName); + std::vector, Optional>> createMutations; + auto itr = configMutations.find(tenantName); + if (itr != configMutations.end()) { + createMutations = itr->second; + configMutations.erase(itr); + } + tenantsToCreate[tenantName] = createMutations; } else { // For a single key clear, just issue the delete if (mapMutation.first.singleKeyRange()) { - tenantManagementFutures.push_back( - TenantAPI::deleteTenantTransaction(&ryw->getTransaction(), tenantName)); + tenantManagementFutures.push_back(deleteSingleTenant(ryw, tenantName, &tenantGroupNetTenantDelta)); + + // Configuration changes made to a deleted tenant are discarded + configMutations.erase(tenantName); } else { - tenantManagementFutures.push_back(deleteTenantRange(ryw, tenantName, mapMutation.first.end)); + tenantManagementFutures.push_back( + deleteTenantRange(ryw, tenantName, mapMutation.first.end, &tenantGroupNetTenantDelta)); + + // Configuration changes made to a deleted tenant are discarded + configMutations.erase(configMutations.lower_bound(tenantName), + configMutations.lower_bound(mapMutation.first.end)); } } } if (!tenantsToCreate.empty()) { - tenantManagementFutures.push_back(createTenants(ryw, tenantsToCreate)); + tenantManagementFutures.push_back(createTenants(ryw, tenantsToCreate, &tenantGroupNetTenantDelta)); + } + for (auto configMutation : configMutations) { + if (renameSet.count(configMutation.first)) { + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "rename tenant", "tenant rename conflict")); + throw special_keys_api_failure(); + } + tenantManagementFutures.push_back( + changeTenantConfig(ryw, configMutation.first, configMutation.second, &tenantGroupNetTenantDelta)); } - return tag(waitForAll(tenantManagementFutures), Optional()); + for (auto renameMutation : renameMutations) { + tenantManagementFutures.push_back(TenantAPI::renameTenantTransaction( + &ryw->getTransaction(), renameMutation.first, renameMutation.second)); + } + + wait(waitForAll(tenantManagementFutures)); + + state std::vector> tenantGroupUpdateFutures; + for (auto [tenantGroup, count] : tenantGroupNetTenantDelta) { + if (count < 0) { + tenantGroupUpdateFutures.push_back(checkAndRemoveTenantGroup(ryw, tenantGroup, count)); + } + } + + wait(waitForAll(tenantGroupUpdateFutures)); + return Optional(); } + + Future> commit(ReadYourWritesTransaction* ryw) override { return commitImpl(this, ryw); } }; #include "flow/unactorcompiler.h" diff --git a/fdbclient/include/fdbclient/ThreadSafeTransaction.h b/fdbclient/include/fdbclient/ThreadSafeTransaction.h index 875664ea76..0d7c6f608d 100644 --- 
a/fdbclient/include/fdbclient/ThreadSafeTransaction.h +++ b/fdbclient/include/fdbclient/ThreadSafeTransaction.h @@ -20,6 +20,7 @@ #ifndef FDBCLIENT_THREADSAFETRANSACTION_H #define FDBCLIENT_THREADSAFETRANSACTION_H +#include "flow/ApiVersion.h" #include "flow/ProtocolVersion.h" #pragma once @@ -62,6 +63,13 @@ public: ThreadFuture purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override; ThreadFuture waitPurgeGranulesComplete(const KeyRef& purgeKey) override; + ThreadFuture blobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture unblobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture>> listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; + + ThreadFuture verifyBlobRange(const KeyRangeRef& keyRange, Optional version) override; + ThreadFuture createSharedState() override; void setSharedState(DatabaseSharedState* p) override; @@ -72,7 +80,8 @@ private: DatabaseContext* db; public: // Internal use only - ThreadSafeDatabase(Reference connectionRecord, int apiVersion); + enum class ConnectionRecordType { FILE, CONNECTION_STRING }; + ThreadSafeDatabase(ConnectionRecordType connectionRecordType, std::string connectionRecord, int apiVersion); ThreadSafeDatabase(DatabaseContext* db) : db(db) {} DatabaseContext* unsafeGetPtr() const { return db; } }; @@ -148,13 +157,30 @@ public: ThreadFuture>> getRangeSplitPoints(const KeyRangeRef& range, int64_t chunkSize) override; - ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange) override; + ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; ThreadResult readBlobGranules(const KeyRangeRef& keyRange, Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) override; + ThreadFuture>> readBlobGranulesStart(const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) override; + + ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) override; + + ThreadFuture>> summarizeBlobGranules(const KeyRangeRef& keyRange, + Optional summaryVersion, + int rangeLimit) override; + void addReadConflictRange(const KeyRangeRef& keys) override; void makeSelfConflicting(); @@ -205,6 +231,7 @@ class ThreadSafeApi : public IClientApi, ThreadSafeReferenceCounted value = Optional()) override; void setupNetwork() override; @@ -220,8 +247,8 @@ private: friend IClientApi* getLocalClientAPI(); ThreadSafeApi(); - int apiVersion; - const std::string clientVersion; + ApiVersion apiVersion; + std::string clientVersion; uint64_t transportId; Mutex lock; diff --git a/fdbclient/include/fdbclient/Tracing.h b/fdbclient/include/fdbclient/Tracing.h index 30fa210e3f..789b346dfd 100644 --- a/fdbclient/include/fdbclient/Tracing.h +++ b/fdbclient/include/fdbclient/Tracing.h @@ -50,8 +50,7 @@ struct SpanContext { SpanContext() : traceID(UID()), spanID(0), m_Flags(TraceFlags::unsampled) {} SpanContext(UID traceID, uint64_t spanID, TraceFlags flags) : traceID(traceID), spanID(spanID), m_Flags(flags) {} SpanContext(UID traceID, uint64_t spanID) : traceID(traceID), spanID(spanID), m_Flags(TraceFlags::unsampled) {} - SpanContext(Arena arena, const SpanContext& span) - : traceID(span.traceID), spanID(span.spanID), m_Flags(span.m_Flags) {} + SpanContext(const SpanContext& span) = default; bool isSampled() const { return (m_Flags & TraceFlags::sampled) == TraceFlags::sampled; } 
diff --git a/fdbclient/include/fdbclient/Tracing.h b/fdbclient/include/fdbclient/Tracing.h
index 30fa210e3f..789b346dfd 100644
--- a/fdbclient/include/fdbclient/Tracing.h
+++ b/fdbclient/include/fdbclient/Tracing.h
@@ -50,8 +50,7 @@ struct SpanContext {
     SpanContext() : traceID(UID()), spanID(0), m_Flags(TraceFlags::unsampled) {}
     SpanContext(UID traceID, uint64_t spanID, TraceFlags flags) : traceID(traceID), spanID(spanID), m_Flags(flags) {}
     SpanContext(UID traceID, uint64_t spanID) : traceID(traceID), spanID(spanID), m_Flags(TraceFlags::unsampled) {}
-    SpanContext(Arena arena, const SpanContext& span)
-      : traceID(span.traceID), spanID(span.spanID), m_Flags(span.m_Flags) {}
+    SpanContext(const SpanContext& span) = default;
     bool isSampled() const { return (m_Flags & TraceFlags::sampled) == TraceFlags::sampled; }

     std::string toString() const { return format("%016llx%016llx%016llx", traceID.first(), traceID.second(), spanID); };
     bool isValid() const { return traceID.first() != 0 && traceID.second() != 0 && spanID != 0; }
@@ -62,6 +61,9 @@ struct SpanContext {
     }
 };

+template <>
+struct flow_ref<SpanContext> : std::false_type {};
+
 // Span
 //
 // Span is a tracing implementation which, for the most part, complies with the W3C Trace Context specification
@@ -155,7 +157,7 @@ public:
     // We've determined for initial tracing release, spans with only a location will not be traced.
     // Generally these are for background processes, some are called infrequently, while others may be high volume.
     // TODO: review and address in subsequent PRs.
-    Span(const Location& location) : location(location), begin(g_network->now()) {}
+    explicit Span(const Location& location) : Span(location, SpanContext()) {}

     Span(const Span&) = delete;
     Span(Span&& o) {
diff --git a/fdbclient/include/fdbclient/Tuple.h b/fdbclient/include/fdbclient/Tuple.h
index 4c52c1ebc6..bf997e4309 100644
--- a/fdbclient/include/fdbclient/Tuple.h
+++ b/fdbclient/include/fdbclient/Tuple.h
@@ -28,6 +28,19 @@
 #include "fdbclient/Versionstamp.h"

 struct Tuple {
+    struct UnicodeStr {
+        StringRef str;
+        explicit UnicodeStr(StringRef str) : str(str) {}
+    };
+
+    struct UserTypeStr {
+        uint8_t code;
+        Standalone<StringRef> str;
+        UserTypeStr(uint8_t code, StringRef str) : code(code), str(str) {}
+
+        bool operator==(const UserTypeStr& other) const { return (code == other.code && str == other.str); }
+    };
+
     Tuple() {}

     // Tuple parsing normally does not care of the final value is a numeric type and is incomplete.
@@ -35,35 +48,44 @@ struct Tuple {
     // Note that strings can't be incomplete because they are parsed such that the end of the packed
     // byte string is considered the end of the string in lieu of a specific end.
     static Tuple unpack(StringRef const& str, bool exclude_incomplete = false);
+    static Tuple unpackUserType(StringRef const& str, bool exclude_incomplete = false);

     Tuple& append(Tuple const& tuple);
     // the str needs to be a Tuple encoded string.
     Tuple& appendRaw(StringRef const& str);
     Tuple& append(StringRef const& str, bool utf8 = false);
+    Tuple& append(UnicodeStr const& str);
+    Tuple& append(int32_t);
     Tuple& append(int64_t);
-    // There are some ambiguous append calls in fdbclient, so to make it easier
-    // to add append for floats and doubles, name them differently for now.
-    Tuple& appendBool(bool);
-    Tuple& appendFloat(float);
-    Tuple& appendDouble(double);
+    Tuple& append(bool);
+    Tuple& append(float);
+    Tuple& append(double);
+    Tuple& append(std::nullptr_t);
     Tuple& appendNull();
-    Tuple& appendVersionstamp(Versionstamp const&);
+    Tuple& append(Versionstamp const&);
+    Tuple& append(UserTypeStr const&);

-    StringRef pack() const { return StringRef(data.begin(), data.size()); }
+    Standalone<StringRef> pack() const {
+        return Standalone<StringRef>(StringRef(data.begin(), data.size()), data.arena());
+    }

     template <typename T>
     Tuple& operator<<(T const& t) {
         return append(t);
     }

-    enum ElementType { NULL_TYPE, INT, BYTES, UTF8, BOOL, FLOAT, DOUBLE, VERSIONSTAMP };
+    enum ElementType { NULL_TYPE, INT, BYTES, UTF8, BOOL, FLOAT, DOUBLE, VERSIONSTAMP, USER_TYPE };
+
+    bool isUserType(uint8_t code) const;

     // this is number of elements, not length of data
     size_t size() const { return offsets.size(); }
     void reserve(size_t cap) { offsets.reserve(cap); }
     void clear() {
-        data.clear();
+        // Make a new Standalone to use different memory so that
+        // previously returned objects from pack() are valid.
+        data = Standalone<VectorRef<uint8_t>>();
         offsets.clear();
     }
     // Return a Tuple encoded raw string.
@@ -75,6 +97,7 @@ struct Tuple {
     bool getBool(size_t index) const;
     float getFloat(size_t index) const;
     double getDouble(size_t index) const;
+    Tuple::UserTypeStr getUserType(size_t index) const;

     KeyRange range(Tuple const& tuple = Tuple()) const;
@@ -84,8 +107,20 @@ struct Tuple {
     Standalone<VectorRef<uint8_t>> getData() { return data; }
     Standalone<StringRef> getDataAsStandalone() { return Standalone<StringRef>(pack(), data.arena()); }

+    // Create a tuple from a parameter pack
+    template <class... Types>
+    static Tuple makeTuple(Types&&... args) {
+        Tuple t;
+
+        // Use a fold expression to append each argument using the << operator.
+        // https://en.cppreference.com/w/cpp/language/fold
+        (t << ... << std::forward<Types>(args));
+
+        return t;
+    }
+
 private:
-    Tuple(const StringRef& data, bool exclude_incomplete = false);
+    Tuple(const StringRef& data, bool exclude_incomplete = false, bool exclude_user_type = false);

     Standalone<VectorRef<uint8_t>> data;
     std::vector<size_t> offsets;
 };
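The makeTuple helper added above folds its arguments into successive append() calls through operator<<, and pack() now returns a Standalone<StringRef> tied to the tuple's arena, so packed bytes stay valid after clear() swaps in fresh memory. A small usage sketch (not part of this patch); the literal values and the getInt accessor are illustrative assumptions:

#include "fdbclient/Tuple.h"

// Illustration only: build, pack, and re-parse a tuple with the new interfaces.
void tupleExample() {
    // Each argument is appended via operator<<, thanks to the fold expression in makeTuple.
    Tuple t = Tuple::makeTuple((int64_t)42, "hello"_sr, 3.14, true);

    // pack() returns a Standalone<StringRef> referencing the tuple's arena; clear()
    // replaces the tuple's memory, so 'packed' remains valid afterwards.
    Standalone<StringRef> packed = t.pack();
    t.clear();

    Tuple decoded = Tuple::unpack(packed);
    int64_t first = decoded.getInt(0);   // 42
    double third = decoded.getDouble(2); // 3.14
    (void)first;
    (void)third;
}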
diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options
index 57ae180a07..8df1bcc150 100644
--- a/fdbclient/vexillographer/fdb.options
+++ b/fdbclient/vexillographer/fdb.options
@@ -57,6 +57,8 @@ description is not currently required but encouraged.