Merge remote-tracking branch 'origin/main' into flow-enhancements

sfc-gh-tclinkenbeard 2022-02-15 15:56:42 -08:00
commit a32547cbbe
221 changed files with 12698 additions and 9672 deletions

.gitignore vendored

@ -9,8 +9,7 @@ bindings/java/foundationdb-tests*.jar
bindings/java/fdb-java-*-sources.jar
packaging/msi/FDBInstaller.msi
build/
cmake-build-debug/
cmake-build-release/
cmake-build-*/
# Generated source, build, and packaging files
*.g.cpp


@ -30,8 +30,6 @@ At project launch, FoundationDB has a light governance structure. The intention
We draw inspiration from the Apache Software Foundation's informal motto: ["community over code"](https://blogs.apache.org/foundation/entry/asf_15_community_over_code), and their emphasis on meritocratic rules. You'll also observe that some initial community structure is [inspired by the Swift community](https://swift.org/community/#community-structure).
The project technical lead is Evan Tschannen (ejt@apple.com).
Members of the Apple FoundationDB team are part of the core committers helping review individual contributions; you'll see them commenting on your pull requests. As the FDB open source community has grown, some members of the community have consistently produced high quality code reviews and other significant contributions to FoundationDB. The project technical lead maintains a list of external committers that actively contribute in this way, and gives them permission to review and merge pull requests.
## Contributing


@ -168,32 +168,17 @@ $SRCDIR/packaging/osx/buildpkg.sh . $SRCDIR
### Windows
Under Windows, the build instructions are very similar, with the main difference
that Visual Studio is used to compile.
Under Windows, only Visual Studio with ClangCl is supported.
1. Install Visual Studio 2017 (Community Edition is tested)
1. Install cmake Version 3.12 or higher [CMake](https://cmake.org/)
1. Download version 1.72 of [Boost](https://boostorg.jfrog.io/artifactory/main/release/1.72.0/source/boost_1_72_0.tar.bz2)
1. Unpack boost (you don't need to compile it)
1. Install [Mono](http://www.mono-project.com/download/stable/)
1. (Optional) Install a [JDK](http://www.oracle.com/technetwork/java/javase/downloads/index.html). FoundationDB currently builds with Java 8
1. Set `JAVA_HOME` to the unpacked location and JAVA_COMPILE to
`$JAVA_HOME/bin/javac`.
1. Install [Python](https://www.python.org/downloads/) if it is not already installed by Visual Studio
1. (Optional) Install [WIX](http://wixtoolset.org/). Without it Visual Studio
won't build the Windows installer
1. Create a build directory (you can have the build directory anywhere you
like): `mkdir build`
1. `cd build`
1. `cmake -G "Visual Studio 15 2017 Win64" -DBOOST_ROOT=<PATH_TO_BOOST> <PATH_TO_FOUNDATIONDB_DIRECTORY>`
1. This should succeed. In which case you can build using msbuild:
`msbuild /p:Configuration=Release foundationdb.sln`. You can also open the resulting
solution in Visual Studio and compile from there. However, be aware that
using Visual Studio for development is currently not supported as Visual
Studio will only know about the generated files. `msbuild` is located at
`c:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe` for Visual Studio 15.
If you installed WIX before running `cmake` you should find the
`FDBInstaller.msi` in your build directory under `packaging/msi`.
TODO: Re-add instructions for TLS support [#3022](https://github.com/apple/foundationdb/issues/3022)
1. Install Visual Studio 2019 (IDE or Build Tools), and enable LLVM support
1. Install [CMake 3.15](https://cmake.org/) or higher
1. Download [Boost 1.77.0](https://boostorg.jfrog.io/artifactory/main/release/1.77.0/source/boost_1_77_0.7z)
1. Unpack boost to C:\boost, or use `-DBOOST_ROOT=<PATH_TO_BOOST>` with `cmake` if unpacked elsewhere
1. Install [Python](https://www.python.org/downloads/) if it is not already installed by Visual Studio
1. (Optional) Install [OpenJDK 11](https://developers.redhat.com/products/openjdk/download) to build Java bindings
1. (Optional) Install [OpenSSL 3.x](https://slproweb.com/products/Win32OpenSSL.html) to build with TLS support
1. (Optional) Install [WIX Toolset](http://wixtoolset.org/) to build Windows installer
1. `mkdir build && cd build`
1. `cmake -G "Visual Studio 16 2019" -A x64 -T ClangCl <PATH_TO_FOUNDATIONDB_SOURCE>`
1. `msbuild /p:Configuration=Release foundationdb.sln`
1. To increase build performance, use `/p:UseMultiToolTask=true` and `/p:CL_MPCount=<NUMBER_OF_PARALLEL_JOBS>`
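Taken together, a full configure-and-build session might look like the following sketch (the paths and the parallel job count are illustrative):

```
mkdir build && cd build
cmake -G "Visual Studio 16 2019" -A x64 -T ClangCl -DBOOST_ROOT=C:\boost <PATH_TO_FOUNDATIONDB_SOURCE>
msbuild /p:Configuration=Release /p:UseMultiToolTask=true /p:CL_MPCount=8 foundationdb.sln
```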


@ -829,6 +829,10 @@ retryTxn:
}
docommit = 1;
break;
case OP_OVERWRITE:
rc = run_op_insert(transaction, keystr, valstr);
docommit = 1;
break;
case OP_CLEAR:
rc = run_op_clear(transaction, keystr);
docommit = 1;
@ -1212,6 +1216,9 @@ void get_stats_file_name(char filename[], int worker_id, int thread_id, int op)
case OP_INSERTRANGE:
strcat(filename, "INSERTRANGE");
break;
case OP_OVERWRITE:
strcat(filename, "OVERWRITE");
break;
case OP_CLEAR:
strcat(filename, "CLEAR");
break;
@ -1502,7 +1509,7 @@ int worker_process_main(mako_args_t* args, int worker_id, mako_shmhdr_t* shm, pi
/*** let's party! ***/
/* set up cluster and datbase for workder threads */
/* set up cluster and database for worker threads */
#if FDB_API_VERSION < 610
/* cluster */
@ -1704,6 +1711,9 @@ int parse_transaction(mako_args_t* args, char* optarg) {
} else if (strncmp(ptr, "i", 1) == 0) {
op = OP_INSERT;
ptr++;
} else if (strncmp(ptr, "o", 1) == 0) {
op = OP_OVERWRITE;
ptr++;
} else if (strncmp(ptr, "cr", 2) == 0) {
op = OP_CLEARRANGE;
rangeop = 1;
@ -2107,6 +2117,8 @@ char* get_ops_name(int ops_code) {
return "INSERT";
case OP_INSERTRANGE:
return "INSERTRANGE";
case OP_OVERWRITE:
return "OVERWRITE";
case OP_CLEAR:
return "CLEAR";
case OP_SETCLEAR:
@ -2891,6 +2903,8 @@ int main(int argc, char* argv[]) {
mako_stats_t* stats;
pid_t pid_main;
setlinebuf(stdout);
rc = init_args(&args);
if (rc < 0) {
fprintf(stderr, "ERROR: init_args failed\n");


@ -45,6 +45,7 @@ enum Operations {
OP_UPDATE,
OP_INSERT,
OP_INSERTRANGE,
OP_OVERWRITE,
OP_CLEAR,
OP_SETCLEAR,
OP_CLEARRANGE,


@ -138,6 +138,7 @@ Operation Types
- ``u`` Update (= GET followed by SET)
- ``i`` Insert (= SET with a new key)
- ``ir`` Insert Range (Sequential)
- ``o`` Overwrite (Blind write to existing keys)
- ``c`` CLEAR
- ``sc`` SET & CLEAR
- ``cr`` CLEAR RANGE
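For illustration, these codes combine into mako's transaction-specification string; the sketch below (counts are arbitrary, not taken from this commit) would perform ten GETs, five blind overwrites, and one CLEAR per transaction::

    g10o5c1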


@ -72,7 +72,7 @@ int main(int argc, char** argv) {
// Apparently you need to open a database to initialize logging
FDBDatabase* out;
fdb_check(fdb_create_database(nullptr, &out));
fdb_check(fdb_create_database(argv[1], &out));
fdb_database_destroy(out);
// Eventually there's a new trace file for this test ending in .tmp


@ -69,7 +69,7 @@ add_custom_target(go_options_file DEPENDS ${go_options_file})
add_dependencies(go_options_file copy_go_sources)
function(build_go_package)
set(options LIBRARY EXECUTABLE)
set(options LIBRARY EXECUTABLE INCLUDE_TEST)
set(oneValueArgs NAME PATH)
set(multiValueArgs)
cmake_parse_arguments(BGP "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
@ -106,12 +106,30 @@ function(build_go_package)
DEPENDS ${fdb_options_file}
COMMENT "Building ${BGP_NAME}")
add_custom_target(${BGP_NAME} ALL DEPENDS ${outfile})
if(BGP_INCLUDE_TEST)
set(testfile ${CMAKE_CURRENT_BINARY_DIR}/${BGP_NAME}_test)
add_custom_command(OUTPUT ${testfile}
COMMAND ${CMAKE_COMMAND} -E env ${go_env}
${GO_EXECUTABLE} test -c ${GO_IMPORT_PATH}/${BGP_PATH} -o ${testfile}
DEPENDS ${fdb_options_file} fdb_c ${BGP_NAME}
COMMENT "Building ${BGP_NAME} test")
add_custom_target(${BGP_NAME}_test_target ALL DEPENDS ${testfile})
set(library_path LD_LIBRARY_PATH)
if (APPLE)
set(library_path DYLD_LIBRARY_PATH)
endif()
add_fdbclient_test(
NAME ${BGP_PATH}_go_test
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src/${BGP_PATH}
COMMAND ${CMAKE_COMMAND} -E env ${library_path}=${CMAKE_BINARY_DIR}/lib ${testfile}
)
endif()
endfunction()
build_go_package(LIBRARY NAME fdb_go PATH fdb)
build_go_package(LIBRARY NAME fdb_go PATH fdb INCLUDE_TEST)
add_dependencies(fdb_go fdb_c go_options_file)
build_go_package(LIBRARY NAME tuple_go PATH fdb/tuple)
build_go_package(LIBRARY NAME tuple_go PATH fdb/tuple INCLUDE_TEST)
add_dependencies(tuple_go fdb_go)
build_go_package(LIBRARY NAME subspace_go PATH fdb/subspace)


@ -22,11 +22,12 @@ package main
import (
"bytes"
"strings"
"github.com/apple/foundationdb/bindings/go/src/fdb"
"github.com/apple/foundationdb/bindings/go/src/fdb/directory"
"github.com/apple/foundationdb/bindings/go/src/fdb/subspace"
"github.com/apple/foundationdb/bindings/go/src/fdb/tuple"
"strings"
)
func (sm *StackMachine) popTuples(count int) []tuple.Tuple {


@ -25,10 +25,11 @@ package directory
import (
"bytes"
"encoding/binary"
"github.com/apple/foundationdb/bindings/go/src/fdb"
"github.com/apple/foundationdb/bindings/go/src/fdb/subspace"
"math/rand"
"sync"
"github.com/apple/foundationdb/bindings/go/src/fdb"
"github.com/apple/foundationdb/bindings/go/src/fdb/subspace"
)
var oneBytes = []byte{0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}


@ -25,6 +25,7 @@ package directory
import (
"fmt"
"strings"
"github.com/apple/foundationdb/bindings/go/src/fdb"
"github.com/apple/foundationdb/bindings/go/src/fdb/subspace"
)


@ -24,6 +24,7 @@ package directory
import (
"bytes"
"github.com/apple/foundationdb/bindings/go/src/fdb"
"github.com/apple/foundationdb/bindings/go/src/fdb/subspace"
)


@ -97,6 +97,19 @@ func TestVersionstamp(t *testing.T) {
t.Logf("setOne returned %s", k)
}
func TestReadTransactionOptions(t *testing.T) {
fdb.MustAPIVersion(710)
db := fdb.MustOpenDefault()
_, e := db.ReadTransact(func(rtr fdb.ReadTransaction) (interface{}, error) {
rtr.Options().SetAccessSystemKeys()
return rtr.Get(fdb.Key("\xff/")).MustGet(), nil
})
if e != nil {
t.Errorf("Failed to read system key: %s", e)
}
}
func ExampleTransactor() {
fdb.MustAPIVersion(710)
db := fdb.MustOpenDefault()


@ -363,7 +363,7 @@ func (o DatabaseOptions) SetTransactionCausalReadRisky() error {
return o.setOpt(504, nil)
}
// Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect.
// Deprecated. Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect.
func (o DatabaseOptions) SetTransactionIncludePortInAddress() error {
return o.setOpt(505, nil)
}


@ -22,10 +22,6 @@
package fdb
// #define FDB_API_VERSION 710
// #include <foundationdb/fdb_c.h>
import "C"
import (
"fmt"
)


@ -108,3 +108,9 @@ func (s Snapshot) GetRangeSplitPoints(r ExactRange, chunkSize int64) FutureKeyAr
chunkSize,
)
}
// Options returns the transaction options, allowing Snapshot to satisfy
// the ReadTransaction interface.
func (s Snapshot) Options() TransactionOptions {
	return TransactionOptions{s.transaction}
}


@ -41,6 +41,7 @@ type ReadTransaction interface {
Snapshot() Snapshot
GetEstimatedRangeSizeBytes(r ExactRange) FutureInt64
GetRangeSplitPoints(r ExactRange, chunkSize int64) FutureKeyArray
Options() TransactionOptions
ReadTransactor
}
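With `Options()` now part of the interface, helpers written against `ReadTransaction` can set read options without knowing whether they hold a `Transaction` or a `Snapshot`. A minimal sketch (the helper is illustrative, mirroring the new test above):

func readSystemKey(rt fdb.ReadTransaction) ([]byte, error) {
	// Works for both Transaction and Snapshot receivers now that
	// Options() is part of the ReadTransaction interface.
	rt.Options().SetAccessSystemKeys()
	return rt.Get(fdb.Key("\xff/")).Get()
}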


@ -102,7 +102,7 @@ func printTuple(tuple Tuple, sb *strings.Builder) {
fmt.Fprintf(sb, "%v", t)
}
if (i < len(tuple) - 1) {
if i < len(tuple)-1 {
sb.WriteString(", ")
}
}


@ -1580,6 +1580,8 @@ def init(event_model=None):
_network_thread = NetworkThread()
_network_thread.daemon = True
# may not set actual underlying OS thread name
_network_thread.name = "fdb-network-thread"
if event_model is not None:
if event_model == 'gevent':


@ -1,6 +1,7 @@
#!/usr/bin/env python3
import sys
import shutil
import os
import subprocess
import logging
@ -146,6 +147,23 @@ def setclass(logger):
assert 'set_class' in line
# set back to unset
run_fdbcli_command('setclass', random_address, 'unset')
# Attempt to set an invalid address and check error message
output3 = run_fdbcli_command('setclass', '0.0.0.0:4000', 'storage')
logger.debug(output3)
assert 'No matching addresses found' in output3
# Verify setclass did not execute
output4 = run_fdbcli_command('setclass')
logger.debug(output4)
# except for the first line, each line is one process
process_types = output4.split('\n')[1:]
assert len(process_types) == args.process_number
addresses = []
for line in process_types:
assert '127.0.0.1' in line
# check class type
assert 'unset' in line
# check class source
assert 'command_line' in line or 'set_class' in line
@enable_logging()
@ -199,6 +217,9 @@ def kill(logger):
@enable_logging()
def suspend(logger):
if not shutil.which("pidof"):
logger.debug("Skipping suspend test. Pidof not available")
return
output1 = run_fdbcli_command('suspend')
lines = output1.split('\n')
assert len(lines) == 2
@ -569,4 +590,5 @@ if __name__ == '__main__':
assert args.process_number > 1, "Process number should be greater than 1"
coordinators()
exclude()
setclass()
# TODO: fix the failure where one process is not available after setclass call
#setclass()


@ -39,9 +39,6 @@ function(configure_testing)
endfunction()
function(verify_testing)
if(NOT ENABLE_SIMULATION_TESTS)
return()
endif()
foreach(test_file IN LISTS fdb_test_files)
message(SEND_ERROR "${test_file} found but it is not associated with a test")
endforeach()
@ -95,6 +92,10 @@ function(add_fdb_test)
if((NOT test_name MATCHES "${TEST_INCLUDE}") OR (test_name MATCHES "${TEST_EXCLUDE}"))
return()
endif()
# We shouldn't run downgrade tests under valgrind: https://github.com/apple/foundationdb/issues/6322
if(USE_VALGRIND AND ${test_name} MATCHES .*to_.*)
return()
endif()
math(EXPR test_idx "${CURRENT_TEST_INDEX} + ${NUM_TEST_FILES}")
set(CURRENT_TEST_INDEX "${test_idx}" PARENT_SCOPE)
# set(<var> <value> PARENT_SCOPE) doesn't set the
@ -404,7 +405,7 @@ endfunction()
# Creates a single cluster before running the specified command (usually a ctest test)
function(add_fdbclient_test)
set(options DISABLED ENABLED)
set(oneValueArgs NAME PROCESS_NUMBER TEST_TIMEOUT)
set(oneValueArgs NAME PROCESS_NUMBER TEST_TIMEOUT WORKING_DIRECTORY)
set(multiValueArgs COMMAND)
cmake_parse_arguments(T "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
if(OPEN_FOR_IDE)
@ -413,6 +414,9 @@ function(add_fdbclient_test)
if(NOT T_ENABLED AND T_DISABLED)
return()
endif()
if(NOT T_WORKING_DIRECTORY)
set(T_WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
endif()
if(NOT T_NAME)
message(FATAL_ERROR "NAME is a required argument for add_fdbclient_test")
endif()
@ -422,6 +426,7 @@ function(add_fdbclient_test)
message(STATUS "Adding Client test ${T_NAME}")
if (T_PROCESS_NUMBER)
add_test(NAME "${T_NAME}"
WORKING_DIRECTORY ${T_WORKING_DIRECTORY}
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_cluster.py
--build-dir ${CMAKE_BINARY_DIR}
--process-number ${T_PROCESS_NUMBER}
@ -429,6 +434,7 @@ function(add_fdbclient_test)
${T_COMMAND})
else()
add_test(NAME "${T_NAME}"
WORKING_DIRECTORY ${T_WORKING_DIRECTORY}
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_cluster.py
--build-dir ${CMAKE_BINARY_DIR}
--
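The new WORKING_DIRECTORY argument lets callers pin where the wrapped command runs; the Go test registration earlier in this commit exercises it, for example:

add_fdbclient_test(
  NAME ${BGP_PATH}_go_test
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src/${BGP_PATH}
  COMMAND ${CMAKE_COMMAND} -E env ${library_path}=${CMAKE_BINARY_DIR}/lib ${testfile})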


@ -35,7 +35,7 @@ function(compile_boost)
endif()
# Update the user-config.jam
set(BOOST_ADDITIONAL_COMPILE_OPTIOINS "")
set(BOOST_ADDITIONAL_COMPILE_OPTIONS "")
foreach(flag IN LISTS BOOST_COMPILER_FLAGS COMPILE_BOOST_CXXFLAGS)
string(APPEND BOOST_ADDITIONAL_COMPILE_OPTIONS "<cxxflags>${flag} ")
endforeach()
@ -49,8 +49,8 @@ function(compile_boost)
include(ExternalProject)
set(BOOST_INSTALL_DIR "${CMAKE_BINARY_DIR}/boost_install")
ExternalProject_add("${COMPILE_BOOST_TARGET}Project"
URL "https://boostorg.jfrog.io/artifactory/main/release/1.77.0/source/boost_1_77_0.tar.bz2"
URL_HASH SHA256=fc9f85fc030e233142908241af7a846e60630aa7388de9a5fafb1f3a26840854
URL "https://boostorg.jfrog.io/artifactory/main/release/1.78.0/source/boost_1_78_0.tar.bz2"
URL_HASH SHA256=8681f175d4bdb26c52222665793eef08490d7758529330f98d3b29dd0735bccc
CONFIGURE_COMMAND ${BOOTSTRAP_COMMAND} ${BOOTSTRAP_ARGS} --with-libraries=${BOOTSTRAP_LIBRARIES} --with-toolset=${BOOST_TOOLSET}
BUILD_COMMAND ${B2_COMMAND} link=static ${COMPILE_BOOST_BUILD_ARGS} --prefix=${BOOST_INSTALL_DIR} ${USER_CONFIG_FLAG} install
BUILD_IN_SOURCE ON
@ -89,12 +89,12 @@ set(Boost_USE_STATIC_LIBS ON)
# Clang and Gcc will have different name mangling to std::call_once, etc.
if (UNIX AND CMAKE_CXX_COMPILER_ID MATCHES "Clang$")
list(APPEND CMAKE_PREFIX_PATH /opt/boost_1_72_0_clang)
set(BOOST_HINT_PATHS /opt/boost_1_72_0_clang)
list(APPEND CMAKE_PREFIX_PATH /opt/boost_1_78_0_clang)
set(BOOST_HINT_PATHS /opt/boost_1_78_0_clang)
message(STATUS "Using Clang version of boost::context")
else ()
list(APPEND CMAKE_PREFIX_PATH /opt/boost_1_72_0)
set(BOOST_HINT_PATHS /opt/boost_1_72_0)
list(APPEND CMAKE_PREFIX_PATH /opt/boost_1_78_0)
set(BOOST_HINT_PATHS /opt/boost_1_78_0)
message(STATUS "Using g++ version of boost::context")
endif ()
@ -113,7 +113,7 @@ if(WIN32)
return()
endif()
find_package(Boost 1.77.0 EXACT QUIET COMPONENTS context CONFIG PATHS ${BOOST_HINT_PATHS})
find_package(Boost 1.78.0 EXACT QUIET COMPONENTS context CONFIG PATHS ${BOOST_HINT_PATHS})
set(FORCE_BOOST_BUILD OFF CACHE BOOL "Forces cmake to build boost and ignores any installed boost")
if(Boost_FOUND AND NOT FORCE_BOOST_BUILD)


@ -1,6 +1,6 @@
# FindRocksDB
find_package(RocksDB 6.22.1)
find_package(RocksDB 6.27.3)
include(ExternalProject)
@ -22,6 +22,7 @@ if (RocksDB_FOUND)
-DWITH_SNAPPY=OFF
-DWITH_ZLIB=OFF
-DWITH_ZSTD=OFF
-DWITH_LIBURING=${WITH_LIBURING}
-DWITH_TSAN=${USE_TSAN}
-DWITH_ASAN=${USE_ASAN}
-DWITH_UBSAN=${USE_UBSAN}
@ -36,8 +37,8 @@ if (RocksDB_FOUND)
${BINARY_DIR}/librocksdb.a)
else()
ExternalProject_Add(rocksdb
URL https://github.com/facebook/rocksdb/archive/v6.22.1.tar.gz
URL_HASH SHA256=2df8f34a44eda182e22cf84dee7a14f17f55d305ff79c06fb3cd1e5f8831e00d
URL https://github.com/facebook/rocksdb/archive/refs/tags/v6.27.3.tar.gz
URL_HASH SHA256=ee29901749b9132692b26f0a6c1d693f47d1a9ed8e3771e60556afe80282bf58
CMAKE_ARGS -DUSE_RTTI=1 -DPORTABLE=${PORTABLE_ROCKSDB}
-DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@ -52,6 +53,7 @@ else()
-DWITH_SNAPPY=OFF
-DWITH_ZLIB=OFF
-DWITH_ZSTD=OFF
-DWITH_LIBURING=${WITH_LIBURING}
-DWITH_TSAN=${USE_TSAN}
-DWITH_ASAN=${USE_ASAN}
-DWITH_UBSAN=${USE_UBSAN}


@ -22,6 +22,9 @@ use_libcxx(_use_libcxx)
env_set(USE_LIBCXX "${_use_libcxx}" BOOL "Use libc++")
static_link_libcxx(_static_link_libcxx)
env_set(STATIC_LINK_LIBCXX "${_static_link_libcxx}" BOOL "Statically link libstdcpp/libc++")
env_set(TRACE_PC_GUARD_INSTRUMENTATION_LIB "" STRING "Path to a library containing an implementation for __sanitizer_cov_trace_pc_guard. See https://clang.llvm.org/docs/SanitizerCoverage.html for more info.")
env_set(PROFILE_INSTR_GENERATE OFF BOOL "If set, build FDB as an instrumentation build to generate profiles")
env_set(PROFILE_INSTR_USE "" STRING "If set, build FDB with profile")
set(USE_SANITIZER OFF)
if(USE_ASAN OR USE_VALGRIND OR USE_MSAN OR USE_TSAN OR USE_UBSAN)
@ -155,6 +158,10 @@ else()
# we always compile with debug symbols. CPack will strip them out
# and create a debuginfo rpm
add_compile_options(-ggdb -fno-omit-frame-pointer)
if(TRACE_PC_GUARD_INSTRUMENTATION_LIB)
add_compile_options(-fsanitize-coverage=trace-pc-guard)
link_libraries(${TRACE_PC_GUARD_INSTRUMENTATION_LIB})
endif()
if(USE_ASAN)
list(APPEND SANITIZER_COMPILE_OPTIONS
-fsanitize=address
@ -294,6 +301,18 @@ else()
-Wno-register
-Wno-unused-command-line-argument)
endif()
if (PROFILE_INSTR_GENERATE)
add_compile_options(-fprofile-instr-generate)
add_link_options(-fprofile-instr-generate)
endif()
if (NOT (PROFILE_INSTR_USE STREQUAL ""))
if (PROFILE_INSTR_GENERATE)
message(FATAL_ERROR "Can't set both PROFILE_INSTR_GENERATE and PROFILE_INSTR_USE")
endif()
add_compile_options(-Wno-error=profile-instr-out-of-date)
add_compile_options(-fprofile-instr-use=${PROFILE_INSTR_USE})
add_link_options(-fprofile-instr-use=${PROFILE_INSTR_USE})
endif()
endif()
if (USE_WERROR)
add_compile_options(-Werror)
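As a usage sketch for the two profiling options above (paths are illustrative; merging raw profiles with llvm-profdata is the standard clang PGO flow, not something this commit adds):

cmake -DPROFILE_INSTR_GENERATE=ON <src> && ninja    # instrumented build
# run representative workloads to produce *.profraw files, then:
llvm-profdata merge -output=fdb.profdata *.profraw
cmake -DPROFILE_INSTR_GENERATE=OFF -DPROFILE_INSTR_USE=/path/to/fdb.profdata <src> && ninja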


@ -25,19 +25,19 @@ if(DISABLE_TLS)
set(WITH_TLS OFF)
else()
set(OPENSSL_USE_STATIC_LIBS TRUE)
if(WIN32)
set(OPENSSL_MSVC_STATIC_RT ON)
endif()
find_package(OpenSSL)
if(OPENSSL_FOUND)
set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR})
set(WITH_TLS ON)
add_compile_options(-DHAVE_OPENSSL)
else()
message(STATUS "OpenSSL was not found - Will compile without TLS Support")
message(STATUS "You can set OPENSSL_ROOT_DIR to help cmake find it")
set(WITH_TLS OFF)
endif()
if(WIN32)
message(STATUS "TLS is temporarilty disabled on Windows while libressl -> openssl transition happens")
set(WITH_TLS OFF)
endif()
endif()
################################################################################
@ -162,6 +162,8 @@ endif()
################################################################################
set(SSD_ROCKSDB_EXPERIMENTAL ON CACHE BOOL "Build with experimental RocksDB support")
set(PORTABLE_ROCKSDB ON CACHE BOOL "Compile RocksDB in portable mode") # Set this to OFF to compile RocksDB with `-march=native`
set(WITH_LIBURING OFF CACHE BOOL "Build with liburing enabled") # Set this to ON to include liburing
# RocksDB is currently enabled by default for GCC but does not build with the latest
# Clang.
if (SSD_ROCKSDB_EXPERIMENTAL AND GCC)

cmake/Finduring.cmake Normal file

@ -0,0 +1,26 @@
# - Find liburing
#
# uring_INCLUDE_DIR - Where to find liburing.h
# uring_LIBRARIES - List of libraries when using uring.
# uring_FOUND - True if uring found.
find_path(uring_INCLUDE_DIR
NAMES liburing.h)
find_library(uring_LIBRARIES
NAMES liburing.a liburing)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(uring
DEFAULT_MSG uring_LIBRARIES uring_INCLUDE_DIR)
mark_as_advanced(
uring_INCLUDE_DIR
uring_LIBRARIES)
if(uring_FOUND AND NOT TARGET uring::uring)
add_library(uring::uring UNKNOWN IMPORTED)
set_target_properties(uring::uring PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${uring_INCLUDE_DIR}"
IMPORTED_LINK_INTERFACE_LANGUAGES "C"
IMPORTED_LOCATION "${uring_LIBRARIES}")
endif()
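A consumer would use this module through the usual imported-target pattern; a hedged sketch (the target being linked is illustrative):

find_package(uring)
if(uring_FOUND)
  target_link_libraries(fdbserver PRIVATE uring::uring)
endif()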


@ -1,5 +1,6 @@
set(SRCS
${CMAKE_CURRENT_BINARY_DIR}/Program.cs
${CMAKE_CURRENT_BINARY_DIR}/VersionInfo.cs
${CMAKE_CURRENT_SOURCE_DIR}/Program.cs
Properties/AssemblyInfo.cs)
set(TEST_HARNESS_REFERENCES
@ -7,7 +8,7 @@ set(TEST_HARNESS_REFERENCES
set(out_file ${CMAKE_BINARY_DIR}/packages/bin/TestHarness.exe)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Program.cs.cmake ${CMAKE_CURRENT_BINARY_DIR}/Program.cs)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/VersionInfo.cs.cmake ${CMAKE_CURRENT_BINARY_DIR}/VersionInfo.cs)
add_custom_command(OUTPUT ${out_file}
COMMAND ${MCS_EXECUTABLE} ARGS ${TEST_HARNESS_REFERENCES} ${SRCS} "-target:exe" "-out:${out_file}"


@ -160,7 +160,7 @@ namespace SummarizeTest
}
else if (args[0] == "version")
{
return VersionInfo();
return VersionInfo.Show();
}
return UsageMessage();
@ -1633,16 +1633,6 @@ namespace SummarizeTest
}
}
private static int VersionInfo()
{
Console.WriteLine("Version: 1.02");
Console.WriteLine("FDB Project Ver: " + "${FDB_VERSION}");
Console.WriteLine("FDB Version: " + "${FDB_VERSION_MAJOR}" + "." + "${FDB_VERSION_MINOR}");
Console.WriteLine("Source Version: " + "${CURRENT_GIT_VERSION}");
return 1;
}
private static int UsageMessage()
{
Console.WriteLine("Usage:");
@ -1653,7 +1643,7 @@ namespace SummarizeTest
Console.WriteLine(" TestHarness remote [queue folder] [root foundation folder] [duration in hours] [amount of tests] [all/fast/<test_path>] [scope]");
Console.WriteLine(" TestHarness extract-errors [summary-file] [error-summary-file]");
Console.WriteLine(" TestHarness joshua-run <useValgrind> <maxTries>");
VersionInfo();
VersionInfo.Show();
return 1;
}
}


@ -0,0 +1,15 @@
using System;
namespace SummarizeTest {
static class VersionInfo {
public static int Show()
{
Console.WriteLine("Version: 1.02");
Console.WriteLine("FDB Project Ver: " + "${FDB_VERSION}");
Console.WriteLine("FDB Version: " + "${FDB_VERSION_MAJOR}" + "." + "${FDB_VERSION_MINOR}");
Console.WriteLine("Source Version: " + "${CURRENT_GIT_VERSION}");
return 1;
}
}
}

contrib/ctest_to_joshua.py Normal file

@ -0,0 +1,135 @@
from argparse import ArgumentParser
import glob
import io
import json
import os
import platform
import shlex
import subprocess
import tarfile
import tempfile
class JoshuaBuilder:
def __init__(self, build_dir: str, src_dir: str):
self.files = {}
self.build_dir = build_dir
self.src_dir = src_dir
def add_arg(self, arg: str) -> str:
"""Infer files to add to the joshua package from a command line arg to a test"""
if os.path.exists(arg) and arg.endswith(".py"):
dirname = os.path.dirname(arg)
for potential_dep in glob.glob("{}/*.py".format(dirname)):
self._add_arg(potential_dep)
if ".jar:" in arg:
# Assume it's a classpath
return ":".join(self._add_arg(jar) for jar in arg.split(":"))
return self._add_arg(arg)
def _add_arg(self, arg: str) -> str:
if os.path.exists(arg):
if not os.path.relpath(arg, self.build_dir).startswith(".."):
relpath = "build/" + os.path.relpath(arg, self.build_dir)
self.files[arg] = relpath
return relpath
elif not os.path.relpath(arg, self.src_dir).startswith(".."):
relpath = "src/" + os.path.relpath(arg, self.src_dir)
self.files[arg] = relpath
return relpath
elif os.access(arg, os.X_OK):
# Hope it's on the path
name = os.path.basename(arg)
if name.startswith("python3"):
name = "python3"
return name
else:
assert False, "Not sure what to do with {}".format(arg)
return arg
@staticmethod
def _add_file(tar, file, arcfile):
if "bin/" in arcfile or "lib" in arcfile:
print("Stripping debug symbols and adding {} as {}".format(file, arcfile))
with tempfile.NamedTemporaryFile() as tmp:
subprocess.check_output(["strip", "-S", file, "-o", tmp.name])
tar.add(tmp.name, arcfile)
else:
print("Adding {} as {}".format(file, arcfile))
tar.add(file, arcfile)
def write_tarball(self, output, joshua_test):
with tarfile.open(output, "w:gz") as tar:
for file, arcfile in self.files.items():
if not os.path.isdir(file):
self._add_file(tar, file, arcfile)
tarinfo = tarfile.TarInfo("joshua_test")
tarinfo.mode = 0o755
joshua_bytes = joshua_test.encode("utf-8")
tarinfo.size = len(joshua_bytes)
tar.addfile(tarinfo, io.BytesIO(joshua_bytes))
def get_ctest_json(build_dir, extra_args):
return json.loads(
subprocess.check_output(
["ctest", "-N", "--show-only=json-v1"] + extra_args, cwd=build_dir
).decode("utf-8")
)
def main():
parser = ArgumentParser(
description="""
Convert fdb build directory and src directory to a joshua package that runs the ctest tests.
Unknown arguments are forwarded to ctest, so you may use -R to filter tests e.g."""
)
parser.add_argument(
"--build-dir",
metavar="BUILD_DIRECTORY",
help="FDB build directory",
required=True,
)
parser.add_argument(
"--src-dir", metavar="SRC_DIRECTORY", help="FDB source directory", required=True
)
parser.add_argument(
"--output",
metavar="OUTPUT",
help="Where to write the joshua package",
required=True,
)
args, unknown_args = parser.parse_known_args()
ctest_json = get_ctest_json(args.build_dir, unknown_args)
joshua_builder = JoshuaBuilder(args.build_dir, args.src_dir)
commands = []
for test in ctest_json["tests"]:
command = test.get("command")
if command is not None:
commands.append(
" ".join(shlex.quote(joshua_builder.add_arg(arg)) for arg in command)
)
print("Found test: {}".format(commands[-1]))
joshua_builder.add_arg(os.path.join(args.build_dir, "bin/fdbbackup"))
joshua_builder.add_arg(os.path.join(args.build_dir, "bin/fdbcli"))
joshua_builder.add_arg(os.path.join(args.build_dir, "bin/fdbmonitor"))
joshua_builder.add_arg(os.path.join(args.build_dir, "bin/fdbserver"))
if platform.system() == "Darwin":
joshua_builder.add_arg(os.path.join(args.build_dir, "lib/libfdb_c.dylib"))
else:
joshua_builder.add_arg(os.path.join(args.build_dir, "lib/libfdb_c.so"))
joshua_test = '#!/bin/bash\nexport BASH_XTRACEFD=1\nset -euxo pipefail\nexport {library_path}=build/lib:"${library_path}"\n'.format(
library_path="DYLD_LIBRARY_PATH"
if platform.system() == "Darwin"
else "LD_LIBRARY_PATH"
)
joshua_builder.write_tarball(
args.output,
joshua_test + "\n".join(command + " 2>&1" for command in commands),
)
if __name__ == "__main__":
main()
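A plausible invocation (arguments per the parser above; the -R filter is among the unknown arguments forwarded to ctest):

python3 contrib/ctest_to_joshua.py --build-dir ~/build --src-dir ~/foundationdb --output joshua_package.tar.gz -R fdb_c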


@ -48,8 +48,11 @@ PROTOCOL_VERSION_6_0 = 0x0FDB00A570010001
PROTOCOL_VERSION_6_1 = 0x0FDB00B061060001
PROTOCOL_VERSION_6_2 = 0x0FDB00B062010001
PROTOCOL_VERSION_6_3 = 0x0FDB00B063010001
PROTOCOL_VERSION_7_0 = 0x0FDB00B070010001
PROTOCOL_VERSION_7_1 = 0x0FDB00B071010001
supported_protocol_versions = frozenset([PROTOCOL_VERSION_5_2, PROTOCOL_VERSION_6_0, PROTOCOL_VERSION_6_1,
PROTOCOL_VERSION_6_2, PROTOCOL_VERSION_6_3])
PROTOCOL_VERSION_6_2, PROTOCOL_VERSION_6_3, PROTOCOL_VERSION_7_0,
PROTOCOL_VERSION_7_1])
fdb.api_version(520)
@ -166,6 +169,11 @@ class MutationType(Enum):
MIN = 13
SET_VERSION_STAMPED_KEY = 14
SET_VERSION_STAMPED_VALUE = 15
BYTE_MIN = 16
BYTE_MAX = 17
MIN_V2 = 18
AND_V2 = 19
COMPARE_AND_CLEAR = 20
class Mutation(object):
@ -176,7 +184,11 @@ class Mutation(object):
class BaseInfo(object):
"""
Corresponds to FdbClientLogEvents::Event
"""
def __init__(self, bb, protocol_version):
# we already read the EventType, so go straight to start_timestamp
self.start_timestamp = bb.get_double()
if protocol_version >= PROTOCOL_VERSION_6_3:
self.dc_id = bb.get_bytes_with_length()
@ -281,6 +293,7 @@ class ClientTransactionInfo:
protocol_version = bb.get_long()
if protocol_version not in supported_protocol_versions:
raise UnsupportedProtocolVersionError(protocol_version)
# keep in sync with subclasses of FdbClientLogEvents::Event in fdbclient/ClientLogEvents.h
while bb.get_remaining_bytes():
event = bb.get_int()
if event == 0:


@ -18,7 +18,7 @@ As an essential component of a database system, backup and restore is commonly u
The FDB backup system continuously scans the database's key-value space, saving key-value pairs and mutations at versions into range files and log files in blob storage. Specifically, mutation logs are generated at the CommitProxy and are written to transaction logs along with regular mutations. In production clusters like CK clusters, the backup system is always on, which means each mutation is written twice to transaction logs, consuming about half of the write bandwidth and about 40% of CommitProxy CPU time.
The design of old backup system is [here](https://github.com/apple/foundationdb/blob/master/design/backup.md), and the data format of range files and mutations files is [here](https://github.com/apple/foundationdb/blob/master/design/backup-dataFormat.md). The technical overview of FDB is [here](https://github.com/apple/foundationdb/wiki/Technical-Overview-of-the-Database). The FDB recovery is described in this [doc](https://github.com/apple/foundationdb/blob/master/design/recovery-internals.md).
The design of the old backup system is [here](https://github.com/apple/foundationdb/blob/main/design/backup.md), and the data format of range files and mutation files is [here](https://github.com/apple/foundationdb/blob/main/design/backup-dataFormat.md). The technical overview of FDB is [here](https://github.com/apple/foundationdb/wiki/Technical-Overview-of-the-Database). The FDB recovery is described in this [doc](https://github.com/apple/foundationdb/blob/main/design/recovery-internals.md).
## Terminology
@ -117,12 +117,12 @@ This project saves the mutation log to blob storage directly from the FDB cluste
**Design question 1**: Should backup workers be recruited as part of log system or not?
There are two design alternatives:
1. Backup worker is external to the log system. In other words, backup workers survive master recovery. Thus, backup workers are recruited and monitored by the cluster controller.
1. The advantage is that the failure of backup workers does not cause master recovery.
2. The disadvantage is that backup workers need to monitor master recovery, especially configuration changes. Because the number of log routers can change after a recovery, we might need to recruit more backup workers for an increase and need to pause/shutdown backup workers for a decrease, which complicates the recruitment logic; or we might need to changing the mapping of tags to backup workers, which is also complex. A further complication is that backup workers need to constantly monitor master recovery and be very careful about the version boundary between two consecutive epochs, because the number of tags may change.
2. Backup worker is recruited during master recovery as part of log system. The Master recruits a fixed number of backup workers, i.e., the same number as LogRouters.
1. Backup worker is external to the log system. In other words, backup workers survive cluster recovery. Thus, backup workers are recruited and monitored by the cluster controller.
1. The advantage is that the failure of backup workers does not cause cluster recovery.
2. The disadvantage is that backup workers need to monitor cluster recovery, especially configuration changes. Because the number of log routers can change after a recovery, we might need to recruit more backup workers for an increase and pause or shut down backup workers for a decrease, which complicates the recruitment logic; or we might need to change the mapping of tags to backup workers, which is also complex. A further complication is that backup workers need to constantly monitor cluster recovery and be very careful about the version boundary between two consecutive epochs, because the number of tags may change.
2. Backup worker is recruited during cluster recovery as part of log system. The Cluster Controller recruits a fixed number of backup workers, i.e., the same number as LogRouters.
1. The advantage is that recruiting and mapping from backup worker to LogRouter tags are simple, i.e., one tag per worker.
2. The disadvantages is that backup workers are tied with master recovery -- a failure of a backup worker results in a master recovery, and a master recovery stops old backup workers and starts new ones.
2. The disadvantage is that backup workers are tied to cluster recovery -- a failure of a backup worker results in a cluster recovery, and a cluster recovery stops old backup workers and starts new ones.
**Decision**: We choose the second approach for the simplicity of the recruiting process and of the mapping from LogRouter tags to backup workers.
@ -151,7 +151,7 @@ The requirement of the new backup system raises several design challenges:
**Backup Worker**: This is a new role introduced in the new backup system. A backup worker is a `fdbserver` process running inside a FDB cluster, responsible for pulling mutations from transaction logs and saving the mutations to blob storage.
**Master**: The master is responsible for coordinating the transition of the FDB transaction sub-system from one generation to the next. In particular, the master recruits backup workers during the recovery.
**Cluster Controller (CC)**: The CC is responsible for coordinating the transition of the FDB transaction sub-system from one generation to the next. In particular, the CC recruits backup workers during the recovery.
**Transaction Logs (TLogs)**: The transaction logs make mutations durable to disk for fast commit latencies. The logs receive commits from the commit proxy in version order, and only respond to the commit proxy once the data has been written and fsync'ed to an append only mutation log on disk. Storage servers retrieve mutations from TLogs. Once the storage servers have persisted mutations, storage servers then pop the mutations from the TLogs.
@ -176,15 +176,15 @@ Backup worker is a new role introduced in the new backup system. A backup worker
Backup worker has two modes of operation: *no-op* mode, and *working* mode. When there is no active backup in the cluster, backup worker operates in the no-op mode, which simply obtains the recently committed version from Proxies and then pops mutations from transaction logs. After operators submit a new backup request to the cluster, backup workers transition into the working mode that starts pulling mutations from transaction logs and saving the mutation data to blob storage.
In the working mode, the popping of backup workers need to follow a strictly increasing version order. For the same tag, there could be multiple backup workers, each is responsible for a different epoch. These backup workers must coordinating their popping order, otherwise the backup can miss some mutation data. This coordination among backup workers is achieved by deferring popping of a later epoch and only allowing the oldest epoch to pop first. After the oldest epoch has finished, these corresponding backup workers notifies the master, which will then advances the oldest backup epoch so that the next epoch can proceed the popping.
In the working mode, the popping of backup workers needs to follow a strictly increasing version order. For the same tag, there can be multiple backup workers, each responsible for a different epoch. These backup workers must coordinate their popping order, otherwise the backup can miss some mutation data. This coordination is achieved by deferring the popping of a later epoch and only allowing the oldest epoch to pop first. After the oldest epoch has finished, the corresponding backup workers notify the CC, which then advances the oldest backup epoch so that the next epoch can proceed with popping.
A subtle issue for a displaced backup worker (i.e., being displaced because a new epoch begins), is that the last pop of the backup worker can cause missing version ranges in mutation logs. This is because the transaction for saving the progress may be delayed during recovery. As a result, the master could already recruited a new backup worker for the old epoch starting at the previously saved progress version. Then the saving transaction succeeds, and the worker pops mutations that the new backup worker is supposed to save, resulting in missing data for new backup workers log. The solution to this problem can be: 1) the old backup worker aborts immediately after knowing itself is displaced, thus not trying to save its progress; or 2) the old backup worker skip its last pop, since the next epoch will pop versions larger than its progress. Because the second approach avoids doing duplicated work in the new epoch, we choose to the second approach.
A subtle issue for a displaced backup worker (i.e., one displaced because a new epoch begins) is that its last pop can cause missing version ranges in mutation logs. This is because the transaction for saving the progress may be delayed during recovery. As a result, the CC could have already recruited a new backup worker for the old epoch, starting at the previously saved progress version. Then the saving transaction succeeds, and the worker pops mutations that the new backup worker is supposed to save, resulting in missing data in the new backup worker's log. The solution to this problem can be: 1) the old backup worker aborts immediately after learning it is displaced, thus not trying to save its progress; or 2) the old backup worker skips its last pop, since the next epoch will pop versions larger than its progress. Because the second approach avoids doing duplicated work in the new epoch, we choose the second approach.
Finally, multiple concurrent backups are supported. Each backup worker keeps track of current backup jobs and saves mutations to corresponding backup containers for the same batch of mutations.
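A minimal sketch of the deferred-popping rule described above (Python pseudocode with illustrative names, not actual FDB code):

```python
def may_pop(worker, oldest_backup_epoch):
    # Only workers whose backupEpoch equals the oldest active backup
    # epoch may pop; workers for later epochs defer their pops.
    return worker.backup_epoch == oldest_backup_epoch

def on_epoch_finished(finished_epoch, active_backup_epochs):
    # When the oldest epoch finishes, the CC advances oldestBackupEpoch
    # so the next epoch can proceed with popping.
    active_backup_epochs.discard(finished_epoch)
    return min(active_backup_epochs, default=None)
```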
### Recruitment of Backup workers
Backup workers are recruited during master recovery as part of log system. The Master recruits a fixed number of backup workers, one for each log router tag. During the recruiting process, the master sends backup worker initialization request as:
Backup workers are recruited during cluster recovery as part of the log system. The CC recruits a fixed number of backup workers, one for each log router tag. During the recruiting process, the CC sends a backup worker initialization request as:
```
struct InitializeBackupRequest {
@ -200,11 +200,11 @@ struct InitializeBackupRequest {
```
Note we need two epochs here: one for the recruited epoch and one for backing up epoch. The recruited epoch is the epoch of the log system, which is used by a backup worker to find out if it works for the current epoch. If so, the worker should save its progress and immediately exit. The `backupEpoch` is used for saving progress. The `backupEpoch` is usually the same as the epoch that the worker is recruited. However, it can be some earlier epoch than the recruiting epoch, signifying that the worker is responsible for data in that earlier epoch. In this case, when the worker is done and exits, the master should not flag its departure as a trigger of recovery. This is solved by the following protocol:
Note we need two epochs here: one for the recruited epoch and one for the backing-up epoch. The recruited epoch is the epoch of the log system, which is used by a backup worker to find out if it works for the current epoch. If so, the worker should save its progress and immediately exit. The `backupEpoch` is used for saving progress. The `backupEpoch` is usually the same as the epoch in which the worker is recruited. However, it can be an earlier epoch than the recruiting epoch, signifying that the worker is responsible for data in that earlier epoch. In this case, when the worker is done and exits, the CC should not flag its departure as a trigger of recovery. This is solved by the following protocol:
1. The backup worker finishes its work, including saving progress to the key value store and uploading to cloud storage, and then sends a `BackupWorkerDoneRequest` to the master;
2. The master receives the request, removes the worker from its log system, and updates the oldest backing up epoch `oldestBackupEpoch`;
3. The master sends backup a reply message to the backup worker and registers the new log system with cluster controller;
1. The backup worker finishes its work, including saving progress to the key value store and uploading to cloud storage, and then sends a `BackupWorkerDoneRequest` to the CC;
2. The CC receives the request, removes the worker from its log system, and updates the oldest backing up epoch `oldestBackupEpoch`;
3. The CC sends a reply message to the backup worker and registers the new log system;
4. The backup worker exits after receiving the reply. Other backup workers in the system get the new log system from the cluster controller. If a backup worker's `backupEpoch` is equal to `oldestBackupEpoch`, then the worker may start popping from TLogs.
Note `oldestBackupEpoch` is introduced to prevent a backup worker for a newer epoch from popping when there are backup workers for older epochs. Otherwise, these older backup workers may lose data.
@ -271,7 +271,7 @@ The backup system must generate log files that the restore system can apply all
**Ordering guarantee**. To maintain the ordering of mutations, each mutation is stored with its commit version and a subsequence number, both of which are assigned by Proxies during commit. The restore system can load all mutations and derive a total order among all the mutations.
**Completeness guarantee**. All mutations should be saved in log files. We cannot allow any mutations missing from the backup. This is guaranteed by the fault tolerance discussed below. Essentially all backup workers checkpoint their progress in the database. After the recovery, the new master reads previous checkpoints and recruit new backup workers for any missing version ranges.
**Completeness guarantee**. All mutations should be saved in log files. We cannot allow any mutations to be missing from the backup. This is guaranteed by the fault tolerance discussed below. Essentially, all backup workers checkpoint their progress in the database. After the recovery, the new CC reads previous checkpoints and recruits new backup workers for any missing version ranges.
## Backup File Format
@ -306,9 +306,9 @@ The information can be used to optimize the restore process. For instance, the n
## Fault Tolerance
Failures of a backup worker will trigger a master recovery. After the recovery, the new master recruits a new set of backup workers. Among them, a new backup worker shall continue the work of the failed backup worker from the previous epoch.
Failures of a backup worker will trigger a cluster recovery. After the recovery, the new CC recruits a new set of backup workers. Among them, a new backup worker shall continue the work of the failed backup worker from the previous epoch.
The interesting part is the handling of old epochs, since the backup workers for the old epoch are in the “displaced” state and should exit. So the basic idea is that we need a set of backup workers for the data left in the old epochs. To figure out the set of data not backed up yet, the master first loads saved backup progress data `<Worker_UID, LogEpoch, SavedVersion, Tag, TotalTags> `from the database, and then computes for each epoch, what version ranges have not been backed up. For each of the version range and tag, master recruit a worker to resume the backup for that version range and tag. Note that this worker has a different worker UID from the worker in the original epoch. As a result, for a given epoch and a tag, there might be multiple progress status, as these workers are recruited at different epochs.
The interesting part is the handling of old epochs, since the backup workers for the old epoch are in the “displaced” state and should exit. So the basic idea is that we need a set of backup workers for the data left in the old epochs. To figure out the set of data not yet backed up, the CC first loads saved backup progress data `<Worker_UID, LogEpoch, SavedVersion, Tag, TotalTags>` from the database, and then computes, for each epoch, which version ranges have not been backed up. For each such version range and tag, the CC recruits a worker to resume the backup for that version range and tag. Note that this worker has a different worker UID from the worker in the original epoch. As a result, for a given epoch and tag, there might be multiple progress entries, as these workers are recruited at different epochs.
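A sketch of that computation (Python pseudocode; field names follow the progress tuple above, and the epoch boundary versions are assumed to be known from the log system):

```python
def missing_ranges(progress, epoch_begin, epoch_end, total_tags):
    # progress: {(epoch, tag): saved_version}, loaded from the database
    ranges = []
    for epoch, end_version in epoch_end.items():
        for tag in range(total_tags[epoch]):
            saved = progress.get((epoch, tag), epoch_begin[epoch] - 1)
            if saved < end_version:
                # Recruit a worker (with a new UID) for [saved + 1, end_version]
                ranges.append((epoch, tag, saved + 1, end_version))
    return ranges
```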
## KPI's and Metrics


@ -71,7 +71,7 @@ This phase locks the coordinated state (cstate) to make sure there is only one m
Recall that `ServerDBInfo` contains the master's interface and is propagated by the CC to every process in the cluster. The currently running tLogs can use the master interface in their `ServerDBInfo` to send their own interfaces to the master.
The master simply waits on the `TLogRejoinRequest` streams: for each tLog interface received, the master compares the interface ID with the tLog ID read from the cstate. Once the master collects enough old tLog interfaces, it will use them to lock those tLogs.
The logic of collecting tLogs interfaces is implemented in `trackRejoins()` function.
The logic of locking the tLogs is implemented in `epochEnd()` function in [TagPartitionedLogSystems.actor.cpp](https://github.com/apple/foundationdb/blob/master/fdbserver/TagPartitionedLogSystem.actor.cpp).
The logic of locking the tLogs is implemented in `epochEnd()` function in [TagPartitionedLogSystems.actor.cpp](https://github.com/apple/foundationdb/blob/main/fdbserver/TagPartitionedLogSystem.actor.cpp).
Once we lock the cstate, we bump the `recoveryCount` by 1 and write the `recoveryCount` to the cstate. Each tLog in a recovery attempt records the `recoveryCount` and monitors changes to it. If the `recoveryCount` increases beyond the recorded value, the tLog will terminate itself. This mechanism makes sure that when multiple recovery attempts happen concurrently, only tLogs in the most recent recovery will be running. tLogs in other recovery attempts can release their memory earlier, reducing memory pressure during recovery. This was an important memory optimization before shared tLogs, which allow tLogs in different generations to share the same memory, were introduced.
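The guard each tLog applies can be sketched as follows (pseudocode, not actual FDB code):

```python
def on_cstate_update(tlog, observed_recovery_count):
    # The tLog recorded the recoveryCount of the attempt it joined; if a
    # later recovery bumps the counter past that value, this tLog
    # terminates itself so only the most recent attempt keeps running.
    if observed_recovery_count > tlog.recorded_recovery_count:
        tlog.terminate()
```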
@ -118,7 +118,7 @@ Consider an old generation with three TLogs: `A, B, C`. Their durable versions a
* Situation 2: A tLog may die after it reports alive to the CC in the RECRUITING phase. This may cause the `knownCommittedVersion` calculated by the CC in this phase to no longer be valid in the next phases. When this happens, the CC will detect it, terminate the current recovery, and start a new recovery.
Once we have a `knownCommittedVersion`, the CC will reconstruct the [transaction state store](https://github.com/apple/foundationdb/blob/master/design/transaction-state-store.md) by peeking the txnStateTag in oldLogSystem.
Once we have a `knownCommittedVersion`, the CC will reconstruct the [transaction state store](https://github.com/apple/foundationdb/blob/main/design/transaction-state-store.md) by peeking the txnStateTag in oldLogSystem.
Recall that the txnStateStore includes the transaction system's configuration, such as the assignment of shards to SSes and to tLogs, and that the txnStateStore was durable on disk in the oldLogSystem.
Once we get the txnStateStore, we know the configuration of the transaction system, such as the number of GRV proxies and commit proxies. The CC recruits roles for the new generation in the `recruitEverything()` function. Those recruited roles includes GRV proxies, commit proxies, tLogs and seed SSes, which are the storage servers created for an empty database in the first generation to host the first shard and serve as the starting point of the bootstrap process to recruit more SSes. Once all roles are recruited, the CC starts a new epoch in `newEpoch()`.


@ -137,7 +137,7 @@ storage server is responsible for which data range, and ensures data is
evenly distributed across all storage servers (SS). Data distributor as
a singleton in the cluster is recruited and monitored by Cluster
Controller. See `internal
documentation <https://github.com/apple/foundationdb/blob/master/design/data-distributor-internals.md>`__.
documentation <https://github.com/apple/foundationdb/blob/main/design/data-distributor-internals.md>`__.
Ratekeeper
~~~~~~~~~~
@ -313,7 +313,7 @@ Transaction Logs. After previous Log Servers are stopped and new transaction
system is recruited, the Master writes the coordinated states with current
transaction system information. Finally, the Master accepts new
transaction commits. See details in this
`documentation <https://github.com/apple/foundationdb/blob/master/design/recovery-internals.md>`__.
`documentation <https://github.com/apple/foundationdb/blob/main/design/recovery-internals.md>`__.
Because GRV Proxies, Commit Proxies and Resolvers are stateless, their
recoveries have no extra work. In contrast, Transaction Logs save the
@ -345,16 +345,16 @@ Resources
Post <https://forums.foundationdb.org/t/technical-overview-of-the-database/135/26>`__
`Existing Architecture
Documentation <https://github.com/apple/foundationdb/blob/master/documentation/sphinx/source/kv-architecture.rst>`__
Documentation <https://github.com/apple/foundationdb/blob/main/documentation/sphinx/source/kv-architecture.rst>`__
`Summit
Presentation <https://www.youtube.com/watch?list=PLbzoR-pLrL6q7uYN-94-p_-Q3hyAmpI7o&v=EMwhsGsxfPU&feature=emb_logo>`__
`Data Distribution
Documentation <https://github.com/apple/foundationdb/blob/master/design/data-distributor-internals.md>`__
Documentation <https://github.com/apple/foundationdb/blob/main/design/data-distributor-internals.md>`__
`Recovery
Documentation <https://github.com/apple/foundationdb/blob/master/design/recovery-internals.md>`__
Documentation <https://github.com/apple/foundationdb/blob/main/design/recovery-internals.md>`__
.. |image0| image:: images/Architecture.png
.. |image1| image:: images/architecture-1.jpeg


@ -83,10 +83,11 @@ For blob store backup locations, the Backup URL format is
::
blobstore://<api_key>[:<secret>]@<hostname>[:<port>]/<name>?bucket=<bucket_name>[&<param>=<value>]...]
blobstore://[<api_key>][:<secret>[:<security_token>]]@<hostname>[:<port>]/<name>?bucket=<bucket_name>[&<param>=<value>]...]
<api_key> - API key to use for authentication
<api_key> - API key to use for authentication. Optional.
<secret> - API key's secret. Optional.
<security_token> - Security token if temporary credentials are used. Optional.
<hostname> - Remote hostname or IP address to connect to
<port> - Remote port to connect to. Optional. Default is 80.
<name> - Name of the backup within the backup bucket. It can contain '/' characters in order to organize backups into a folder-like structure.
@ -177,6 +178,17 @@ The Blob Credential File format is JSON with the following schema:
}
}
If temporary credentials are being used, the following schema is also supported
::
{
"accounts" : {
"@host" : { "api_key" : user, "secret" : "SECRETKEY", token: "TOKEN1" },
"@host2" : { "api_key" : user2, "secret" : "SECRETKEY2", token: "TOKEN2" }
}
}
TLS Support
===========
@ -405,6 +417,16 @@ The ``list`` subcommand will list the backups at a given 'base' or shortened Bac
``-b <BASE_URL>`` or ``--base-url <BASE_URL>``
This is a shortened Backup URL that looks just like a Backup URL but without the backup <name>, so that the list command will discover and list all of the backups in the bucket.
.. program:: fdbbackup list
``tags``
----------
The ``tags`` subcommand will list the tags of all backups on a source cluster.
::
user@host$ fdbbackup tags [-C <CLUSTER_FILE>]
.. program:: fdbbackup cleanup


@ -25,7 +25,7 @@ Features
* Added TLS support to fdbdecode for decoding mutation log files stored in blobs. `(PR #4611) <https://github.com/apple/foundationdb/pull/4611>`_
* Added ``initial_snapshot_interval`` to fdbbackup that can specify the duration of the first inconsistent snapshot written to the backup. `(PR #4620) <https://github.com/apple/foundationdb/pull/4620>`_
* Added ``inconsistent_snapshot_only`` to fdbbackup that ignores mutation log files and only uses range files during the restore to speedup the process. `(PR #4704) <https://github.com/apple/foundationdb/pull/4704>`_
* Added the Testing Storage Server (TSS), which allows FoundationDB to run an "untrusted" storage engine with identical workload to the current storage engine, with zero impact on durability or correctness, and minimal impact on performance. `(Documentation) <https://github.com/apple/foundationdb/blob/master/documentation/sphinx/source/tss.rst>`_ `(PR #4556) <https://github.com/apple/foundationdb/pull/4556>`_
* Added the Testing Storage Server (TSS), which allows FoundationDB to run an "untrusted" storage engine with identical workload to the current storage engine, with zero impact on durability or correctness, and minimal impact on performance. `(Documentation) <https://github.com/apple/foundationdb/blob/main/documentation/sphinx/source/tss.rst>`_ `(PR #4556) <https://github.com/apple/foundationdb/pull/4556>`_
* Added perpetual storage wiggle that supports less impactful B-trees recreation and data migration. These will also be used for deploying the Testing Storage Server which compares 2 storage engines' results. See :ref:`Documentation <perpetual-storage-wiggle>` for details. `(PR #4838) <https://github.com/apple/foundationdb/pull/4838>`_
* Improved the efficiency with which storage servers replicate data between themselves. `(PR #5017) <https://github.com/apple/foundationdb/pull/5017>`_
* Added support to ``exclude command`` to exclude based on locality match. `(PR #5113) <https://github.com/apple/foundationdb/pull/5113>`_


@ -18,7 +18,7 @@ The second part of transaction profiling involves deleting old sampled data to r
There are many ways that this data can be exposed for analysis. One can imagine building a client that reads the data from the database and streams it to external tools such as Wavefront.
One such tool thats available as part of open source FDB is a python script called ``transaction_profiling_analyzer.py`` that's available here on `GitHUb <https://github.com/apple/foundationdb/blob/master/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py>`_. It reads the sampled data from the database and outputs it in a user friendly format. Currently its most useful in identifying hot key-ranges (for both reading and writing).
One such tool that's available as part of open source FDB is a python script called ``transaction_profiling_analyzer.py``, available on `GitHub <https://github.com/apple/foundationdb/blob/main/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py>`_. It reads the sampled data from the database and outputs it in a user-friendly format. Currently it is most useful in identifying hot key-ranges (for both reading and writing).
Prerequisites
=============
View File
@ -95,7 +95,8 @@ enum class BackupType {
LIST,
QUERY,
DUMP,
CLEANUP
CLEANUP,
TAGS,
};
enum class DBType { UNDEFINED = 0, START, STATUS, SWITCH, ABORT, PAUSE, RESUME };
@ -599,6 +600,24 @@ CSimpleOpt::SOption g_rgBackupDumpOptions[] = {
SO_END_OF_OPTIONS
};
CSimpleOpt::SOption g_rgBackupTagsOptions[] = {
#ifdef _WIN32
{ OPT_PARENTPID, "--parentpid", SO_REQ_SEP },
#endif
{ OPT_CLUSTERFILE, "-C", SO_REQ_SEP },
{ OPT_CLUSTERFILE, "--cluster-file", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
{ OPT_TRACE_DIR, "--logdir", SO_REQ_SEP },
{ OPT_TRACE_FORMAT, "--trace-format", SO_REQ_SEP },
{ OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP },
{ OPT_QUIET, "-q", SO_NONE },
{ OPT_QUIET, "--quiet", SO_NONE },
#ifndef TLS_DISABLED
TLS_OPTION_FLAGS
#endif
SO_END_OF_OPTIONS
};
CSimpleOpt::SOption g_rgBackupListOptions[] = {
#ifdef _WIN32
{ OPT_PARENTPID, "--parentpid", SO_REQ_SEP },
@ -998,7 +1017,7 @@ void printBackupContainerInfo() {
static void printBackupUsage(bool devhelp) {
printf("FoundationDB " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n");
printf("Usage: %s [TOP_LEVEL_OPTIONS] (start | status | abort | wait | discontinue | pause | resume | expire | "
"delete | describe | list | query | cleanup) [ACTION_OPTIONS]\n\n",
"delete | describe | list | query | cleanup | tags) [ACTION_OPTIONS]\n\n",
exeBackup.toString().c_str());
printf(" TOP LEVEL OPTIONS:\n");
printf(" --build-flags Print build information and exit.\n");
@ -1424,6 +1443,7 @@ BackupType getBackupType(std::string backupType) {
values["query"] = BackupType::QUERY;
values["dump"] = BackupType::DUMP;
values["modify"] = BackupType::MODIFY;
values["tags"] = BackupType::TAGS;
}
auto i = values.find(backupType);
@ -2812,6 +2832,23 @@ ACTOR Future<Void> listBackup(std::string baseUrl) {
return Void();
}
ACTOR Future<Void> listBackupTags(Database cx) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
loop {
try {
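// Backup tags live in the system keyspace; the read opts into system keys and is lock-aware so the
// command still works on a locked cluster.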
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
std::vector<KeyBackedTag> tags = wait(getAllBackupTags(tr));
for (const auto& tag : tags) {
printf("%s\n", tag.tagName.c_str());
}
return Void();
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
struct BackupModifyOptions {
Optional<std::string> verifyUID;
Optional<std::string> destURL;
@ -3214,6 +3251,10 @@ int main(int argc, char* argv[]) {
args = std::make_unique<CSimpleOpt>(
argc - 1, &argv[1], g_rgBackupModifyOptions, SO_O_EXACT | SO_O_HYPHEN_TO_UNDERSCORE);
break;
case BackupType::TAGS:
args = std::make_unique<CSimpleOpt>(
argc - 1, &argv[1], g_rgBackupTagsOptions, SO_O_EXACT | SO_O_HYPHEN_TO_UNDERSCORE);
break;
case BackupType::UNDEFINED:
default:
args =
@ -4030,6 +4071,12 @@ int main(int argc, char* argv[]) {
f = stopAfter(listBackup(baseUrl));
break;
case BackupType::TAGS:
if (!initCluster())
return FDB_EXIT_ERROR;
f = stopAfter(listBackupTags(db));
break;
case BackupType::QUERY:
initTraceFile();
f = stopAfter(queryBackup(argv[0],
View File
@ -48,10 +48,10 @@ ACTOR Future<Void> changeFeedList(Database db) {
printf("Found %d range feeds%s\n", result.size(), result.size() == 0 ? "." : ":");
for (auto& it : result) {
auto range = std::get<0>(decodeChangeFeedValue(it.value));
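// printable() escapes non-printable bytes, so binary feed range endpoints render safely.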
printf(" %s: %s - %s\n",
printf(" %s: `%s' - `%s'\n",
it.key.removePrefix(changeFeedPrefix).toString().c_str(),
range.begin.toString().c_str(),
range.end.toString().c_str());
printable(range.begin).c_str(),
printable(range.end).c_str());
}
return Void();
} catch (Error& e) {
View File
@ -33,7 +33,7 @@
namespace {
// Exclue the given servers and localities
// Exclude the given servers and localities
ACTOR Future<bool> excludeServersAndLocalities(Reference<IDatabase> db,
std::vector<AddressExclusion> servers,
std::unordered_set<std::string> localities,
@ -381,23 +381,29 @@ ACTOR Future<bool> excludeCommandActor(Reference<IDatabase> db, std::vector<Stri
CommandFactory excludeFactory(
"exclude",
CommandHelp(
"exclude [FORCE] [failed] [no_wait] [<ADDRESS...>] [locality_dcid:<excludedcid>] "
"[locality_zoneid:<excludezoneid>] [locality_machineid:<excludemachineid>] "
"[locality_processid:<excludeprocessid>] or any locality data",
"exclude servers from the database either with IP address match or locality match",
"If no addresses or locaities are specified, lists the set of excluded addresses and localities."
"\n\nFor each IP address or IP:port pair in <ADDRESS...> or any LocalityData attributes (like dcid, "
"zoneid, "
"machineid, processid), adds the address/locality to the set of excluded servers and localities then waits "
"until all database state has been safely moved away from the specified servers. If 'no_wait' is set, the "
"command returns \nimmediately without checking if the exclusions have completed successfully.\n"
"exclude [FORCE] [failed] [no_wait] [<ADDRESS...>] [locality_dcid:<excludedcid>]\n"
" [locality_zoneid:<excludezoneid>] [locality_machineid:<excludemachineid>]\n"
" [locality_processid:<excludeprocessid>] [locality_<KEY>:<localtyvalue>]",
"exclude servers from the database by IP address or locality",
"If no addresses or localities are specified, lists the set of excluded addresses and localities.\n"
"\n"
"For each IP address or IP:port pair in <ADDRESS...> and/or each locality attribute (like dcid, "
"zoneid, machineid, processid), adds the address/locality to the set of exclusions and waits until all "
"database state has been safely moved away from affected servers.\n"
"\n"
"If 'FORCE' is set, the command does not perform safety checks before excluding.\n"
"If 'failed' is set, the transaction log queue is dropped pre-emptively before waiting\n"
"for data movement to finish and the server cannot be included again."
"\n\nWARNING of potential dataloss\n:"
"If a to-be-excluded server is the last server of some team(s), and 'failed' is set, the data in the team(s) "
"will be lost. 'failed' should be set only if the server(s) have permanently failed."
"In the case all servers of a team have failed permanently and dataloss has been a fact, excluding all the "
"servers will clean up the corresponding keyrange, and fix the invalid metadata. The keyrange will be "
"assigned to a new team as an empty shard."));
"\n"
"If 'no_wait' is set, the command returns immediately without checking if the exclusions have completed "
"successfully.\n"
"\n"
"If 'failed' is set, the cluster will immediately forget all data associated with the excluded processes. "
"Doing so can be helpful if the process is not expected to recover, as it will allow the cluster to delete "
"state that would be needed to catch the failed process up. Re-including a process excluded with 'failed' will "
"result in it joining as an empty process.\n"
"\n"
"If a cluster has failed storage servers that result in all replicas of some data being permanently gone, "
"'exclude failed' can be used to clean up the affected key ranges by restoring them to empty.\n"
"\n"
"WARNING: use of 'exclude failed' can result in data loss. If an excluded server contains the last replica of "
"some data, then using the 'failed' option will permanently remove that data from the cluster."));
} // namespace fdb_cli
View File
@ -123,7 +123,7 @@ LineNoise::LineNoise(std::function<void(std::string const&, std::vector<std::str
linenoiseSetFreeHintsCallback(free);
#endif
threadPool->addThread(reader);
threadPool->addThread(reader, "fdb-linenoise");
}
LineNoise::~LineNoise() {
View File
@ -77,6 +77,13 @@ ACTOR Future<bool> setProcessClass(Reference<IDatabase> db, KeyRef network_addre
loop {
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
try {
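// Read the entry first: an absent value means the address is not known to the cluster,
// so report it instead of blindly writing a new class type.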
state ThreadFuture<Optional<Value>> result =
tr->get(network_address.withPrefix(fdb_cli::processClassTypeSpecialKeyRange.begin));
Optional<Value> val = wait(safeThreadFutureToFuture(result));
if (!val.present()) {
printf("No matching addresses found\n");
return false;
}
tr->set(network_address.withPrefix(fdb_cli::processClassTypeSpecialKeyRange.begin), class_type);
wait(safeThreadFutureToFuture(tr->commit()));
return true;
View File
@ -1177,7 +1177,7 @@ void printStatus(StatusObjectReader statusObj,
}
}
// "db" is the handler to the multiversion databse
// "db" is the handler to the multiversion database
// localDb is the native Database object
// localDb is rarely needed except when "db" has not established a connection to the cluster, in which case the
// operation would return Never; since we expect the status command to always return, we use "localDb" to return
// the default result
View File
@ -85,6 +85,7 @@ enum {
OPT_HELP,
OPT_TRACE,
OPT_TRACE_DIR,
OPT_LOGGROUP,
OPT_TIMEOUT,
OPT_EXEC,
OPT_NO_STATUS,
@ -103,6 +104,7 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP },
{ OPT_DATABASE, "-d", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
{ OPT_TRACE_DIR, "--log-dir", SO_REQ_SEP },
{ OPT_LOGGROUP, "--log-group", SO_REQ_SEP },
{ OPT_TIMEOUT, "--timeout", SO_REQ_SEP },
{ OPT_EXEC, "--exec", SO_REQ_SEP },
{ OPT_NO_STATUS, "--no-status", SO_NONE },
@ -125,7 +127,7 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP },
SO_END_OF_OPTIONS };
void printAtCol(const char* text, int col) {
void printAtCol(const char* text, int col, FILE* stream = stdout) {
const char* iter = text;
const char* start = text;
const char* space = nullptr;
@ -137,7 +139,7 @@ void printAtCol(const char* text, int col) {
if (*iter == '\n' || *iter == '\0' || (iter - start == col)) {
if (!space)
space = iter;
printf("%.*s\n", (int)(space - start), start);
fprintf(stream, "%.*s\n", (int)(space - start), start);
start = space;
if (*start == ' ' || *start == '\n')
start++;
@ -427,6 +429,9 @@ static void printProgramUsage(const char* name) {
" --log-dir PATH Specifes the output directory for trace files. If\n"
" unspecified, defaults to the current directory. Has\n"
" no effect unless --log is specified.\n"
" --log-group LOG_GROUP\n"
" Sets the LogGroup field with the specified value for all\n"
" events in the trace output (defaults to `default').\n"
" --trace-format FORMAT\n"
" Select the format of the log files. xml (the default) and json\n"
" are supported. Has no effect unless --log is specified.\n"
@ -639,422 +644,6 @@ ACTOR Future<Void> commitTransaction(Reference<ITransaction> tr) {
return Void();
}
// FIXME: Factor address parsing from coordinators, include, exclude
ACTOR Future<bool> coordinators(Database db, std::vector<StringRef> tokens, bool isClusterTLS) {
state StringRef setName;
StringRef nameTokenBegin = LiteralStringRef("description=");
for (auto tok = tokens.begin() + 1; tok != tokens.end(); ++tok)
if (tok->startsWith(nameTokenBegin)) {
setName = tok->substr(nameTokenBegin.size());
std::copy(tok + 1, tokens.end(), tok);
tokens.resize(tokens.size() - 1);
break;
}
bool automatic = tokens.size() == 2 && tokens[1] == LiteralStringRef("auto");
state Reference<IQuorumChange> change;
if (tokens.size() == 1 && setName.size()) {
change = noQuorumChange();
} else if (automatic) {
// Automatic quorum change
change = autoQuorumChange();
} else {
state std::set<NetworkAddress> addresses;
state std::vector<StringRef>::iterator t;
for (t = tokens.begin() + 1; t != tokens.end(); ++t) {
try {
// SOMEDAY: Check for keywords
auto const& addr = NetworkAddress::parse(t->toString());
if (addresses.count(addr)) {
fprintf(stderr, "ERROR: passed redundant coordinators: `%s'\n", addr.toString().c_str());
return true;
}
addresses.insert(addr);
} catch (Error& e) {
if (e.code() == error_code_connection_string_invalid) {
fprintf(stderr, "ERROR: '%s' is not a valid network endpoint address\n", t->toString().c_str());
return true;
}
throw;
}
}
std::vector<NetworkAddress> addressesVec(addresses.begin(), addresses.end());
change = specifiedQuorumChange(addressesVec);
}
if (setName.size())
change = nameQuorumChange(setName.toString(), change);
CoordinatorsResult r = wait(makeInterruptable(changeQuorum(db, change)));
// Real errors get thrown from makeInterruptable and printed by the catch block in cli(), but
// there are various results specific to changeConfig() that we need to report:
bool err = true;
switch (r) {
case CoordinatorsResult::INVALID_NETWORK_ADDRESSES:
fprintf(stderr, "ERROR: The specified network addresses are invalid\n");
break;
case CoordinatorsResult::SAME_NETWORK_ADDRESSES:
printf("No change (existing configuration satisfies request)\n");
err = false;
break;
case CoordinatorsResult::NOT_COORDINATORS:
fprintf(stderr, "ERROR: Coordination servers are not running on the specified network addresses\n");
break;
case CoordinatorsResult::DATABASE_UNREACHABLE:
fprintf(stderr, "ERROR: Database unreachable\n");
break;
case CoordinatorsResult::BAD_DATABASE_STATE:
fprintf(stderr,
"ERROR: The database is in an unexpected state from which changing coordinators might be unsafe\n");
break;
case CoordinatorsResult::COORDINATOR_UNREACHABLE:
fprintf(stderr, "ERROR: One of the specified coordinators is unreachable\n");
break;
case CoordinatorsResult::SUCCESS:
printf("Coordination state changed\n");
err = false;
break;
case CoordinatorsResult::NOT_ENOUGH_MACHINES:
fprintf(stderr, "ERROR: Too few fdbserver machines to provide coordination at the current redundancy level\n");
break;
default:
ASSERT(false);
};
return err;
}
// Includes the servers that could be IP addresses or localities back to the cluster.
ACTOR Future<bool> include(Database db, std::vector<StringRef> tokens) {
std::vector<AddressExclusion> addresses;
state std::vector<std::string> localities;
state bool failed = false;
state bool all = false;
for (auto t = tokens.begin() + 1; t != tokens.end(); ++t) {
if (*t == LiteralStringRef("all")) {
all = true;
} else if (*t == LiteralStringRef("failed")) {
failed = true;
} else if (t->startsWith(LocalityData::ExcludeLocalityPrefix) && t->toString().find(':') != std::string::npos) {
// if the token starts with 'locality_' prefix.
localities.push_back(t->toString());
} else {
auto a = AddressExclusion::parse(*t);
if (!a.isValid()) {
fprintf(stderr,
"ERROR: '%s' is neither a valid network endpoint address nor a locality\n",
t->toString().c_str());
if (t->toString().find(":tls") != std::string::npos)
printf(" Do not include the `:tls' suffix when naming a process\n");
return true;
}
addresses.push_back(a);
}
}
if (all) {
std::vector<AddressExclusion> includeAll;
includeAll.push_back(AddressExclusion());
wait(makeInterruptable(includeServers(db, includeAll, failed)));
wait(makeInterruptable(includeLocalities(db, localities, failed, all)));
} else {
if (!addresses.empty()) {
wait(makeInterruptable(includeServers(db, addresses, failed)));
}
if (!localities.empty()) {
// includes the servers that belong to given localities.
wait(makeInterruptable(includeLocalities(db, localities, failed, all)));
}
}
return false;
};
ACTOR Future<bool> exclude(Database db,
std::vector<StringRef> tokens,
Reference<ClusterConnectionFile> ccf,
Future<Void> warn) {
if (tokens.size() <= 1) {
state Future<std::vector<AddressExclusion>> fexclAddresses = makeInterruptable(getExcludedServers(db));
state Future<std::vector<std::string>> fexclLocalities = makeInterruptable(getExcludedLocalities(db));
wait(success(fexclAddresses) && success(fexclLocalities));
std::vector<AddressExclusion> exclAddresses = fexclAddresses.get();
std::vector<std::string> exclLocalities = fexclLocalities.get();
if (!exclAddresses.size() && !exclLocalities.size()) {
printf("There are currently no servers or localities excluded from the database.\n"
"To learn how to exclude a server, type `help exclude'.\n");
return false;
}
printf("There are currently %zu servers or localities being excluded from the database:\n",
exclAddresses.size() + exclLocalities.size());
for (const auto& e : exclAddresses)
printf(" %s\n", e.toString().c_str());
for (const auto& e : exclLocalities)
printf(" %s\n", e.c_str());
printf("To find out whether it is safe to remove one or more of these\n"
"servers from the cluster, type `exclude <addresses>'.\n"
"To return one of these servers to the cluster, type `include <addresses>'.\n");
return false;
} else {
state std::vector<AddressExclusion> exclusionVector;
state std::set<AddressExclusion> exclusionSet;
state std::vector<AddressExclusion> exclusionAddresses;
state std::unordered_set<std::string> exclusionLocalities;
state std::vector<std::string> noMatchLocalities;
state bool force = false;
state bool waitForAllExcluded = true;
state bool markFailed = false;
state std::vector<ProcessData> workers = wait(makeInterruptable(getWorkers(db)));
for (auto t = tokens.begin() + 1; t != tokens.end(); ++t) {
if (*t == LiteralStringRef("FORCE")) {
force = true;
} else if (*t == LiteralStringRef("no_wait")) {
waitForAllExcluded = false;
} else if (*t == LiteralStringRef("failed")) {
markFailed = true;
} else if (t->startsWith(LocalityData::ExcludeLocalityPrefix) &&
t->toString().find(':') != std::string::npos) {
std::set<AddressExclusion> localityAddresses = getAddressesByLocality(workers, t->toString());
if (localityAddresses.empty()) {
noMatchLocalities.push_back(t->toString());
} else {
// add all the server ipaddresses that belong to the given localities to the exclusionSet.
exclusionVector.insert(exclusionVector.end(), localityAddresses.begin(), localityAddresses.end());
exclusionSet.insert(localityAddresses.begin(), localityAddresses.end());
}
exclusionLocalities.insert(t->toString());
} else {
auto a = AddressExclusion::parse(*t);
if (!a.isValid()) {
fprintf(stderr,
"ERROR: '%s' is neither a valid network endpoint address nor a locality\n",
t->toString().c_str());
if (t->toString().find(":tls") != std::string::npos)
printf(" Do not include the `:tls' suffix when naming a process\n");
return true;
}
exclusionVector.push_back(a);
exclusionSet.insert(a);
exclusionAddresses.push_back(a);
}
}
if (exclusionAddresses.empty() && exclusionLocalities.empty()) {
fprintf(stderr, "ERROR: At least one valid network endpoint address or a locality is not provided\n");
return true;
}
if (!force) {
if (markFailed) {
state bool safe;
try {
bool _safe = wait(makeInterruptable(checkSafeExclusions(db, exclusionVector)));
safe = _safe;
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled)
throw;
TraceEvent("CheckSafeExclusionsError").error(e);
safe = false;
}
if (!safe) {
std::string errorStr =
"ERROR: It is unsafe to exclude the specified servers at this time.\n"
"Please check that this exclusion does not bring down an entire storage team.\n"
"Please also ensure that the exclusion will keep a majority of coordinators alive.\n"
"You may add more storage processes or coordinators to make the operation safe.\n"
"Type `exclude FORCE failed <ADDRESS...>' to exclude without performing safety checks.\n";
printf("%s", errorStr.c_str());
return true;
}
}
StatusObject status = wait(makeInterruptable(StatusClient::statusFetcher(db)));
state std::string errorString =
"ERROR: Could not calculate the impact of this exclude on the total free space in the cluster.\n"
"Please try the exclude again in 30 seconds.\n"
"Type `exclude FORCE <ADDRESS...>' to exclude without checking free space.\n";
StatusObjectReader statusObj(status);
StatusObjectReader statusObjCluster;
if (!statusObj.get("cluster", statusObjCluster)) {
fprintf(stderr, "%s", errorString.c_str());
return true;
}
StatusObjectReader processesMap;
if (!statusObjCluster.get("processes", processesMap)) {
fprintf(stderr, "%s", errorString.c_str());
return true;
}
state int ssTotalCount = 0;
state int ssExcludedCount = 0;
state double worstFreeSpaceRatio = 1.0;
try {
for (auto proc : processesMap.obj()) {
bool storageServer = false;
StatusArray rolesArray = proc.second.get_obj()["roles"].get_array();
for (StatusObjectReader role : rolesArray) {
if (role["role"].get_str() == "storage") {
storageServer = true;
break;
}
}
// Skip non-storage servers in free space calculation
if (!storageServer)
continue;
StatusObjectReader process(proc.second);
std::string addrStr;
if (!process.get("address", addrStr)) {
fprintf(stderr, "%s", errorString.c_str());
return true;
}
NetworkAddress addr = NetworkAddress::parse(addrStr);
bool excluded =
(process.has("excluded") && process.last().get_bool()) || addressExcluded(exclusionSet, addr);
ssTotalCount++;
if (excluded)
ssExcludedCount++;
if (!excluded) {
StatusObjectReader disk;
if (!process.get("disk", disk)) {
fprintf(stderr, "%s", errorString.c_str());
return true;
}
int64_t total_bytes;
if (!disk.get("total_bytes", total_bytes)) {
fprintf(stderr, "%s", errorString.c_str());
return true;
}
int64_t free_bytes;
if (!disk.get("free_bytes", free_bytes)) {
fprintf(stderr, "%s", errorString.c_str());
return true;
}
worstFreeSpaceRatio = std::min(worstFreeSpaceRatio, double(free_bytes) / total_bytes);
}
}
} catch (...) // std::exception
{
fprintf(stderr, "%s", errorString.c_str());
return true;
}
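// Heuristic safety check: (1 - worstFreeSpaceRatio) is the highest used-space fraction among the remaining
// storage servers; scaling it by total/remaining estimates utilization after the excluded servers' data is
// redistributed. Refuse (without FORCE) if everything would be excluded or the estimate exceeds 90%.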
if (ssExcludedCount == ssTotalCount ||
(1 - worstFreeSpaceRatio) * ssTotalCount / (ssTotalCount - ssExcludedCount) > 0.9) {
fprintf(stderr,
"ERROR: This exclude may cause the total free space in the cluster to drop below 10%%.\n"
"Type `exclude FORCE <ADDRESS...>' to exclude without checking free space.\n");
return true;
}
}
if (!exclusionAddresses.empty()) {
wait(makeInterruptable(excludeServers(db, exclusionAddresses, markFailed)));
}
if (!exclusionLocalities.empty()) {
wait(makeInterruptable(excludeLocalities(db, exclusionLocalities, markFailed)));
}
if (waitForAllExcluded) {
printf("Waiting for state to be removed from all excluded servers. This may take a while.\n");
printf("(Interrupting this wait with CTRL+C will not cancel the data movement.)\n");
}
if (warn.isValid())
warn.cancel();
state std::set<NetworkAddress> notExcludedServers =
wait(makeInterruptable(checkForExcludingServers(db, exclusionVector, waitForAllExcluded)));
std::map<IPAddress, std::set<uint16_t>> workerPorts;
for (auto addr : workers)
workerPorts[addr.address.ip].insert(addr.address.port);
// Print a list of all excluded addresses that don't have a corresponding worker
std::set<AddressExclusion> absentExclusions;
for (const auto& addr : exclusionVector) {
auto worker = workerPorts.find(addr.ip);
if (worker == workerPorts.end())
absentExclusions.insert(addr);
else if (addr.port > 0 && worker->second.count(addr.port) == 0)
absentExclusions.insert(addr);
}
for (const auto& exclusion : exclusionVector) {
if (absentExclusions.find(exclusion) != absentExclusions.end()) {
if (exclusion.port == 0) {
fprintf(stderr,
" %s(Whole machine) ---- WARNING: Missing from cluster!Be sure that you excluded the "
"correct machines before removing them from the cluster!\n",
exclusion.ip.toString().c_str());
} else {
fprintf(stderr,
" %s ---- WARNING: Missing from cluster! Be sure that you excluded the correct processes "
"before removing them from the cluster!\n",
exclusion.toString().c_str());
}
} else if (std::any_of(notExcludedServers.begin(), notExcludedServers.end(), [&](const NetworkAddress& a) {
return addressExcluded({ exclusion }, a);
})) {
if (exclusion.port == 0) {
fprintf(stderr,
" %s(Whole machine) ---- WARNING: Exclusion in progress! It is not safe to remove this "
"machine from the cluster\n",
exclusion.ip.toString().c_str());
} else {
fprintf(stderr,
" %s ---- WARNING: Exclusion in progress! It is not safe to remove this process from the "
"cluster\n",
exclusion.toString().c_str());
}
} else {
if (exclusion.port == 0) {
printf(" %s(Whole machine) ---- Successfully excluded. It is now safe to remove this machine "
"from the cluster.\n",
exclusion.ip.toString().c_str());
} else {
printf(
" %s ---- Successfully excluded. It is now safe to remove this process from the cluster.\n",
exclusion.toString().c_str());
}
}
}
for (const auto& locality : noMatchLocalities) {
fprintf(
stderr,
" %s ---- WARNING: Currently no servers found with this locality match! Be sure that you excluded "
"the correct locality.\n",
locality.c_str());
}
ClusterConnectionString ccs = wait(ccf->getStoredConnectionString());
bool foundCoordinator = false;
for (const auto& c : ccs.coordinators()) {
if (std::count(exclusionVector.begin(), exclusionVector.end(), AddressExclusion(c.ip, c.port)) ||
std::count(exclusionVector.begin(), exclusionVector.end(), AddressExclusion(c.ip))) {
fprintf(stderr, "WARNING: %s is a coordinator!\n", c.toString().c_str());
foundCoordinator = true;
}
}
if (foundCoordinator)
printf("Type `help coordinators' for information on how to change the\n"
"cluster's coordination servers before removing them.\n");
return false;
}
}
ACTOR Future<bool> createSnapshot(Database db, std::vector<StringRef> tokens) {
state Standalone<StringRef> snapCmd;
state UID snapUID = deterministicRandom()->randomUniqueID();
@ -1366,6 +955,7 @@ struct CLIOptions {
bool trace = false;
std::string traceDir;
std::string traceFormat;
std::string logGroup;
int exit_timeout = 0;
Optional<std::string> exec;
bool initialStatusCheck = true;
@ -1468,6 +1058,9 @@ struct CLIOptions {
case OPT_TRACE_DIR:
traceDir = args.OptionArg();
break;
case OPT_LOGGROUP:
logGroup = args.OptionArg();
break;
case OPT_TIMEOUT: {
char* endptr;
exit_timeout = strtoul((char*)args.OptionArg(), &endptr, 10);
@ -2459,6 +2052,10 @@ int main(int argc, char** argv) {
setNetworkOption(FDBNetworkOptions::TRACE_FORMAT, StringRef(opt.traceFormat));
}
setNetworkOption(FDBNetworkOptions::ENABLE_SLOW_TASK_PROFILING);
if (!opt.logGroup.empty()) {
setNetworkOption(FDBNetworkOptions::TRACE_LOG_GROUP, StringRef(opt.logGroup));
}
}
initHelp();
View File
@ -28,7 +28,7 @@
#include "fdbclient/CoordinationInterface.h"
IPAddress determinePublicIPAutomatically(ClusterConnectionString const& ccs) {
IPAddress determinePublicIPAutomatically(ClusterConnectionString& ccs) {
try {
using namespace boost::asio;
View File
@ -1133,8 +1133,8 @@ public:
filename,
IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_CREATE,
0600));
StreamCipher::Key::RawKeyType testKey;
generateRandomData(testKey.data(), testKey.size());
StreamCipherKey testKey(AES_256_KEY_LENGTH);
testKey.initializeRandomTestKey();
keyFile->write(testKey.data(), testKey.size(), 0);
wait(keyFile->sync());
return Void();
@ -1142,7 +1142,7 @@ public:
ACTOR static Future<Void> readEncryptionKey(std::string encryptionKeyFileName) {
state Reference<IAsyncFile> keyFile;
state StreamCipher::Key::RawKeyType key;
state StreamCipherKey const* cipherKey = StreamCipherKey::getGlobalCipherKey();
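// The key file contents are read directly into the process-global cipher key buffer.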
try {
Reference<IAsyncFile> _keyFile =
wait(IAsyncFileSystem::filesystem()->open(encryptionKeyFileName, 0x0, 0400));
@ -1153,15 +1153,14 @@ public:
.error(e);
throw e;
}
int bytesRead = wait(keyFile->read(key.data(), key.size(), 0));
if (bytesRead != key.size()) {
int bytesRead = wait(keyFile->read(cipherKey->data(), cipherKey->size(), 0));
if (bytesRead != cipherKey->size()) {
TraceEvent(SevWarnAlways, "InvalidEncryptionKeyFileSize")
.detail("ExpectedSize", key.size())
.detail("ExpectedSize", cipherKey->size())
.detail("ActualSize", bytesRead);
throw invalid_encryption_key_file();
}
ASSERT_EQ(bytesRead, key.size());
StreamCipher::Key::initializeKey(std::move(key));
ASSERT_EQ(bytesRead, cipherKey->size());
return Void();
}
#endif // ENCRYPTION_ENABLED
View File
@ -47,6 +47,7 @@ set(FDBCLIENT_SRCS
ConfigKnobs.h
ConfigTransactionInterface.cpp
ConfigTransactionInterface.h
ConvertUTF.h
CoordinationInterface.h
DatabaseBackupAgent.actor.cpp
DatabaseConfiguration.cpp
@ -58,10 +59,12 @@ set(FDBCLIENT_SRCS
FDBTypes.h
FluentDSampleIngestor.cpp
FileBackupAgent.actor.cpp
GenericManagementAPI.actor.h
GlobalConfig.h
GlobalConfig.actor.h
GlobalConfig.actor.cpp
GrvProxyInterface.h
HighContentionPrefixAllocator.actor.h
HTTP.actor.cpp
IClientApi.h
IConfigTransaction.cpp
@ -83,8 +86,6 @@ set(FDBCLIENT_SRCS
MonitorLeader.actor.cpp
MonitorLeader.h
MultiVersionAssignmentVars.h
ClientLibManagement.actor.cpp
ClientLibManagement.actor.h
MultiVersionTransaction.actor.cpp
MultiVersionTransaction.h
MutationList.h
@ -118,6 +119,7 @@ set(FDBCLIENT_SRCS
ServerKnobs.cpp
ServerKnobs.h
SimpleConfigTransaction.h
SimpleIni.h
SnapshotCache.h
SpecialKeySpace.actor.cpp
SpecialKeySpace.actor.h
View File
@ -67,7 +67,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( RESOURCE_CONSTRAINED_MAX_BACKOFF, 30.0 );
init( PROXY_COMMIT_OVERHEAD_BYTES, 23 ); //The size of serializing 7 tags (3 primary, 3 remote, 1 log router) + 2 for the tag length
init( SHARD_STAT_SMOOTH_AMOUNT, 5.0 );
init( INIT_MID_SHARD_BYTES, 200000 ); if( randomize && BUGGIFY ) INIT_MID_SHARD_BYTES = 40000; // The same value as SERVER_KNOBS->MIN_SHARD_BYTES
init( INIT_MID_SHARD_BYTES, 50000000 ); if( randomize && BUGGIFY ) INIT_MID_SHARD_BYTES = 40000; else if(randomize && !BUGGIFY) INIT_MID_SHARD_BYTES = 200000; // The same value as SERVER_KNOBS->MIN_SHARD_BYTES
init( TRANSACTION_SIZE_LIMIT, 1e7 );
init( KEY_SIZE_LIMIT, 1e4 );
@ -193,6 +193,8 @@ void ClientKnobs::initialize(Randomize randomize) {
init( HTTP_SEND_SIZE, 32*1024 );
init( HTTP_VERBOSE_LEVEL, 0 );
init( HTTP_REQUEST_ID_HEADER, "" );
init( HTTP_REQUEST_AWS_V4_HEADER, true );
init( BLOBSTORE_ENCRYPTION_TYPE, "" );
init( BLOBSTORE_CONNECT_TRIES, 10 );
init( BLOBSTORE_CONNECT_TIMEOUT, 10 );
init( BLOBSTORE_MAX_CONNECTION_LIFE, 120 );
@ -221,6 +223,12 @@ void ClientKnobs::initialize(Randomize randomize) {
init( BLOBSTORE_READ_REQUESTS_PER_SECOND, 100 );
init( BLOBSTORE_DELETE_REQUESTS_PER_SECOND, 200 );
// Dynamic Knobs
init( COMMIT_QUORUM_TIMEOUT, 3.0 );
init( GET_GENERATION_QUORUM_TIMEOUT, 3.0 );
init( GET_KNOB_TIMEOUT, 3.0 );
init( TIMEOUT_RETRY_UPPER_BOUND, 20.0 );
// Client Status Info
init(CSI_SAMPLING_PROBABILITY, -1.0);
init(CSI_SIZE_LIMIT, std::numeric_limits<int64_t>::max());
@ -255,10 +263,6 @@ void ClientKnobs::initialize(Randomize randomize) {
init( BUSYNESS_SPIKE_START_THRESHOLD, 0.100 );
init( BUSYNESS_SPIKE_SATURATED_THRESHOLD, 0.500 );
// multi-version client control
init( MVC_CLIENTLIB_CHUNK_SIZE, 8*1024 );
init( MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION, 32 );
// blob granules
init( ENABLE_BLOB_GRANULES, false );
View File
@ -189,6 +189,12 @@ public:
int32_t DEFAULT_AUTO_RESOLVERS;
int32_t DEFAULT_AUTO_LOGS;
// Dynamic Knobs
double COMMIT_QUORUM_TIMEOUT;
double GET_GENERATION_QUORUM_TIMEOUT;
double GET_KNOB_TIMEOUT;
double TIMEOUT_RETRY_UPPER_BOUND;
// Client Status Info
double CSI_SAMPLING_PROBABILITY;
int64_t CSI_SIZE_LIMIT;
@ -198,6 +204,8 @@ public:
int HTTP_READ_SIZE;
int HTTP_VERBOSE_LEVEL;
std::string HTTP_REQUEST_ID_HEADER;
bool HTTP_REQUEST_AWS_V4_HEADER; // setting this knob to true will enable AWS V4 style header.
std::string BLOBSTORE_ENCRYPTION_TYPE;
int BLOBSTORE_CONNECT_TRIES;
int BLOBSTORE_CONNECT_TIMEOUT;
int BLOBSTORE_MAX_CONNECTION_LIFE;
@ -246,10 +254,6 @@ public:
double BUSYNESS_SPIKE_START_THRESHOLD;
double BUSYNESS_SPIKE_SATURATED_THRESHOLD;
// multi-version client control
int MVC_CLIENTLIB_CHUNK_SIZE;
int MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION;
// blob granules
bool ENABLE_BLOB_GRANULES;
View File
@ -1,801 +0,0 @@
/*
* ClientLibManagement.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/ClientLibManagement.actor.h"
#include "fdbclient/Schemas.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/ClientKnobs.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/versions.h"
#include "fdbrpc/IAsyncFile.h"
#include "flow/Platform.h"
#include <algorithm>
#include <string>
#include <stdio.h>
#include "flow/Trace.h"
#include "flow/actorcompiler.h" // This must be the last #include.
namespace ClientLibManagement {
struct ClientLibBinaryInfo {
size_t totalBytes = 0;
size_t chunkCnt = 0;
size_t chunkSize = 0;
Standalone<StringRef> sumBytes;
};
#define ASSERT_INDEX_IN_RANGE(idx, arr) ASSERT(idx >= 0 && idx < sizeof(arr) / sizeof(arr[0]))
const std::string& getStatusName(ClientLibStatus status) {
static const std::string statusNames[] = { "disabled", "uploading", "download", "active" };
int idx = static_cast<int>(status);
ASSERT_INDEX_IN_RANGE(idx, statusNames);
return statusNames[idx];
}
ClientLibStatus getStatusByName(std::string_view statusName) {
static std::map<std::string_view, ClientLibStatus> statusByName;
// initialize the map on demand
if (statusByName.empty()) {
for (int i = 0; i < static_cast<int>(ClientLibStatus::COUNT); i++) {
ClientLibStatus status = static_cast<ClientLibStatus>(i);
statusByName[getStatusName(status)] = status;
}
}
auto statusIter = statusByName.find(statusName);
if (statusIter == statusByName.cend()) {
TraceEvent(SevWarnAlways, "ClientLibraryInvalidMetadata")
.detail("Error", format("Unknown status value %s", std::string(statusName).c_str()));
throw client_lib_invalid_metadata();
}
return statusIter->second;
}
const std::string& getPlatformName(ClientLibPlatform platform) {
static const std::string platformNames[] = { "unknown", "x86_64-linux", "x86_64-windows", "x86_64-macos" };
int idx = static_cast<int>(platform);
ASSERT_INDEX_IN_RANGE(idx, platformNames);
return platformNames[idx];
}
ClientLibPlatform getPlatformByName(std::string_view platformName) {
static std::map<std::string_view, ClientLibPlatform> platformByName;
// initialize the map on demand
if (platformByName.empty()) {
for (int i = 0; i < static_cast<int>(ClientLibPlatform::COUNT); i++) {
ClientLibPlatform platform = static_cast<ClientLibPlatform>(i);
platformByName[getPlatformName(platform)] = platform;
}
}
auto platfIter = platformByName.find(platformName);
if (platfIter == platformByName.cend()) {
TraceEvent(SevWarnAlways, "ClientLibraryInvalidMetadata")
.detail("Error", format("Unknown platform value %s", std::string(platformName).c_str()));
throw client_lib_invalid_metadata();
}
return platfIter->second;
}
const std::string& getChecksumAlgName(ClientLibChecksumAlg checksumAlg) {
static const std::string checksumAlgNames[] = { "md5" };
int idx = static_cast<int>(checksumAlg);
ASSERT_INDEX_IN_RANGE(idx, checksumAlgNames);
return checksumAlgNames[idx];
}
ClientLibChecksumAlg getChecksumAlgByName(std::string_view checksumAlgName) {
static std::map<std::string_view, ClientLibChecksumAlg> checksumAlgByName;
// initialize the map on demand
if (checksumAlgByName.empty()) {
for (int i = 0; i < (int)ClientLibChecksumAlg::COUNT; i++) {
ClientLibChecksumAlg checksumAlg = static_cast<ClientLibChecksumAlg>(i);
checksumAlgByName[getChecksumAlgName(checksumAlg)] = checksumAlg;
}
}
auto iter = checksumAlgByName.find(checksumAlgName);
if (iter == checksumAlgByName.cend()) {
TraceEvent(SevWarnAlways, "ClientLibraryInvalidMetadata")
.detail("Error", format("Unknown checksum algorithm %s", std::string(checksumAlgName).c_str()));
throw client_lib_invalid_metadata();
}
return iter->second;
}
namespace {
bool isValidTargetStatus(ClientLibStatus status) {
return status == ClientLibStatus::DISABLED || status == ClientLibStatus::DOWNLOAD ||
status == ClientLibStatus::ACTIVE;
}
bool isAvailableForDownload(ClientLibStatus status) {
return status == ClientLibStatus::DOWNLOAD || status == ClientLibStatus::ACTIVE;
}
void updateClientLibChangeCounter(Transaction& tr, ClientLibStatus prevStatus, ClientLibStatus newStatus) {
static const int64_t counterIncVal = 1;
if ((prevStatus != newStatus) &&
(newStatus == ClientLibStatus::DOWNLOAD || newStatus == ClientLibStatus::ACTIVE ||
prevStatus == ClientLibStatus::DOWNLOAD || prevStatus == ClientLibStatus::ACTIVE)) {
tr.atomicOp(clientLibChangeCounterKey,
StringRef(reinterpret_cast<const uint8_t*>(&counterIncVal), sizeof(counterIncVal)),
MutationRef::AddValue);
}
}
json_spirit::mObject parseMetadataJson(StringRef metadataString) {
json_spirit::mValue parsedMetadata;
if (!json_spirit::read_string(metadataString.toString(), parsedMetadata) ||
parsedMetadata.type() != json_spirit::obj_type) {
TraceEvent(SevWarnAlways, "ClientLibraryInvalidMetadata")
.detail("Reason", "InvalidJSON")
.detail("Configuration", metadataString);
throw client_lib_invalid_metadata();
}
return parsedMetadata.get_obj();
}
const std::string& getMetadataStrAttr(const json_spirit::mObject& metadataJson, const std::string& attrName) {
auto attrIter = metadataJson.find(attrName);
if (attrIter == metadataJson.cend() || attrIter->second.type() != json_spirit::str_type) {
TraceEvent(SevWarnAlways, "ClientLibraryInvalidMetadata")
.detail("Error", format("Missing attribute %s", attrName.c_str()));
throw client_lib_invalid_metadata();
}
return attrIter->second.get_str();
}
int getMetadataIntAttr(const json_spirit::mObject& metadataJson, const std::string& attrName) {
auto attrIter = metadataJson.find(attrName);
if (attrIter == metadataJson.cend() || attrIter->second.type() != json_spirit::int_type) {
TraceEvent(SevWarnAlways, "ClientLibraryInvalidMetadata")
.detail("Error", format("Missing attribute %s", attrName.c_str()));
throw client_lib_invalid_metadata();
}
return attrIter->second.get_int();
}
bool validVersionPartNum(int num) {
return (num >= 0 && num < 1000);
}
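// Encodes "major.minor.patch" as major*10^6 + minor*10^3 + patch; each component must be in [0, 1000).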
int getNumericVersionEncoding(const std::string& versionStr) {
int major, minor, patch;
int charsScanned;
int numScanned = sscanf(versionStr.c_str(), "%d.%d.%d%n", &major, &minor, &patch, &charsScanned);
if (numScanned != 3 || !validVersionPartNum(major) || !validVersionPartNum(minor) || !validVersionPartNum(patch) ||
charsScanned != versionStr.size()) {
TraceEvent(SevWarnAlways, "ClientLibraryInvalidMetadata")
.detail("Error", format("Invalid version string %s", versionStr.c_str()));
throw client_lib_invalid_metadata();
}
return ((major * 1000) + minor) * 1000 + patch;
}
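// A client library id has the form <platform>/<zero-padded numeric version>/<type>/<checksum>.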
Standalone<StringRef> getIdFromMetadataJson(const json_spirit::mObject& metadataJson) {
std::ostringstream libIdBuilder;
libIdBuilder << getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_PLATFORM) << "/";
libIdBuilder << format("%09d", getNumericVersionEncoding(getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_VERSION)))
<< "/";
libIdBuilder << getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_TYPE) << "/";
libIdBuilder << getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_CHECKSUM);
return Standalone<StringRef>(libIdBuilder.str());
}
Key metadataKeyFromId(StringRef clientLibId) {
return clientLibId.withPrefix(clientLibMetadataPrefix);
}
Key chunkKeyPrefixFromId(StringRef clientLibId) {
return clientLibId.withPrefix(clientLibBinaryPrefix).withSuffix(LiteralStringRef("/"));
}
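// Chunk numbers are zero-padded to six digits so that lexicographic key order matches chunk order.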
KeyRef chunkKeyFromNo(StringRef clientLibBinPrefix, size_t chunkNo, Arena& arena) {
return clientLibBinPrefix.withSuffix(format("%06zu", chunkNo), arena);
}
[[maybe_unused]] ClientLibPlatform getCurrentClientPlatform() {
#ifdef __x86_64__
#if defined(_WIN32)
return ClientLibPlatform::X86_64_WINDOWS;
#elif defined(__linux__)
return ClientLibPlatform::X86_64_LINUX;
#elif defined(__FreeBSD__) || defined(__APPLE__)
return ClientLibPlatform::X86_64_MACOS;
#else
return ClientLibPlatform::UNKNOWN;
#endif
#else // not __x86_64__
return ClientLibPlatform::UNKNOWN;
#endif
}
Standalone<StringRef> byteArrayToHexString(StringRef input) {
static const char* digits = "0123456789abcdef";
Standalone<StringRef> output = makeString(input.size() * 2);
char* pout = reinterpret_cast<char*>(mutateString(output));
for (const uint8_t* pin = input.begin(); pin != input.end(); ++pin) {
*pout++ = digits[(*pin >> 4) & 0xF];
*pout++ = digits[(*pin) & 0xF];
}
return output;
}
} // namespace
Standalone<StringRef> md5SumToHexString(MD5_CTX& sum) {
Standalone<StringRef> sumBytes = makeString(16);
::MD5_Final(mutateString(sumBytes), &sum);
return byteArrayToHexString(sumBytes);
}
ClientLibFilter& ClientLibFilter::filterNewerPackageVersion(const std::string& versionStr) {
matchNewerPackageVersion = true;
this->numericPkgVersion = getNumericVersionEncoding(versionStr);
return *this;
}
Standalone<StringRef> getClientLibIdFromMetadataJson(StringRef metadataString) {
json_spirit::mObject parsedMetadata = parseMetadataJson(metadataString);
return getIdFromMetadataJson(parsedMetadata);
}
namespace {
ACTOR Future<Void> uploadClientLibBinary(Database db,
StringRef libFilePath,
KeyRef chunkKeyPrefix,
ClientLibBinaryInfo* binInfo) {
state int chunkSize = getAlignedUpperBound(CLIENT_KNOBS->MVC_CLIENTLIB_CHUNK_SIZE, 1024);
state int transactionSize = std::max(CLIENT_KNOBS->MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION, 1) * chunkSize;
state size_t fileOffset = 0;
state size_t chunkNo = 0;
state MD5_CTX sum;
state Arena arena;
state StringRef buf;
state Transaction tr;
state size_t firstChunkNo;
// Disabling AIO, because it currently supports only page-aligned writes, but the size of a client library
// is not necessarily page-aligned; need to investigate whether this is a limitation of AIO or just of the way
// we are wrapping it
state Reference<IAsyncFile> fClientLib = wait(IAsyncFileSystem::filesystem()->open(
libFilePath.toString(), IAsyncFile::OPEN_READONLY | IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_NO_AIO, 0));
::MD5_Init(&sum);
loop {
arena = Arena();
// Use page-aligned buffers for enabling possible future use with AIO
buf = makeAlignedString(_PAGE_SIZE, transactionSize, arena);
state int bytesRead = wait(fClientLib->read(mutateString(buf), transactionSize, fileOffset));
fileOffset += bytesRead;
if (bytesRead <= 0) {
break;
}
::MD5_Update(&sum, buf.begin(), bytesRead);
tr = Transaction(db);
firstChunkNo = chunkNo;
loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
int bufferOffset = 0;
chunkNo = firstChunkNo;
while (bufferOffset < bytesRead) {
size_t chunkLen = std::min(chunkSize, bytesRead - bufferOffset);
KeyRef chunkKey = chunkKeyFromNo(chunkKeyPrefix, chunkNo, arena);
chunkNo++;
tr.set(chunkKey, ValueRef(mutateString(buf) + bufferOffset, chunkLen));
bufferOffset += chunkLen;
}
wait(tr.commit());
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
if (bytesRead < transactionSize) {
break;
}
}
binInfo->totalBytes = fileOffset;
binInfo->chunkCnt = chunkNo;
binInfo->chunkSize = chunkSize;
binInfo->sumBytes = md5SumToHexString(sum);
return Void();
}
} // namespace
ACTOR Future<Void> uploadClientLibrary(Database db,
Standalone<StringRef> metadataString,
Standalone<StringRef> libFilePath) {
state json_spirit::mObject metadataJson;
state Standalone<StringRef> clientLibId;
state Key clientLibMetaKey;
state Key clientLibBinPrefix;
state std::string jsStr;
state Transaction tr;
state ClientLibBinaryInfo binInfo;
state ClientLibStatus targetStatus;
metadataJson = parseMetadataJson(metadataString);
json_spirit::mValue schema;
if (!json_spirit::read_string(JSONSchemas::clientLibMetadataSchema.toString(), schema)) {
ASSERT(false);
}
std::string errorStr;
if (!schemaMatch(schema.get_obj(), metadataJson, errorStr, SevWarnAlways)) {
TraceEvent(SevWarnAlways, "ClientLibraryInvalidMetadata")
.detail("Reason", "SchemaMismatch")
.detail("Configuration", metadataString)
.detail("Error", errorStr);
throw client_lib_invalid_metadata();
}
clientLibId = getIdFromMetadataJson(metadataJson);
clientLibMetaKey = metadataKeyFromId(clientLibId);
clientLibBinPrefix = chunkKeyPrefixFromId(clientLibId);
targetStatus = getStatusByName(getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_STATUS));
if (!isValidTargetStatus(targetStatus)) {
TraceEvent(SevWarnAlways, "ClientLibraryInvalidMetadata")
.detail("Reason", "InvalidTargetStatus")
.detail("Configuration", metadataString);
throw client_lib_invalid_metadata();
}
// check if checksumalg and platform attributes have valid values
getChecksumAlgByName(getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_CHECKSUM_ALG));
getPlatformByName(getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_PLATFORM));
// Check if further mandatory attributes are set
getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_GIT_HASH);
getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_PROTOCOL);
getMetadataIntAttr(metadataJson, CLIENTLIB_ATTR_API_VERSION);
metadataJson[CLIENTLIB_ATTR_STATUS] = getStatusName(ClientLibStatus::UPLOADING);
jsStr = json_spirit::write_string(json_spirit::mValue(metadataJson));
/*
* Check if the client library with the same identifier already exists.
* If not, write its metadata with "uploading" state to prevent concurrent uploads
*/
tr = Transaction(db);
loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
Optional<Value> existingMeta = wait(tr.get(clientLibMetaKey));
if (existingMeta.present()) {
TraceEvent(SevWarnAlways, "ClientLibraryAlreadyExists")
.detail("Key", clientLibMetaKey)
.detail("ExistingMetadata", existingMeta.get().toString());
throw client_lib_already_exists();
}
TraceEvent("ClientLibraryBeginUpload").detail("Key", clientLibMetaKey);
tr.set(clientLibMetaKey, ValueRef(jsStr));
wait(tr.commit());
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
/*
* Upload the binary of the client library in chunks
*/
wait(uploadClientLibBinary(db, libFilePath, clientLibBinPrefix, &binInfo));
std::string checkSum = getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_CHECKSUM);
if (binInfo.sumBytes != StringRef(checkSum)) {
TraceEvent(SevWarnAlways, "ClientLibraryChecksumMismatch")
.detail("Expected", checkSum)
.detail("Actual", binInfo.sumBytes)
.detail("Configuration", metadataString);
// Rollback the upload operation
try {
wait(deleteClientLibrary(db, clientLibId));
} catch (Error& e) {
TraceEvent(SevError, "ClientLibraryUploadRollbackFailed").error(e);
}
throw client_lib_invalid_binary();
}
/*
* Update the metadata entry, with additional information about the binary
* and change its state from "uploading" to the given one
*/
metadataJson[CLIENTLIB_ATTR_SIZE] = static_cast<int64_t>(binInfo.totalBytes);
metadataJson[CLIENTLIB_ATTR_CHUNK_COUNT] = static_cast<int64_t>(binInfo.chunkCnt);
metadataJson[CLIENTLIB_ATTR_CHUNK_SIZE] = static_cast<int64_t>(binInfo.chunkSize);
metadataJson[CLIENTLIB_ATTR_FILENAME] = basename(libFilePath.toString());
metadataJson[CLIENTLIB_ATTR_STATUS] = getStatusName(targetStatus);
jsStr = json_spirit::write_string(json_spirit::mValue(metadataJson));
tr.reset();
loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.set(clientLibMetaKey, ValueRef(jsStr));
updateClientLibChangeCounter(tr, ClientLibStatus::DISABLED, targetStatus);
wait(tr.commit());
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
TraceEvent("ClientLibraryUploadDone").detail("Key", clientLibMetaKey);
return Void();
}
ACTOR Future<Void> downloadClientLibrary(Database db,
Standalone<StringRef> clientLibId,
Standalone<StringRef> libFilePath) {
state Key clientLibMetaKey = metadataKeyFromId(clientLibId);
state Key chunkKeyPrefix = chunkKeyPrefixFromId(clientLibId);
state int chunksPerTransaction = std::max(CLIENT_KNOBS->MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION, 1);
state int transactionSize;
state json_spirit::mObject metadataJson;
state std::string checkSum;
state size_t chunkCount;
state size_t binarySize;
state size_t expectedChunkSize;
state Transaction tr;
state size_t fileOffset;
state MD5_CTX sum;
state Arena arena;
state StringRef buf;
state size_t bufferOffset;
state size_t fromChunkNo;
state size_t toChunkNo;
state std::vector<Future<Optional<Value>>> chunkFutures;
TraceEvent("ClientLibraryBeginDownload").detail("Key", clientLibMetaKey);
/*
* First read the metadata to get information about the status and
* the chunk count of the client library
*/
loop {
tr = Transaction(db);
try {
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
Optional<Value> metadataOpt = wait(tr.get(clientLibMetaKey));
if (!metadataOpt.present()) {
TraceEvent(SevWarnAlways, "ClientLibraryNotFound").detail("Key", clientLibMetaKey);
throw client_lib_not_found();
}
metadataJson = parseMetadataJson(metadataOpt.get());
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
// Prevent downloading not yet uploaded and disabled libraries
if (!isAvailableForDownload(getStatusByName(getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_STATUS)))) {
throw client_lib_not_available();
}
// Disabling AIO, because it currently supports only page-aligned writes, but the size of a client library
// is not necessarily page-aligned; need to investigate whether this is a limitation of AIO or just of the way
// we are wrapping it
int64_t flags = IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_CREATE |
IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_NO_AIO;
state Reference<IAsyncFile> fClientLib =
wait(IAsyncFileSystem::filesystem()->open(libFilePath.toString(), flags, 0666));
checkSum = getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_CHECKSUM);
chunkCount = getMetadataIntAttr(metadataJson, CLIENTLIB_ATTR_CHUNK_COUNT);
binarySize = getMetadataIntAttr(metadataJson, CLIENTLIB_ATTR_SIZE);
expectedChunkSize = getMetadataIntAttr(metadataJson, CLIENTLIB_ATTR_CHUNK_SIZE);
transactionSize = chunksPerTransaction * expectedChunkSize;
fileOffset = 0;
fromChunkNo = 0;
::MD5_Init(&sum);
arena = Arena();
// Use page-aligned buffers for enabling possible future use with AIO
buf = makeAlignedString(_PAGE_SIZE, transactionSize, arena);
loop {
if (fromChunkNo == chunkCount) {
break;
}
tr = Transaction(db);
toChunkNo = std::min(chunkCount, fromChunkNo + chunksPerTransaction);
// read a batch of file chunks concurrently
loop {
try {
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
chunkFutures.clear();
for (size_t chunkNo = fromChunkNo; chunkNo < toChunkNo; chunkNo++) {
KeyRef chunkKey = chunkKeyFromNo(chunkKeyPrefix, chunkNo, arena);
chunkFutures.push_back(tr.get(chunkKey));
}
wait(waitForAll(chunkFutures));
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
// check the read chunks and copy them to a buffer
bufferOffset = 0;
size_t chunkNo = fromChunkNo;
for (auto chunkOptFuture : chunkFutures) {
if (!chunkOptFuture.get().present()) {
TraceEvent(SevWarnAlways, "ClientLibraryChunkNotFound")
.detail("Key", chunkKeyFromNo(chunkKeyPrefix, chunkNo, arena));
throw client_lib_invalid_binary();
}
StringRef chunkVal = chunkOptFuture.get().get();
// All chunks except for the last one must be of the expected size to guarantee
// alignment when writing to file
if ((chunkNo != (chunkCount - 1) && chunkVal.size() != expectedChunkSize) ||
chunkVal.size() > expectedChunkSize) {
TraceEvent(SevWarnAlways, "ClientLibraryInvalidChunkSize")
.detail("Key", chunkKeyFromNo(chunkKeyPrefix, chunkNo, arena))
.detail("MaxSize", expectedChunkSize)
.detail("ActualSize", chunkVal.size());
throw client_lib_invalid_binary();
}
memcpy(mutateString(buf) + bufferOffset, chunkVal.begin(), chunkVal.size());
bufferOffset += chunkVal.size();
chunkNo++;
}
// write the chunks to the file, update checksum
if (bufferOffset > 0) {
wait(fClientLib->write(buf.begin(), bufferOffset, fileOffset));
fileOffset += bufferOffset;
::MD5_Update(&sum, buf.begin(), bufferOffset);
}
// move to the next batch
fromChunkNo = toChunkNo;
}
// check if the downloaded file size is as expected
if (fileOffset != binarySize) {
TraceEvent(SevWarnAlways, "ClientLibraryInvalidSize")
.detail("ExpectedSize", binarySize)
.detail("ActualSize", fileOffset);
throw client_lib_invalid_binary();
}
// check if the checksum of downloaded file is as expected
Standalone<StringRef> sumBytesStr = md5SumToHexString(sum);
if (sumBytesStr != StringRef(checkSum)) {
TraceEvent(SevWarnAlways, "ClientLibraryChecksumMismatch")
.detail("Expected", checkSum)
.detail("Actual", sumBytesStr)
.detail("Key", clientLibMetaKey);
throw client_lib_invalid_binary();
}
wait(fClientLib->sync());
TraceEvent("ClientLibraryDownloadDone").detail("Key", clientLibMetaKey);
return Void();
}
ACTOR Future<Void> deleteClientLibrary(Database db, Standalone<StringRef> clientLibId) {
state Key clientLibMetaKey = metadataKeyFromId(clientLibId.toString());
state Key chunkKeyPrefix = chunkKeyPrefixFromId(clientLibId.toString());
TraceEvent("ClientLibraryBeginDelete").detail("Key", clientLibMetaKey);
loop {
state Transaction tr(db);
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
Optional<Value> metadataOpt = wait(tr.get(clientLibMetaKey));
if (!metadataOpt.present()) {
TraceEvent(SevWarnAlways, "ClientLibraryNotFound").detail("Key", clientLibMetaKey);
throw client_lib_not_found();
}
json_spirit::mObject metadataJson = parseMetadataJson(metadataOpt.get());
ClientLibStatus status = getStatusByName(getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_STATUS));
tr.clear(prefixRange(chunkKeyPrefix));
tr.clear(clientLibMetaKey);
updateClientLibChangeCounter(tr, status, ClientLibStatus::DISABLED);
wait(tr.commit());
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
TraceEvent("ClientLibraryDeleteDone").detail("Key", clientLibMetaKey);
return Void();
}
namespace {
void applyClientLibFilter(const ClientLibFilter& filter,
const RangeResultRef& scanResults,
Standalone<VectorRef<StringRef>>& filteredResults) {
for (const auto& [k, v] : scanResults) {
try {
json_spirit::mObject metadataJson = parseMetadataJson(v);
if (filter.matchAvailableOnly &&
!isAvailableForDownload(getStatusByName(getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_STATUS)))) {
continue;
}
if (filter.matchCompatibleAPI &&
getMetadataIntAttr(metadataJson, CLIENTLIB_ATTR_API_VERSION) < filter.apiVersion) {
continue;
}
if (filter.matchNewerPackageVersion && !filter.matchPlatform &&
getNumericVersionEncoding(getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_VERSION)) <=
filter.numericPkgVersion) {
continue;
}
filteredResults.push_back_deep(filteredResults.arena(), v);
} catch (Error& e) {
// Entries with invalid metadata on the cluster
// Can happen only if the official management interface is bypassed
ASSERT(e.code() == error_code_client_lib_invalid_metadata);
TraceEvent(SevError, "ClientLibraryIgnoringInvalidMetadata").detail("Metadata", v);
}
}
}
} // namespace
ACTOR Future<Standalone<VectorRef<StringRef>>> listClientLibraries(Database db, ClientLibFilter filter) {
state Standalone<VectorRef<StringRef>> result;
state Transaction tr(db);
state PromiseStream<Standalone<RangeResultRef>> scanResults;
state Key fromKey;
state Key toKey;
state KeyRangeRef scanRange;
state Future<Void> stream;
loop {
try {
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
if (filter.matchPlatform) {
Key prefixWithPlatform =
clientLibMetadataPrefix.withSuffix(std::string(getPlatformName(filter.platformVal)));
fromKey = prefixWithPlatform.withSuffix(LiteralStringRef("/"));
if (filter.matchNewerPackageVersion) {
fromKey = fromKey.withSuffix(format("%09d", filter.numericPkgVersion + 1));
}
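// '0' is the character immediately after '/', so [prefix + "/", prefix + "0") covers exactly the
// keys under this platform's prefix.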
toKey = prefixWithPlatform.withSuffix(LiteralStringRef("0"));
scanRange = KeyRangeRef(fromKey, toKey);
} else {
scanRange = clientLibMetadataKeys;
}
scanResults = PromiseStream<Standalone<RangeResultRef>>();
stream = tr.getRangeStream(scanResults, scanRange, GetRangeLimits());
loop {
Standalone<RangeResultRef> scanResultRange = waitNext(scanResults.getFuture());
applyClientLibFilter(filter, scanResultRange, result);
}
} catch (Error& e) {
if (e.code() == error_code_end_of_stream) {
break;
}
wait(tr.onError(e));
}
}
return result;
}
ACTOR Future<ClientLibStatus> getClientLibraryStatus(Database db, Standalone<StringRef> clientLibId) {
state Key clientLibMetaKey = metadataKeyFromId(clientLibId);
state Transaction tr(db);
loop {
try {
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
Optional<Value> metadataOpt = wait(tr.get(clientLibMetaKey));
if (!metadataOpt.present()) {
TraceEvent(SevWarnAlways, "ClientLibraryNotFound").detail("Key", clientLibMetaKey);
throw client_lib_not_found();
}
json_spirit::mObject metadataJson = parseMetadataJson(metadataOpt.get());
return getStatusByName(getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_STATUS));
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR Future<Void> changeClientLibraryStatus(Database db,
Standalone<StringRef> clientLibId,
ClientLibStatus newStatus) {
state Key clientLibMetaKey = metadataKeyFromId(clientLibId);
state json_spirit::mObject metadataJson;
state std::string jsStr;
state Transaction tr;
if (!isValidTargetStatus(newStatus)) {
TraceEvent(SevWarnAlways, "ClientLibraryInvalidMetadata")
.detail("Reason", "InvalidTargetStatus")
.detail("Status", getStatusName(newStatus));
throw client_lib_invalid_metadata();
}
loop {
tr = Transaction(db);
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
Optional<Value> metadataOpt = wait(tr.get(clientLibMetaKey));
if (!metadataOpt.present()) {
TraceEvent(SevWarnAlways, "ClientLibraryNotFound").detail("Key", clientLibMetaKey);
throw client_lib_not_found();
}
metadataJson = parseMetadataJson(metadataOpt.get());
ClientLibStatus prevStatus = getStatusByName(getMetadataStrAttr(metadataJson, CLIENTLIB_ATTR_STATUS));
if (prevStatus == newStatus) {
return Void();
}
metadataJson[CLIENTLIB_ATTR_STATUS] = getStatusName(newStatus);
jsStr = json_spirit::write_string(json_spirit::mValue(metadataJson));
tr.set(clientLibMetaKey, ValueRef(jsStr));
updateClientLibChangeCounter(tr, prevStatus, newStatus);
wait(tr.commit());
break;
} catch (Error& e) {
if (e.code() == error_code_client_lib_not_found) {
throw;
}
wait(tr.onError(e));
}
}
TraceEvent("ClientLibraryStatusChanged").detail("Key", clientLibMetaKey).detail("Status", getStatusName(newStatus));
return Void();
}
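
A hedged sketch of the status flow (the helper name and inputs are assumptions): read the current status and disable a library only when it is currently ACTIVE.

// Sketch only; `db` and `clientLibId` are assumed inputs.
ACTOR Future<Void> disableIfActive(Database db, Standalone<StringRef> clientLibId) {
    state ClientLibStatus status = wait(getClientLibraryStatus(db, clientLibId));
    if (status == ClientLibStatus::ACTIVE) {
        wait(changeClientLibraryStatus(db, clientLibId, ClientLibStatus::DISABLED));
    }
    return Void();
}
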
} // namespace ClientLibManagement

View File

@ -1,146 +0,0 @@
/*
* ClientLibManagement.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_MULTI_VERSION_CLIENT_CONTROL_ACTOR_G_H)
#define FDBCLIENT_MULTI_VERSION_CLIENT_CONTROL_ACTOR_G_H
#include "fdbclient/ClientLibManagement.actor.g.h"
#elif !defined(FDBCLIENT_MULTI_VERSION_CLIENT_CONTROL_ACTOR_H)
#define FDBCLIENT_MULTI_VERSION_CLIENT_CONTROL_ACTOR_H
#include <string>
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/md5/md5.h"
#include "flow/actorcompiler.h" // has to be last include
namespace ClientLibManagement {
enum class ClientLibStatus {
DISABLED = 0,
UPLOADING, // 1
DOWNLOAD, // 2
ACTIVE, // 3
COUNT // must be the last one
};
enum class ClientLibPlatform {
UNKNOWN = 0,
X86_64_LINUX,
X86_64_WINDOWS,
X86_64_MACOS,
COUNT // must be the last one
};
// Currently we support only one checksum algorithm,
// but we may want to add more in the future
enum class ClientLibChecksumAlg {
MD5 = 0,
COUNT // must be the last one
};
inline const std::string CLIENTLIB_ATTR_PLATFORM{ "platform" };
inline const std::string CLIENTLIB_ATTR_STATUS{ "status" };
inline const std::string CLIENTLIB_ATTR_CHECKSUM{ "checksum" };
inline const std::string CLIENTLIB_ATTR_VERSION{ "version" };
inline const std::string CLIENTLIB_ATTR_TYPE{ "type" };
inline const std::string CLIENTLIB_ATTR_API_VERSION{ "apiversion" };
inline const std::string CLIENTLIB_ATTR_PROTOCOL{ "protocol" };
inline const std::string CLIENTLIB_ATTR_GIT_HASH{ "githash" };
inline const std::string CLIENTLIB_ATTR_FILENAME{ "filename" };
inline const std::string CLIENTLIB_ATTR_SIZE{ "size" };
inline const std::string CLIENTLIB_ATTR_CHUNK_COUNT{ "chunkcount" };
inline const std::string CLIENTLIB_ATTR_CHUNK_SIZE{ "chunksize" };
inline const std::string CLIENTLIB_ATTR_CHECKSUM_ALG{ "checksumalg" };
struct ClientLibFilter {
bool matchAvailableOnly = false;
bool matchPlatform = false;
bool matchCompatibleAPI = false;
bool matchNewerPackageVersion = false;
ClientLibPlatform platformVal = ClientLibPlatform::UNKNOWN;
int apiVersion = 0;
int numericPkgVersion = 0;
ClientLibFilter& filterAvailable() {
matchAvailableOnly = true;
return *this;
}
ClientLibFilter& filterPlatform(ClientLibPlatform platformVal) {
matchPlatform = true;
this->platformVal = platformVal;
return *this;
}
ClientLibFilter& filterCompatibleAPI(int apiVersion) {
matchCompatibleAPI = true;
this->apiVersion = apiVersion;
return *this;
}
// expects a version string like "6.3.10"
ClientLibFilter& filterNewerPackageVersion(const std::string& versionStr);
};
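
For illustration (all values hypothetical), the setters return *this, so a filter for Linux x86-64 packages newer than 6.3.10 chains in one expression:

// Sketch: fluent filter construction
ClientLibFilter filter;
filter.filterPlatform(ClientLibPlatform::X86_64_LINUX)
    .filterNewerPackageVersion("6.3.10")
    .filterAvailable();
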
const std::string& getStatusName(ClientLibStatus status);
ClientLibStatus getStatusByName(std::string_view statusName);
const std::string& getPlatformName(ClientLibPlatform platform);
ClientLibPlatform getPlatformByName(std::string_view platformName);
const std::string& getChecksumAlgName(ClientLibChecksumAlg checksumAlg);
ClientLibChecksumAlg getChecksumAlgByName(std::string_view checksumAlgName);
// encodes MD5 result to a hexadecimal string to be provided in the checksum attribute
Standalone<StringRef> md5SumToHexString(MD5_CTX& sum);
// Upload a client library binary from a file and associated metadata JSON
// to the system keyspace of the database
ACTOR Future<Void> uploadClientLibrary(Database db,
Standalone<StringRef> metadataString,
Standalone<StringRef> libFilePath);
// Determine clientLibId from the relevant attributes of the metadata JSON
Standalone<StringRef> getClientLibIdFromMetadataJson(StringRef metadataString);
// Download a client library binary from the system keyspace of the database
// and save it at the given file path
ACTOR Future<Void> downloadClientLibrary(Database db,
Standalone<StringRef> clientLibId,
Standalone<StringRef> libFilePath);
// Delete the client library binary from the system keyspace of the database
ACTOR Future<Void> deleteClientLibrary(Database db, Standalone<StringRef> clientLibId);
// List client libraries available on the cluster, with the specified filter
// Returns metadata JSON of each library
ACTOR Future<Standalone<VectorRef<StringRef>>> listClientLibraries(Database db, ClientLibFilter filter);
// Get the current status of an uploaded client library
ACTOR Future<ClientLibStatus> getClientLibraryStatus(Database db, Standalone<StringRef> clientLibId);
// Change client library metadata status
ACTOR Future<Void> changeClientLibraryStatus(Database db, Standalone<StringRef> clientLibId, ClientLibStatus newStatus);
} // namespace ClientLibManagement
#include "flow/unactorcompiler.h"
#endif

View File

@ -37,7 +37,8 @@ enum class EventType {
UNSET
};
enum class TransactionPriorityType { PRIORITY_DEFAULT = 0, PRIORITY_BATCH = 1, PRIORITY_IMMEDIATE = 2, UNSET };
enum class TransactionPriorityType : int { PRIORITY_DEFAULT = 0, PRIORITY_BATCH = 1, PRIORITY_IMMEDIATE = 2, UNSET };
static_assert(sizeof(TransactionPriorityType) == 4, "transaction_profiling_analyzer.py assumes this field has size 4");
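
For a scoped enum the underlying type already defaults to int, so the explicit ": int" here mainly documents the 4-byte wire-format assumption that the static_assert then enforces. A stand-alone illustration with a hypothetical enum:

#include <type_traits>

enum class Pinned : int { A, B };
static_assert(sizeof(Pinned) == 4, "4-byte layout expected by external tooling");
static_assert(std::is_same_v<std::underlying_type_t<Pinned>, int>, "underlying type is int");
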
struct Event {
Event(EventType t, double ts, const Optional<Standalone<StringRef>>& dc) : type(t), startTs(ts) {

View File

@ -41,7 +41,7 @@ ClusterConnectionFile::ClusterConnectionFile(std::string const& filename, Cluste
}
// Sets the connection string held by this object and persists it.
Future<Void> ClusterConnectionFile::setConnectionString(ClusterConnectionString const& conn) {
Future<Void> ClusterConnectionFile::setAndPersistConnectionString(ClusterConnectionString const& conn) {
ASSERT(filename.size());
cs = conn;
return success(persist());

View File

@ -35,7 +35,7 @@ public:
explicit ClusterConnectionFile(std::string const& filename, ClusterConnectionString const& contents);
// Sets the connection string held by this object and persists it.
Future<Void> setConnectionString(ClusterConnectionString const&) override;
Future<Void> setAndPersistConnectionString(ClusterConnectionString const&) override;
// Get the connection string stored in the file.
Future<ClusterConnectionString> getStoredConnectionString() override;

View File

@ -57,7 +57,7 @@ ACTOR Future<Reference<ClusterConnectionKey>> ClusterConnectionKey::loadClusterC
}
// Sets the connection string held by this object and persists it.
Future<Void> ClusterConnectionKey::setConnectionString(ClusterConnectionString const& connectionString) {
Future<Void> ClusterConnectionKey::setAndPersistConnectionString(ClusterConnectionString const& connectionString) {
cs = connectionString;
return success(persist());
}

View File

@ -48,7 +48,7 @@ public:
ACTOR static Future<Reference<ClusterConnectionKey>> loadClusterConnectionKey(Database db, Key connectionStringKey);
// Sets the connection string held by this object and persists it.
Future<Void> setConnectionString(ClusterConnectionString const&) override;
Future<Void> setAndPersistConnectionString(ClusterConnectionString const&) override;
// Get the connection string stored in the database.
Future<ClusterConnectionString> getStoredConnectionString() override;

View File

@ -23,7 +23,7 @@
#include "flow/actorcompiler.h" // has to be last include
// Sets the connection string held by this object.
Future<Void> ClusterConnectionMemoryRecord::setConnectionString(ClusterConnectionString const& conn) {
Future<Void> ClusterConnectionMemoryRecord::setAndPersistConnectionString(ClusterConnectionString const& conn) {
cs = conn;
return Void();
}

View File

@ -36,7 +36,7 @@ public:
}
// Sets the connection string held by this object.
Future<Void> setConnectionString(ClusterConnectionString const&) override;
Future<Void> setAndPersistConnectionString(ClusterConnectionString const&) override;
// Returns the connection string currently held in this object (there is no persistent storage).
Future<ClusterConnectionString> getStoredConnectionString() override;

View File

@ -115,9 +115,6 @@ struct ClientDBInfo {
firstCommitProxy; // not serialized, used for commitOnFirstProxy when the commit proxies vector has been shrunk
Optional<Value> forward;
std::vector<VersionHistory> history;
// a counter incremented every time the set of uploaded client libraries
// changes, which clients need to be aware of
uint64_t clientLibChangeCounter = 0;
ClientDBInfo() {}
@ -129,7 +126,7 @@ struct ClientDBInfo {
if constexpr (!is_fb_function<Archive>) {
ASSERT(ar.protocolVersion().isValid());
}
serializer(ar, grvProxies, commitProxies, id, forward, history, clientLibChangeCounter);
serializer(ar, grvProxies, commitProxies, id, forward, history);
}
};

View File

@ -39,6 +39,12 @@ struct ConfigGeneration {
bool operator<(ConfigGeneration const&) const;
bool operator>(ConfigGeneration const&) const;
std::string toString() const {
std::stringstream ss;
ss << "liveVersion: " << liveVersion << ", committedVersion: " << committedVersion;
return ss.str();
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, committedVersion, liveVersion);

View File

@ -59,22 +59,37 @@ struct ClientLeaderRegInterface {
class ClusterConnectionString {
public:
ClusterConnectionString() {}
ClusterConnectionString(std::string const& connectionString);
ClusterConnectionString(std::vector<NetworkAddress>, Key);
ClusterConnectionString(const std::string& connStr);
ClusterConnectionString(const std::vector<NetworkAddress>& coordinators, Key key);
ClusterConnectionString(const std::vector<Hostname>& hosts, Key key);
std::vector<NetworkAddress> const& coordinators() const { return coord; }
std::vector<NetworkAddress> const& coordinators() const { return coords; }
void addResolved(Hostname hostname, NetworkAddress address) {
coords.push_back(address);
networkAddressToHostname.emplace(address, hostname);
}
Key clusterKey() const { return key; }
Key clusterKeyName() const {
return keyDesc;
} // Returns the "name" or "description" part of the clusterKey (the part before the ':')
std::string toString() const;
static std::string getErrorString(std::string const& source, Error const& e);
Future<Void> resolveHostnames();
// This one should only be used when resolving asynchronously is impossible. For all other cases, resolveHostnames()
// should be preferred.
void resolveHostnamesBlocking();
void resetToUnresolved();
bool hasUnresolvedHostnames = false;
std::vector<NetworkAddress> coords;
std::vector<Hostname> hostnames;
private:
void parseKey(std::string const& key);
std::vector<NetworkAddress> coord;
void parseConnString();
void parseKey(const std::string& key);
std::unordered_map<NetworkAddress, Hostname> networkAddressToHostname;
Key key, keyDesc;
std::string connectionString;
};
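
A hedged sketch of the new hostname flow (Hostname::parse, the constructor shape, and all values are assumptions): a connection string built from hostnames starts unresolved, and the coordinator addresses become visible only after resolveHostnames() completes.

// Sketch only; the description:ID pair and host are made up.
ACTOR Future<Void> resolveExample() {
    state std::vector<Hostname> hosts = { Hostname::parse("host1.example.com:4500") };
    state ClusterConnectionString ccs(hosts, "testdesc:testid"_sr);
    ASSERT(ccs.hasUnresolvedHostnames);
    wait(ccs.resolveHostnames());
    ASSERT(!ccs.hasUnresolvedHostnames);
    return Void();
}
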
FDB_DECLARE_BOOLEAN_PARAM(ConnectionStringNeedsPersisted);
@ -93,10 +108,10 @@ public:
// Returns the connection string currently held in this object. This may not match the stored record if it hasn't
// been persisted or if the persistent storage for the record has been modified externally.
ClusterConnectionString const& getConnectionString() const;
ClusterConnectionString& getConnectionString();
// Sets the connection string held by this object and persists it.
virtual Future<Void> setConnectionString(ClusterConnectionString const&) = 0;
virtual Future<Void> setAndPersistConnectionString(ClusterConnectionString const&) = 0;
// If this record is backed by persistent storage, get the connection string from that storage. Otherwise, return
// the connection string stored in memory.
@ -124,6 +139,12 @@ public:
// Signals to the connection record that it was successfully used to connect to a cluster.
void notifyConnected();
bool hasUnresolvedHostnames() const;
Future<Void> resolveHostnames();
// This one should only be used when resolving asynchronously is impossible. For all other cases, resolveHostnames()
// should be preferred.
void resolveHostnamesBlocking();
virtual void addref() = 0;
virtual void delref() = 0;
@ -151,7 +172,10 @@ struct LeaderInfo {
UID changeID;
static const uint64_t changeIDMask = ~(uint64_t(0b1111111) << 57);
Value serializedInfo;
bool forward; // If true, serializedInfo is a connection string instead!
// If true, serializedInfo is a connection string instead,
// which also means the recipient needs to update their local cluster file
// with the latest list of coordinators
bool forward;
LeaderInfo() : forward(false) {}
LeaderInfo(UID changeID) : changeID(changeID), forward(false) {}

View File

@ -2366,7 +2366,7 @@ std::string getDRMutationStreamId(StatusObjectReader statusObj, const char* cont
bool getLockedStatus(StatusObjectReader statusObj) {
try {
StatusObjectReader statusObjCluster = statusObj["cluster"].get_obj();
return statusObjCluster["database_locked"].get_bool();
return statusObjCluster["database_lock_state.locked"].get_bool();
} catch (std::runtime_error& e) {
TraceEvent(SevWarn, "DBA_GetLockedStatusFail").detail("Error", e.what());
throw backup_error();

View File

@ -249,7 +249,6 @@ public:
Future<Reference<CommitProxyInfo>> getCommitProxiesFuture(UseProvisionalProxies useProvisionalProxies);
Reference<GrvProxyInfo> getGrvProxies(UseProvisionalProxies useProvisionalProxies);
Future<Void> onProxiesChanged() const;
Future<Void> onClientLibStatusChanged() const;
Future<HealthMetrics> getHealthMetrics(bool detailed);
// Pass a negative value for `shardLimit` to indicate no limit on the shard number.
Future<StorageMetrics> getStorageMetrics(KeyRange const& keys, int shardLimit);
@ -347,7 +346,6 @@ public:
// Key DB-specific information
Reference<AsyncVar<Reference<IClusterConnectionRecord>>> connectionRecord;
AsyncTrigger proxiesChangeTrigger;
AsyncTrigger clientLibChangeTrigger;
Future<Void> clientDBInfoMonitor;
Future<Void> monitorTssInfoChange;
Future<Void> tssMismatchHandler;

View File

@ -665,6 +665,10 @@ struct RangeResultRef : VectorRef<KeyValueRef> {
serializer(ar, ((VectorRef<KeyValueRef>&)*this), more, readThrough, readToBegin, readThroughEnd);
}
int logicalSize() const {
return VectorRef<KeyValueRef>::expectedSize() - VectorRef<KeyValueRef>::size() * sizeof(KeyValueRef);
}
std::string toString() const {
return "more:" + std::to_string(more) +
" readThrough:" + (readThrough.present() ? readThrough.get().toString() : "[unset]") +
@ -736,16 +740,18 @@ struct TLogVersion {
// V4 changed how data gets written to satellite TLogs so that we can peek from them;
// V5 merged reference and value spilling
// V6 added span context to list of serialized mutations sent from proxy to tlogs
// V7 use xxhash3 for TLog checksum
// V1 = 1, // 4.6 is dispatched to via 6.0
V2 = 2, // 6.0
V3 = 3, // 6.1
V4 = 4, // 6.2
V5 = 5, // 6.3
V6 = 6, // 7.0
V7 = 7, // 7.2
MIN_SUPPORTED = V2,
MAX_SUPPORTED = V6,
MIN_RECRUITABLE = V5,
DEFAULT = V5,
MAX_SUPPORTED = V7,
MIN_RECRUITABLE = V6,
DEFAULT = V6,
} version;
TLogVersion() : version(UNSET) {}
@ -771,6 +777,8 @@ struct TLogVersion {
return V5;
if (s == LiteralStringRef("6"))
return V6;
if (s == LiteralStringRef("7"))
return V7;
return default_error_or();
}
};
@ -1192,4 +1200,34 @@ struct ReadBlobGranuleContext {
bool debugNoMaterialize;
};
// Store metadata associated with each storage server. Currently it only contains data used by the perpetual storage wiggle.
struct StorageMetadataType {
constexpr static FileIdentifier file_identifier = 732123;
// when the SS is initialized
uint64_t createdTime; // comes from Platform::timer_int()
StorageMetadataType() : createdTime(0) {}
StorageMetadataType(uint64_t t) : createdTime(t) {}
// To change this serialization, ProtocolVersion::StorageMetadata must be updated, and downgrades need
// to be considered
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, createdTime);
}
};
// Store metadata of a wiggle action
struct StorageWiggleValue {
constexpr static FileIdentifier file_identifier = 732124;
UID id; // storage id
StorageWiggleValue(UID id = UID(0, 0)) : id(id) {}
// To change this serialization, ProtocolVersion::PerpetualWiggleMetadata must be updated, and downgrades need
// to be considered
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, id);
}
};
#endif

View File

@ -0,0 +1,632 @@
/*
* GenericManagementAPI.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_GENERIC_MANAGEMENT_API_ACTOR_G_H)
#define FDBCLIENT_GENERIC_MANAGEMENT_API_ACTOR_G_H
#include "fdbclient/GenericManagementAPI.actor.g.h"
#elif !defined(FDBCLIENT_GENERIC_MANAGEMENT_API_ACTOR_H)
#define FDBCLIENT_GENERIC_MANAGEMENT_API_ACTOR_H
/* This file defines "management" interfaces that have been templated to support both IClientAPI
and Native version of databases, transactions, etc., and includes functions for performing cluster
management tasks. It isn't exposed to C clients or anywhere outside our code base and doesn't need
to be versioned. It doesn't do anything you can't do with the standard API and some knowledge of
the contents of the system key space.
*/
#include <string>
#include <map>
#include "fdbclient/ClientBooleanParams.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbclient/Status.h"
#include "fdbclient/SystemData.h"
#include "flow/actorcompiler.h" // has to be last include
// ConfigurationResult enumerates normal outcomes of changeConfig() and various error
// conditions specific to it. changeConfig may also throw an Error to report other problems.
enum class ConfigurationResult {
NO_OPTIONS_PROVIDED,
CONFLICTING_OPTIONS,
UNKNOWN_OPTION,
INCOMPLETE_CONFIGURATION,
INVALID_CONFIGURATION,
STORAGE_MIGRATION_DISABLED,
DATABASE_ALREADY_CREATED,
DATABASE_CREATED,
DATABASE_UNAVAILABLE,
STORAGE_IN_UNKNOWN_DCID,
REGION_NOT_FULLY_REPLICATED,
MULTIPLE_ACTIVE_REGIONS,
REGIONS_CHANGED,
NOT_ENOUGH_WORKERS,
REGION_REPLICATION_MISMATCH,
DCID_MISSING,
LOCKED_NOT_NEW,
SUCCESS_WARN_PPW_GRADUAL,
SUCCESS,
};
enum class CoordinatorsResult {
INVALID_NETWORK_ADDRESSES,
SAME_NETWORK_ADDRESSES,
NOT_COORDINATORS, // FIXME: not detected
DATABASE_UNREACHABLE, // FIXME: not detected
BAD_DATABASE_STATE,
COORDINATOR_UNREACHABLE,
NOT_ENOUGH_MACHINES,
SUCCESS
};
struct ConfigureAutoResult {
std::map<NetworkAddress, ProcessClass> address_class;
int32_t processes;
int32_t machines;
std::string old_replication;
int32_t old_commit_proxies;
int32_t old_grv_proxies;
int32_t old_resolvers;
int32_t old_logs;
int32_t old_processes_with_transaction;
int32_t old_machines_with_transaction;
std::string auto_replication;
int32_t auto_commit_proxies;
int32_t auto_grv_proxies;
int32_t auto_resolvers;
int32_t auto_logs;
int32_t auto_processes_with_transaction;
int32_t auto_machines_with_transaction;
int32_t desired_commit_proxies;
int32_t desired_grv_proxies;
int32_t desired_resolvers;
int32_t desired_logs;
ConfigureAutoResult()
: processes(-1), machines(-1), old_commit_proxies(-1), old_grv_proxies(-1), old_resolvers(-1), old_logs(-1),
old_processes_with_transaction(-1), old_machines_with_transaction(-1), auto_commit_proxies(-1),
auto_grv_proxies(-1), auto_resolvers(-1), auto_logs(-1), auto_processes_with_transaction(-1),
auto_machines_with_transaction(-1), desired_commit_proxies(-1), desired_grv_proxies(-1), desired_resolvers(-1),
desired_logs(-1) {}
bool isValid() const { return processes != -1; }
};
ConfigurationResult buildConfiguration(
std::vector<StringRef> const& modeTokens,
std::map<std::string, std::string>& outConf); // Accepts a vector of configuration tokens
ConfigurationResult buildConfiguration(
std::string const& modeString,
std::map<std::string, std::string>& outConf); // Accepts tokens separated by spaces in a single string
bool isCompleteConfiguration(std::map<std::string, std::string> const& options);
ConfigureAutoResult parseConfig(StatusObject const& status);
// Management API written in template code to support both IClientAPI and NativeAPI
namespace ManagementAPI {
ACTOR template <class DB>
Future<Void> changeCachedRange(Reference<DB> db, KeyRangeRef range, bool add) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
state KeyRange sysRange = KeyRangeRef(storageCacheKey(range.begin), storageCacheKey(range.end));
state KeyRange sysRangeClear = KeyRangeRef(storageCacheKey(range.begin), keyAfter(storageCacheKey(range.end)));
state KeyRange privateRange = KeyRangeRef(cacheKeysKey(0, range.begin), cacheKeysKey(0, range.end));
state Value trueValue = storageCacheValue(std::vector<uint16_t>{ 0 });
state Value falseValue = storageCacheValue(std::vector<uint16_t>{});
loop {
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
tr->clear(sysRangeClear);
tr->clear(privateRange);
tr->addReadConflictRange(privateRange);
// hold the returned standalone object's memory
state typename DB::TransactionT::template FutureT<RangeResult> previousFuture =
tr->getRange(KeyRangeRef(storageCachePrefix, sysRange.begin), 1, Snapshot::False, Reverse::True);
RangeResult previous = wait(safeThreadFutureToFuture(previousFuture));
bool prevIsCached = false;
if (!previous.empty()) {
std::vector<uint16_t> prevVal;
decodeStorageCacheValue(previous[0].value, prevVal);
prevIsCached = !prevVal.empty();
}
if (prevIsCached && !add) {
// we need to uncache from here
tr->set(sysRange.begin, falseValue);
tr->set(privateRange.begin, serverKeysFalse);
} else if (!prevIsCached && add) {
// we need to cache, starting from here
tr->set(sysRange.begin, trueValue);
tr->set(privateRange.begin, serverKeysTrue);
}
// hold the returned standalone object's memory
state typename DB::TransactionT::template FutureT<RangeResult> afterFuture =
tr->getRange(KeyRangeRef(sysRange.end, storageCacheKeys.end), 1, Snapshot::False, Reverse::False);
RangeResult after = wait(safeThreadFutureToFuture(afterFuture));
bool afterIsCached = false;
if (!after.empty()) {
std::vector<uint16_t> afterVal;
decodeStorageCacheValue(after[0].value, afterVal);
afterIsCached = afterVal.empty();
}
if (afterIsCached && !add) {
tr->set(sysRange.end, trueValue);
tr->set(privateRange.end, serverKeysTrue);
} else if (!afterIsCached && add) {
tr->set(sysRange.end, falseValue);
tr->set(privateRange.end, serverKeysFalse);
}
wait(safeThreadFutureToFuture(tr->commit()));
return Void();
} catch (Error& e) {
state Error err = e;
wait(safeThreadFutureToFuture(tr->onError(e)));
TraceEvent(SevDebug, "ChangeCachedRangeError").error(err);
}
}
}
template <class DB>
Future<Void> addCachedRange(Reference<DB> db, KeyRangeRef range) {
return changeCachedRange(db, range, true);
}
template <class DB>
Future<Void> removeCachedRange(Reference<DB> db, KeyRangeRef range) {
return changeCachedRange(db, range, false);
}
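
A brief sketch (range keys assumed) of driving these helpers from an actor:

// Sketch only: pin an assumed hot range into the storage cache.
ACTOR template <class DB>
Future<Void> cacheHotRange(Reference<DB> db) {
    wait(addCachedRange(db, KeyRangeRef("hot/begin"_sr, "hot/end"_sr)));
    return Void();
}
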
ACTOR template <class Tr>
Future<std::vector<ProcessData>> getWorkers(Reference<Tr> tr,
typename Tr::template FutureT<RangeResult> processClassesF,
typename Tr::template FutureT<RangeResult> processDataF) {
// processClassesF and processDataF are used to hold standalone memory
processClassesF = tr->getRange(processClassKeys, CLIENT_KNOBS->TOO_MANY);
processDataF = tr->getRange(workerListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> processClasses = safeThreadFutureToFuture(processClassesF);
state Future<RangeResult> processData = safeThreadFutureToFuture(processDataF);
wait(success(processClasses) && success(processData));
ASSERT(!processClasses.get().more && processClasses.get().size() < CLIENT_KNOBS->TOO_MANY);
ASSERT(!processData.get().more && processData.get().size() < CLIENT_KNOBS->TOO_MANY);
std::map<Optional<Standalone<StringRef>>, ProcessClass> id_class;
for (int i = 0; i < processClasses.get().size(); i++) {
id_class[decodeProcessClassKey(processClasses.get()[i].key)] =
decodeProcessClassValue(processClasses.get()[i].value);
}
std::vector<ProcessData> results;
for (int i = 0; i < processData.get().size(); i++) {
ProcessData data = decodeWorkerListValue(processData.get()[i].value);
ProcessClass processClass = id_class[data.locality.processId()];
if (processClass.classSource() == ProcessClass::DBSource ||
data.processClass.classType() == ProcessClass::UnsetClass)
data.processClass = processClass;
if (data.processClass.classType() != ProcessClass::TesterClass)
results.push_back(data);
}
return results;
}
// All versions of changeConfig apply the given set of configuration tokens to the database, and return a
// ConfigurationResult (or error).
// Accepts a full configuration in key/value format (from buildConfiguration)
ACTOR template <class DB>
Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string, std::string> m, bool force) {
state StringRef initIdKey = LiteralStringRef("\xff/init_id");
state Reference<typename DB::TransactionT> tr = db->createTransaction();
if (!m.size()) {
return ConfigurationResult::NO_OPTIONS_PROVIDED;
}
// make sure we have essential configuration options
std::string initKey = configKeysPrefix.toString() + "initialized";
state bool creating = m.count(initKey) != 0;
state Optional<UID> locked;
{
auto iter = m.find(databaseLockedKey.toString());
if (iter != m.end()) {
if (!creating) {
return ConfigurationResult::LOCKED_NOT_NEW;
}
locked = UID::fromString(iter->second);
m.erase(iter);
}
}
if (creating) {
m[initIdKey.toString()] = deterministicRandom()->randomUniqueID().toString();
if (!isCompleteConfiguration(m)) {
return ConfigurationResult::INCOMPLETE_CONFIGURATION;
}
}
state Future<Void> tooLong = delay(60);
state Key versionKey = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned());
state bool oldReplicationUsesDcId = false;
state bool warnPPWGradual = false;
state bool warnChangeStorageNoMigrate = false;
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
if (!creating && !force) {
state typename DB::TransactionT::template FutureT<RangeResult> fConfigF =
tr->getRange(configKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fConfig = safeThreadFutureToFuture(fConfigF);
state typename DB::TransactionT::template FutureT<RangeResult> processClassesF;
state typename DB::TransactionT::template FutureT<RangeResult> processDataF;
state Future<std::vector<ProcessData>> fWorkers = getWorkers(tr, processClassesF, processDataF);
wait(success(fConfig) || tooLong);
if (!fConfig.isReady()) {
return ConfigurationResult::DATABASE_UNAVAILABLE;
}
if (fConfig.isReady()) {
ASSERT(fConfig.get().size() < CLIENT_KNOBS->TOO_MANY);
state DatabaseConfiguration oldConfig;
oldConfig.fromKeyValues((VectorRef<KeyValueRef>)fConfig.get());
state DatabaseConfiguration newConfig = oldConfig;
for (auto kv : m) {
newConfig.set(kv.first, kv.second);
}
if (!newConfig.isValid()) {
return ConfigurationResult::INVALID_CONFIGURATION;
}
if (newConfig.tLogPolicy->attributeKeys().count("dcid") && newConfig.regions.size() > 0) {
return ConfigurationResult::REGION_REPLICATION_MISMATCH;
}
oldReplicationUsesDcId =
oldReplicationUsesDcId || oldConfig.tLogPolicy->attributeKeys().count("dcid");
if (oldConfig.usableRegions != newConfig.usableRegions) {
// cannot change region configuration
std::map<Key, int32_t> dcId_priority;
for (auto& it : newConfig.regions) {
dcId_priority[it.dcId] = it.priority;
}
for (auto& it : oldConfig.regions) {
if (!dcId_priority.count(it.dcId) || dcId_priority[it.dcId] != it.priority) {
return ConfigurationResult::REGIONS_CHANGED;
}
}
// must only have one region with priority >= 0
int activeRegionCount = 0;
for (auto& it : newConfig.regions) {
if (it.priority >= 0) {
activeRegionCount++;
}
}
if (activeRegionCount > 1) {
return ConfigurationResult::MULTIPLE_ACTIVE_REGIONS;
}
}
state typename DB::TransactionT::template FutureT<RangeResult> fServerListF =
tr->getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fServerList =
(newConfig.regions.size()) ? safeThreadFutureToFuture(fServerListF) : Future<RangeResult>();
if (newConfig.usableRegions == 2) {
if (oldReplicationUsesDcId) {
state typename DB::TransactionT::template FutureT<RangeResult> fLocalityListF =
tr->getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fLocalityList = safeThreadFutureToFuture(fLocalityListF);
wait(success(fLocalityList) || tooLong);
if (!fLocalityList.isReady()) {
return ConfigurationResult::DATABASE_UNAVAILABLE;
}
RangeResult localityList = fLocalityList.get();
ASSERT(!localityList.more && localityList.size() < CLIENT_KNOBS->TOO_MANY);
std::set<Key> localityDcIds;
for (auto& s : localityList) {
auto dc = decodeTagLocalityListKey(s.key);
if (dc.present()) {
localityDcIds.insert(dc.get());
}
}
for (auto& it : newConfig.regions) {
if (localityDcIds.count(it.dcId) == 0) {
return ConfigurationResult::DCID_MISSING;
}
}
} else {
// all regions with priority >= 0 must be fully replicated
state std::vector<typename DB::TransactionT::template FutureT<Optional<Value>>>
replicasFuturesF;
state std::vector<Future<Optional<Value>>> replicasFutures;
for (auto& it : newConfig.regions) {
if (it.priority >= 0) {
replicasFuturesF.push_back(tr->get(datacenterReplicasKeyFor(it.dcId)));
replicasFutures.push_back(safeThreadFutureToFuture(replicasFuturesF.back()));
}
}
wait(waitForAll(replicasFutures) || tooLong);
for (auto& it : replicasFutures) {
if (!it.isReady()) {
return ConfigurationResult::DATABASE_UNAVAILABLE;
}
if (!it.get().present()) {
return ConfigurationResult::REGION_NOT_FULLY_REPLICATED;
}
}
}
}
if (newConfig.regions.size()) {
// all storage servers must be in one of the regions
wait(success(fServerList) || tooLong);
if (!fServerList.isReady()) {
return ConfigurationResult::DATABASE_UNAVAILABLE;
}
RangeResult serverList = fServerList.get();
ASSERT(!serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY);
std::set<Key> newDcIds;
for (auto& it : newConfig.regions) {
newDcIds.insert(it.dcId);
}
std::set<Optional<Key>> missingDcIds;
for (auto& s : serverList) {
auto ssi = decodeServerListValue(s.value);
if (!ssi.locality.dcId().present() || !newDcIds.count(ssi.locality.dcId().get())) {
missingDcIds.insert(ssi.locality.dcId());
}
}
if (missingDcIds.size() > (oldReplicationUsesDcId ? 1 : 0)) {
return ConfigurationResult::STORAGE_IN_UNKNOWN_DCID;
}
}
wait(success(fWorkers) || tooLong);
if (!fWorkers.isReady()) {
return ConfigurationResult::DATABASE_UNAVAILABLE;
}
if (newConfig.regions.size()) {
std::map<Optional<Key>, std::set<Optional<Key>>> dcId_zoneIds;
for (auto& it : fWorkers.get()) {
if (it.processClass.machineClassFitness(ProcessClass::Storage) <= ProcessClass::WorstFit) {
dcId_zoneIds[it.locality.dcId()].insert(it.locality.zoneId());
}
}
for (auto& region : newConfig.regions) {
if (dcId_zoneIds[region.dcId].size() <
std::max(newConfig.storageTeamSize, newConfig.tLogReplicationFactor)) {
return ConfigurationResult::NOT_ENOUGH_WORKERS;
}
if (region.satelliteTLogReplicationFactor > 0 && region.priority >= 0) {
int totalSatelliteProcesses = 0;
for (auto& sat : region.satellites) {
totalSatelliteProcesses += dcId_zoneIds[sat.dcId].size();
}
if (totalSatelliteProcesses < region.satelliteTLogReplicationFactor) {
return ConfigurationResult::NOT_ENOUGH_WORKERS;
}
}
}
} else {
std::set<Optional<Key>> zoneIds;
for (auto& it : fWorkers.get()) {
if (it.processClass.machineClassFitness(ProcessClass::Storage) <= ProcessClass::WorstFit) {
zoneIds.insert(it.locality.zoneId());
}
}
if (zoneIds.size() < std::max(newConfig.storageTeamSize, newConfig.tLogReplicationFactor)) {
return ConfigurationResult::NOT_ENOUGH_WORKERS;
}
}
if (newConfig.storageServerStoreType != oldConfig.storageServerStoreType &&
newConfig.storageMigrationType == StorageMigrationType::DISABLED) {
return ConfigurationResult::STORAGE_MIGRATION_DISABLED;
} else if (newConfig.storageMigrationType == StorageMigrationType::GRADUAL &&
newConfig.perpetualStorageWiggleSpeed == 0) {
warnPPWGradual = true;
}
}
}
if (creating) {
tr->setOption(FDBTransactionOptions::INITIALIZE_NEW_DATABASE);
tr->addReadConflictRange(singleKeyRange(initIdKey));
} else if (m.size()) {
// might be used in an emergency transaction, so make sure it is retry-self-conflicting and
// CAUSAL_WRITE_RISKY
tr->setOption(FDBTransactionOptions::CAUSAL_WRITE_RISKY);
tr->addReadConflictRange(singleKeyRange(m.begin()->first));
}
if (locked.present()) {
ASSERT(creating);
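// The 10-byte "0123456789" prefix below is a placeholder that the commit
// versionstamp overwrites; the trailing 4 bytes encode the little-endian
// offset (0) of that placeholder within the value, as SetVersionstampedValue
// expects.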
tr->atomicOp(databaseLockedKey,
BinaryWriter::toValue(locked.get(), Unversioned())
.withPrefix(LiteralStringRef("0123456789"))
.withSuffix(LiteralStringRef("\x00\x00\x00\x00")),
MutationRef::SetVersionstampedValue);
}
for (auto i = m.begin(); i != m.end(); ++i) {
tr->set(StringRef(i->first), StringRef(i->second));
}
tr->addReadConflictRange(singleKeyRange(moveKeysLockOwnerKey));
tr->set(moveKeysLockOwnerKey, versionKey);
wait(safeThreadFutureToFuture(tr->commit()));
break;
} catch (Error& e) {
state Error e1(e);
if ((e.code() == error_code_not_committed || e.code() == error_code_transaction_too_old) && creating) {
// The database now exists. Determine whether we created it or it was already existing/created by
// someone else. The latter is an error.
tr->reset();
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
state typename DB::TransactionT::template FutureT<Optional<Value>> vF = tr->get(initIdKey);
Optional<Value> v = wait(safeThreadFutureToFuture(vF));
if (v != m[initIdKey.toString()])
return ConfigurationResult::DATABASE_ALREADY_CREATED;
else
return ConfigurationResult::DATABASE_CREATED;
} catch (Error& e2) {
wait(safeThreadFutureToFuture(tr->onError(e2)));
}
}
}
wait(safeThreadFutureToFuture(tr->onError(e1)));
}
}
if (warnPPWGradual) {
return ConfigurationResult::SUCCESS_WARN_PPW_GRADUAL;
} else {
return ConfigurationResult::SUCCESS;
}
}
ACTOR template <class DB>
Future<ConfigurationResult> autoConfig(Reference<DB> db, ConfigureAutoResult conf) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
state Key versionKey = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned());
if (!conf.address_class.size())
return ConfigurationResult::INCOMPLETE_CONFIGURATION; // FIXME: correct return type
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
state typename DB::TransactionT::template FutureT<RangeResult> processClassesF;
state typename DB::TransactionT::template FutureT<RangeResult> processDataF;
std::vector<ProcessData> workers = wait(getWorkers(tr, processClassesF, processDataF));
std::map<NetworkAddress, Optional<Standalone<StringRef>>> address_processId;
for (auto& w : workers) {
address_processId[w.address] = w.locality.processId();
}
for (auto& it : conf.address_class) {
if (it.second.classSource() == ProcessClass::CommandLineSource) {
tr->clear(processClassKeyFor(address_processId[it.first].get()));
} else {
tr->set(processClassKeyFor(address_processId[it.first].get()), processClassValue(it.second));
}
}
if (conf.address_class.size())
tr->set(processClassChangeKey, deterministicRandom()->randomUniqueID().toString());
if (conf.auto_logs != conf.old_logs)
tr->set(configKeysPrefix.toString() + "auto_logs", format("%d", conf.auto_logs));
if (conf.auto_commit_proxies != conf.old_commit_proxies)
tr->set(configKeysPrefix.toString() + "auto_commit_proxies", format("%d", conf.auto_commit_proxies));
if (conf.auto_grv_proxies != conf.old_grv_proxies)
tr->set(configKeysPrefix.toString() + "auto_grv_proxies", format("%d", conf.auto_grv_proxies));
if (conf.auto_resolvers != conf.old_resolvers)
tr->set(configKeysPrefix.toString() + "auto_resolvers", format("%d", conf.auto_resolvers));
if (conf.auto_replication != conf.old_replication) {
std::vector<StringRef> modes;
modes.push_back(conf.auto_replication);
std::map<std::string, std::string> m;
auto r = buildConfiguration(modes, m);
if (r != ConfigurationResult::SUCCESS)
return r;
for (auto& kv : m)
tr->set(kv.first, kv.second);
}
tr->addReadConflictRange(singleKeyRange(moveKeysLockOwnerKey));
tr->set(moveKeysLockOwnerKey, versionKey);
wait(safeThreadFutureToFuture(tr->commit()));
return ConfigurationResult::SUCCESS;
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
// Accepts tokens separated by spaces in a single string
template <class DB>
Future<ConfigurationResult> changeConfig(Reference<DB> db, std::string const& modes, bool force) {
TraceEvent("ChangeConfig").detail("Mode", modes);
std::map<std::string, std::string> m;
auto r = buildConfiguration(modes, m);
if (r != ConfigurationResult::SUCCESS)
return r;
return changeConfig(db, m, force);
}
// Accepts a vector of configuration tokens
template <class DB>
Future<ConfigurationResult> changeConfig(Reference<DB> db,
std::vector<StringRef> const& modes,
Optional<ConfigureAutoResult> const& conf,
bool force) {
if (modes.size() && modes[0] == LiteralStringRef("auto") && conf.present()) {
return autoConfig(db, conf.get());
}
std::map<std::string, std::string> m;
auto r = buildConfiguration(modes, m);
if (r != ConfigurationResult::SUCCESS)
return r;
return changeConfig(db, m, force);
}
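
A usage sketch (mode tokens illustrative, `db` assumed): the string overload accepts space-separated tokens, such as a redundancy mode plus a storage engine.

// Sketch only: request triple redundancy on ssd storage, with safety checks on.
Future<ConfigurationResult> f = changeConfig(db, "triple ssd", /*force=*/false);
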
// return the corresponding error message for the CoordinatorsResult
// used by special keys and fdbcli
std::string generateErrorMessage(const CoordinatorsResult& res);
} // namespace ManagementAPI
#include "flow/unactorcompiler.h"
#endif

View File

@ -183,6 +183,7 @@ ACTOR Future<Void> GlobalConfig::refresh(GlobalConfig* self) {
self->erase(KeyRangeRef(""_sr, "\xff"_sr));
Transaction tr(self->cx);
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
RangeResult result = wait(tr.getRange(globalConfigDataKeys, CLIENT_KNOBS->TOO_MANY));
for (const auto& kv : result) {
KeyRef systemKey = kv.key.removePrefix(globalConfigKeysPrefix);

View File

@ -26,6 +26,24 @@
namespace HTTP {
// AWS V4 signing requires this encoding for its signature calculation
std::string awsV4URIEncode(const std::string& s, bool encodeSlash) {
std::string o;
o.reserve(s.size() * 3);
char buf[4];
for (auto c : s) {
if (std::isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~')
o.append(&c, 1);
else if (c == '/')
o.append(encodeSlash ? "%2F" : "/");
else {
sprintf(buf, "%%%.02X", c);
o.append(buf);
}
}
return o;
}
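
For orientation, expected outputs inferred from the branches above (illustrative, not test vectors from this patch):

// awsV4URIEncode("a b/c", true)  => "a%20b%2Fc"
// awsV4URIEncode("a b/c", false) => "a%20b/c"
// Unreserved characters (alnum, '-', '_', '.', '~') pass through unchanged.
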
std::string urlEncode(const std::string& s) {
std::string o;
o.reserve(s.size() * 3);

View File

@ -31,6 +31,7 @@ struct is_iless {
typedef std::map<std::string, std::string, is_iless> Headers;
std::string urlEncode(const std::string& s);
std::string awsV4URIEncode(const std::string& s, bool encodeSlash);
struct Response : ReferenceCounted<Response> {
Response() {}

View File

@ -0,0 +1,141 @@
/*
* HighContentionPrefixAllocator.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source
// version.
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_HIGHCONTENTIONPREFIXALLOCATOR_ACTOR_G_H)
#define FDBCLIENT_HIGHCONTENTIONPREFIXALLOCATOR_ACTOR_G_H
#include "fdbclient/HighContentionPrefixAllocator.actor.g.h"
#elif !defined(FDBCLIENT_HIGHCONTENTIONPREFIXALLOCATOR_ACTOR_H)
#define FDBCLIENT_HIGHCONTENTIONPREFIXALLOCATOR_ACTOR_H
#include "fdbclient/ClientBooleanParams.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/Subspace.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // This must be the last #include.
class HighContentionPrefixAllocator {
public:
HighContentionPrefixAllocator(Subspace subspace) : counters(subspace.get(0)), recent(subspace.get(1)) {}
template <class TransactionT>
Future<Standalone<StringRef>> allocate(Reference<TransactionT> tr) {
return allocate(this, tr);
}
static int64_t windowSize(int64_t start) {
if (start < 255) {
return 64;
}
if (start < 65535) {
return 1024;
}
return 8192;
}
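// For orientation (values derived from the branches above):
//   windowSize(0)     == 64
//   windowSize(255)   == 1024
//   windowSize(65535) == 8192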
private:
Subspace counters;
Subspace recent;
ACTOR template <class TransactionT>
Future<Standalone<StringRef>> allocate(HighContentionPrefixAllocator* self, Reference<TransactionT> tr) {
state int64_t start = 0;
state int64_t window = 0;
loop {
RangeResult range =
wait(safeThreadFutureToFuture(tr->getRange(self->counters.range(), 1, Snapshot::True, Reverse::True)));
if (range.size() > 0) {
start = self->counters.unpack(range[0].key).getInt(0);
}
state bool windowAdvanced = false;
loop {
// if thread safety is needed, this should be locked {
if (windowAdvanced) {
tr->clear(KeyRangeRef(self->counters.key(), self->counters.get(start).key()));
tr->setOption(FDBTransactionOptions::NEXT_WRITE_NO_WRITE_CONFLICT_RANGE);
tr->clear(KeyRangeRef(self->recent.key(), self->recent.get(start).key()));
}
int64_t inc = 1;
tr->atomicOp(self->counters.get(start).key(), StringRef((uint8_t*)&inc, 8), MutationRef::AddValue);
Future<Optional<Value>> countFuture =
safeThreadFutureToFuture(tr->get(self->counters.get(start).key(), Snapshot::True));
// }
Optional<Value> countValue = wait(countFuture);
int64_t count = 0;
if (countValue.present()) {
if (countValue.get().size() != 8) {
throw invalid_directory_layer_metadata();
}
count = *(int64_t*)countValue.get().begin();
}
window = HighContentionPrefixAllocator::windowSize(start);
if (count * 2 < window) {
break;
}
start += window;
windowAdvanced = true;
}
loop {
state int64_t candidate = deterministicRandom()->randomInt(start, start + window);
// if thread safety is needed, this should be locked {
state Future<RangeResult> latestCounterFuture =
safeThreadFutureToFuture(tr->getRange(self->counters.range(), 1, Snapshot::True, Reverse::True));
state Future<Optional<Value>> candidateValueFuture =
safeThreadFutureToFuture(tr->get(self->recent.get(candidate).key()));
tr->setOption(FDBTransactionOptions::NEXT_WRITE_NO_WRITE_CONFLICT_RANGE);
tr->set(self->recent.get(candidate).key(), ValueRef());
// }
wait(success(latestCounterFuture) && success(candidateValueFuture));
int64_t currentWindowStart = 0;
if (latestCounterFuture.get().size() > 0) {
currentWindowStart = self->counters.unpack(latestCounterFuture.get()[0].key).getInt(0);
}
if (currentWindowStart > start) {
break;
}
if (!candidateValueFuture.get().present()) {
tr->addWriteConflictRange(singleKeyRange(self->recent.get(candidate).key()));
return Tuple().append(candidate).pack();
}
}
}
}
};
#include "flow/unactorcompiler.h"
#endif
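
A hedged usage sketch of the allocator (the Subspace constructor shape and all names are assumptions): allocate one short, unique prefix within a subspace.

// Sketch only: allocate a prefix under an assumed raw-prefix Subspace.
ACTOR template <class TransactionT>
Future<Void> allocateOnePrefix(Reference<TransactionT> tr) {
    state HighContentionPrefixAllocator allocator(Subspace(Tuple(), "alloc"_sr));
    Standalone<StringRef> prefix = wait(allocator.allocate(tr));
    printf("allocated a %d-byte prefix\n", prefix.size());
    return Void();
}
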

View File

@ -466,6 +466,10 @@ public:
return k.expectedSize() + v.expectedSize();
}
Key serializeKey(KeyType const& key) { return space.pack(Codec<KeyType>::pack(key)); }
Value serializeValue(ValueType const& val) { return ObjectWriter::toValue(val, versionOptions); }
void erase(Reference<ReadYourWritesTransaction> tr, KeyType const& key) {
return tr->clear(space.pack(Codec<KeyType>::pack(key)));
}

View File

@ -425,7 +425,8 @@ ACTOR Future<DatabaseConfiguration> getDatabaseConfiguration(Database cx) {
state Transaction tr(cx);
loop {
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
RangeResult res = wait(tr.getRange(configKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(res.size() < CLIENT_KNOBS->TOO_MANY);
DatabaseConfiguration config;
@ -756,7 +757,8 @@ ACTOR Future<std::vector<NetworkAddress>> getCoordinators(Database cx) {
state Transaction tr(cx);
loop {
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Optional<Value> currentKey = wait(tr.get(coordinatorsKey));
if (!currentKey.present())
return std::vector<NetworkAddress>();
@ -772,6 +774,7 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
Reference<IQuorumChange> change,
std::vector<NetworkAddress>* desiredCoordinators) {
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> currentKey = wait(tr->get(coordinatorsKey));
@ -861,6 +864,7 @@ ACTOR Future<CoordinatorsResult> changeQuorum(Database cx, Reference<IQuorumChan
loop {
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> currentKey = wait(tr.get(coordinatorsKey));
@ -1680,7 +1684,8 @@ ACTOR Future<Void> printHealthyZone(Database cx) {
state Transaction tr(cx);
loop {
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Optional<Value> val = wait(tr.get(healthyZoneKey));
if (val.present() && decodeHealthyZoneValue(val.get()).first == ignoreSSFailuresZoneString) {
printf("Data distribution has been disabled for all storage server failures in this cluster and thus "
@ -1706,6 +1711,7 @@ ACTOR Future<bool> clearHealthyZone(Database cx, bool printWarning, bool clearSS
loop {
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> val = wait(tr.get(healthyZoneKey));
if (!clearSSFailureZoneString && val.present() &&
@ -1732,6 +1738,7 @@ ACTOR Future<bool> setHealthyZone(Database cx, StringRef zoneId, double seconds,
loop {
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> val = wait(tr.get(healthyZoneKey));
if (val.present() && decodeHealthyZoneValue(val.get()).first == ignoreSSFailuresZoneString) {
@ -1757,6 +1764,7 @@ ACTOR Future<Void> setDDIgnoreRebalanceSwitch(Database cx, bool ignoreRebalance)
loop {
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
if (ignoreRebalance) {
tr.set(rebalanceDDIgnoreKey, LiteralStringRef("on"));
} else {
@ -1778,6 +1786,8 @@ ACTOR Future<int> setDDMode(Database cx, int mode) {
loop {
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
Optional<Value> old = wait(tr.get(dataDistributionModeKey));
if (oldMode < 0) {
oldMode = 1;

View File

@ -34,95 +34,13 @@ standard API and some knowledge of the contents of the system key space.
#include <string>
#include <map>
#include "fdbclient/GenericManagementAPI.actor.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/Status.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbclient/MonitorLeader.h"
#include "flow/actorcompiler.h" // has to be last include
// ConfigurationResult enumerates normal outcomes of changeConfig() and various error
// conditions specific to it. changeConfig may also throw an Error to report other problems.
enum class ConfigurationResult {
NO_OPTIONS_PROVIDED,
CONFLICTING_OPTIONS,
UNKNOWN_OPTION,
INCOMPLETE_CONFIGURATION,
INVALID_CONFIGURATION,
STORAGE_MIGRATION_DISABLED,
DATABASE_ALREADY_CREATED,
DATABASE_CREATED,
DATABASE_UNAVAILABLE,
STORAGE_IN_UNKNOWN_DCID,
REGION_NOT_FULLY_REPLICATED,
MULTIPLE_ACTIVE_REGIONS,
REGIONS_CHANGED,
NOT_ENOUGH_WORKERS,
REGION_REPLICATION_MISMATCH,
DCID_MISSING,
LOCKED_NOT_NEW,
SUCCESS_WARN_PPW_GRADUAL,
SUCCESS,
};
enum class CoordinatorsResult {
INVALID_NETWORK_ADDRESSES,
SAME_NETWORK_ADDRESSES,
NOT_COORDINATORS, // FIXME: not detected
DATABASE_UNREACHABLE, // FIXME: not detected
BAD_DATABASE_STATE,
COORDINATOR_UNREACHABLE,
NOT_ENOUGH_MACHINES,
SUCCESS
};
struct ConfigureAutoResult {
std::map<NetworkAddress, ProcessClass> address_class;
int32_t processes;
int32_t machines;
std::string old_replication;
int32_t old_commit_proxies;
int32_t old_grv_proxies;
int32_t old_resolvers;
int32_t old_logs;
int32_t old_processes_with_transaction;
int32_t old_machines_with_transaction;
std::string auto_replication;
int32_t auto_commit_proxies;
int32_t auto_grv_proxies;
int32_t auto_resolvers;
int32_t auto_logs;
int32_t auto_processes_with_transaction;
int32_t auto_machines_with_transaction;
int32_t desired_commit_proxies;
int32_t desired_grv_proxies;
int32_t desired_resolvers;
int32_t desired_logs;
ConfigureAutoResult()
: processes(-1), machines(-1), old_commit_proxies(-1), old_grv_proxies(-1), old_resolvers(-1), old_logs(-1),
old_processes_with_transaction(-1), old_machines_with_transaction(-1), auto_commit_proxies(-1),
auto_grv_proxies(-1), auto_resolvers(-1), auto_logs(-1), auto_processes_with_transaction(-1),
auto_machines_with_transaction(-1), desired_commit_proxies(-1), desired_grv_proxies(-1), desired_resolvers(-1),
desired_logs(-1) {}
bool isValid() const { return processes != -1; }
};
ConfigurationResult buildConfiguration(
std::vector<StringRef> const& modeTokens,
std::map<std::string, std::string>& outConf); // Accepts a vector of configuration tokens
ConfigurationResult buildConfiguration(
std::string const& modeString,
std::map<std::string, std::string>& outConf); // Accepts tokens separated by spaces in a single string
bool isCompleteConfiguration(std::map<std::string, std::string> const& options);
ConfigureAutoResult parseConfig(StatusObject const& status);
ACTOR Future<DatabaseConfiguration> getDatabaseConfiguration(Database cx);
ACTOR Future<Void> waitForFullReplication(Database cx);
@ -243,511 +161,5 @@ bool schemaMatch(json_spirit::mValue const& schema,
// storage nodes
ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);
// Management API written in template code to support both IClientAPI and NativeAPI
namespace ManagementAPI {
ACTOR template <class DB>
Future<Void> changeCachedRange(Reference<DB> db, KeyRangeRef range, bool add) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
state KeyRange sysRange = KeyRangeRef(storageCacheKey(range.begin), storageCacheKey(range.end));
state KeyRange sysRangeClear = KeyRangeRef(storageCacheKey(range.begin), keyAfter(storageCacheKey(range.end)));
state KeyRange privateRange = KeyRangeRef(cacheKeysKey(0, range.begin), cacheKeysKey(0, range.end));
state Value trueValue = storageCacheValue(std::vector<uint16_t>{ 0 });
state Value falseValue = storageCacheValue(std::vector<uint16_t>{});
loop {
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
tr->clear(sysRangeClear);
tr->clear(privateRange);
tr->addReadConflictRange(privateRange);
// hold the returned standalone object's memory
state typename DB::TransactionT::template FutureT<RangeResult> previousFuture =
tr->getRange(KeyRangeRef(storageCachePrefix, sysRange.begin), 1, Snapshot::False, Reverse::True);
RangeResult previous = wait(safeThreadFutureToFuture(previousFuture));
bool prevIsCached = false;
if (!previous.empty()) {
std::vector<uint16_t> prevVal;
decodeStorageCacheValue(previous[0].value, prevVal);
prevIsCached = !prevVal.empty();
}
if (prevIsCached && !add) {
// we need to uncache from here
tr->set(sysRange.begin, falseValue);
tr->set(privateRange.begin, serverKeysFalse);
} else if (!prevIsCached && add) {
// we need to cache, starting from here
tr->set(sysRange.begin, trueValue);
tr->set(privateRange.begin, serverKeysTrue);
}
// hold the returned standalone object's memory
state typename DB::TransactionT::template FutureT<RangeResult> afterFuture =
tr->getRange(KeyRangeRef(sysRange.end, storageCacheKeys.end), 1, Snapshot::False, Reverse::False);
RangeResult after = wait(safeThreadFutureToFuture(afterFuture));
bool afterIsCached = false;
if (!after.empty()) {
std::vector<uint16_t> afterVal;
decodeStorageCacheValue(after[0].value, afterVal);
afterIsCached = afterVal.empty();
}
if (afterIsCached && !add) {
tr->set(sysRange.end, trueValue);
tr->set(privateRange.end, serverKeysTrue);
} else if (!afterIsCached && add) {
tr->set(sysRange.end, falseValue);
tr->set(privateRange.end, serverKeysFalse);
}
wait(safeThreadFutureToFuture(tr->commit()));
return Void();
} catch (Error& e) {
state Error err = e;
wait(safeThreadFutureToFuture(tr->onError(e)));
TraceEvent(SevDebug, "ChangeCachedRangeError").error(err);
}
}
}
template <class DB>
Future<Void> addCachedRange(Reference<DB> db, KeyRangeRef range) {
return changeCachedRange(db, range, true);
}
template <class DB>
Future<Void> removeCachedRange(Reference<DB> db, KeyRangeRef range) {
return changeCachedRange(db, range, false);
}
ACTOR template <class Tr>
Future<std::vector<ProcessData>> getWorkers(Reference<Tr> tr,
typename Tr::template FutureT<RangeResult> processClassesF,
typename Tr::template FutureT<RangeResult> processDataF) {
// processClassesF and processDataF are used to hold standalone memory
processClassesF = tr->getRange(processClassKeys, CLIENT_KNOBS->TOO_MANY);
processDataF = tr->getRange(workerListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> processClasses = safeThreadFutureToFuture(processClassesF);
state Future<RangeResult> processData = safeThreadFutureToFuture(processDataF);
wait(success(processClasses) && success(processData));
ASSERT(!processClasses.get().more && processClasses.get().size() < CLIENT_KNOBS->TOO_MANY);
ASSERT(!processData.get().more && processData.get().size() < CLIENT_KNOBS->TOO_MANY);
std::map<Optional<Standalone<StringRef>>, ProcessClass> id_class;
for (int i = 0; i < processClasses.get().size(); i++) {
id_class[decodeProcessClassKey(processClasses.get()[i].key)] =
decodeProcessClassValue(processClasses.get()[i].value);
}
std::vector<ProcessData> results;
for (int i = 0; i < processData.get().size(); i++) {
ProcessData data = decodeWorkerListValue(processData.get()[i].value);
ProcessClass processClass = id_class[data.locality.processId()];
if (processClass.classSource() == ProcessClass::DBSource ||
data.processClass.classType() == ProcessClass::UnsetClass)
data.processClass = processClass;
if (data.processClass.classType() != ProcessClass::TesterClass)
results.push_back(data);
}
return results;
}
// All versions of changeConfig apply the given set of configuration tokens to the database, and return a
// ConfigurationResult (or error).
// Accepts a full configuration in key/value format (from buildConfiguration)
ACTOR template <class DB>
Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string, std::string> m, bool force) {
state StringRef initIdKey = LiteralStringRef("\xff/init_id");
state Reference<typename DB::TransactionT> tr = db->createTransaction();
if (!m.size()) {
return ConfigurationResult::NO_OPTIONS_PROVIDED;
}
// make sure we have essential configuration options
std::string initKey = configKeysPrefix.toString() + "initialized";
state bool creating = m.count(initKey) != 0;
state Optional<UID> locked;
{
auto iter = m.find(databaseLockedKey.toString());
if (iter != m.end()) {
if (!creating) {
return ConfigurationResult::LOCKED_NOT_NEW;
}
locked = UID::fromString(iter->second);
m.erase(iter);
}
}
if (creating) {
m[initIdKey.toString()] = deterministicRandom()->randomUniqueID().toString();
if (!isCompleteConfiguration(m)) {
return ConfigurationResult::INCOMPLETE_CONFIGURATION;
}
}
state Future<Void> tooLong = delay(60);
state Key versionKey = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned());
state bool oldReplicationUsesDcId = false;
state bool warnPPWGradual = false;
state bool warnChangeStorageNoMigrate = false;
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
if (!creating && !force) {
state typename DB::TransactionT::template FutureT<RangeResult> fConfigF =
tr->getRange(configKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fConfig = safeThreadFutureToFuture(fConfigF);
state typename DB::TransactionT::template FutureT<RangeResult> processClassesF;
state typename DB::TransactionT::template FutureT<RangeResult> processDataF;
state Future<std::vector<ProcessData>> fWorkers = getWorkers(tr, processClassesF, processDataF);
wait(success(fConfig) || tooLong);
if (!fConfig.isReady()) {
return ConfigurationResult::DATABASE_UNAVAILABLE;
}
if (fConfig.isReady()) {
ASSERT(fConfig.get().size() < CLIENT_KNOBS->TOO_MANY);
state DatabaseConfiguration oldConfig;
oldConfig.fromKeyValues((VectorRef<KeyValueRef>)fConfig.get());
state DatabaseConfiguration newConfig = oldConfig;
for (auto kv : m) {
newConfig.set(kv.first, kv.second);
}
if (!newConfig.isValid()) {
return ConfigurationResult::INVALID_CONFIGURATION;
}
if (newConfig.tLogPolicy->attributeKeys().count("dcid") && newConfig.regions.size() > 0) {
return ConfigurationResult::REGION_REPLICATION_MISMATCH;
}
oldReplicationUsesDcId =
oldReplicationUsesDcId || oldConfig.tLogPolicy->attributeKeys().count("dcid");
if (oldConfig.usableRegions != newConfig.usableRegions) {
// cannot change region configuration
std::map<Key, int32_t> dcId_priority;
for (auto& it : newConfig.regions) {
dcId_priority[it.dcId] = it.priority;
}
for (auto& it : oldConfig.regions) {
if (!dcId_priority.count(it.dcId) || dcId_priority[it.dcId] != it.priority) {
return ConfigurationResult::REGIONS_CHANGED;
}
}
// must only have one region with priority >= 0
int activeRegionCount = 0;
for (auto& it : newConfig.regions) {
if (it.priority >= 0) {
activeRegionCount++;
}
}
if (activeRegionCount > 1) {
return ConfigurationResult::MULTIPLE_ACTIVE_REGIONS;
}
}
state typename DB::TransactionT::template FutureT<RangeResult> fServerListF =
tr->getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fServerList =
(newConfig.regions.size()) ? safeThreadFutureToFuture(fServerListF) : Future<RangeResult>();
if (newConfig.usableRegions == 2) {
if (oldReplicationUsesDcId) {
state typename DB::TransactionT::template FutureT<RangeResult> fLocalityListF =
tr->getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fLocalityList = safeThreadFutureToFuture(fLocalityListF);
wait(success(fLocalityList) || tooLong);
if (!fLocalityList.isReady()) {
return ConfigurationResult::DATABASE_UNAVAILABLE;
}
RangeResult localityList = fLocalityList.get();
ASSERT(!localityList.more && localityList.size() < CLIENT_KNOBS->TOO_MANY);
std::set<Key> localityDcIds;
for (auto& s : localityList) {
auto dc = decodeTagLocalityListKey(s.key);
if (dc.present()) {
localityDcIds.insert(dc.get());
}
}
for (auto& it : newConfig.regions) {
if (localityDcIds.count(it.dcId) == 0) {
return ConfigurationResult::DCID_MISSING;
}
}
} else {
// all regions with priority >= 0 must be fully replicated
state std::vector<typename DB::TransactionT::template FutureT<Optional<Value>>>
replicasFuturesF;
state std::vector<Future<Optional<Value>>> replicasFutures;
for (auto& it : newConfig.regions) {
if (it.priority >= 0) {
replicasFuturesF.push_back(tr->get(datacenterReplicasKeyFor(it.dcId)));
replicasFutures.push_back(safeThreadFutureToFuture(replicasFuturesF.back()));
}
}
wait(waitForAll(replicasFutures) || tooLong);
for (auto& it : replicasFutures) {
if (!it.isReady()) {
return ConfigurationResult::DATABASE_UNAVAILABLE;
}
if (!it.get().present()) {
return ConfigurationResult::REGION_NOT_FULLY_REPLICATED;
}
}
}
}
if (newConfig.regions.size()) {
// all storage servers must be in one of the regions
wait(success(fServerList) || tooLong);
if (!fServerList.isReady()) {
return ConfigurationResult::DATABASE_UNAVAILABLE;
}
RangeResult serverList = fServerList.get();
ASSERT(!serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY);
std::set<Key> newDcIds;
for (auto& it : newConfig.regions) {
newDcIds.insert(it.dcId);
}
std::set<Optional<Key>> missingDcIds;
for (auto& s : serverList) {
auto ssi = decodeServerListValue(s.value);
if (!ssi.locality.dcId().present() || !newDcIds.count(ssi.locality.dcId().get())) {
missingDcIds.insert(ssi.locality.dcId());
}
}
if (missingDcIds.size() > (oldReplicationUsesDcId ? 1 : 0)) {
return ConfigurationResult::STORAGE_IN_UNKNOWN_DCID;
}
}
wait(success(fWorkers) || tooLong);
if (!fWorkers.isReady()) {
return ConfigurationResult::DATABASE_UNAVAILABLE;
}
if (newConfig.regions.size()) {
std::map<Optional<Key>, std::set<Optional<Key>>> dcId_zoneIds;
for (auto& it : fWorkers.get()) {
if (it.processClass.machineClassFitness(ProcessClass::Storage) <= ProcessClass::WorstFit) {
dcId_zoneIds[it.locality.dcId()].insert(it.locality.zoneId());
}
}
for (auto& region : newConfig.regions) {
if (dcId_zoneIds[region.dcId].size() <
std::max(newConfig.storageTeamSize, newConfig.tLogReplicationFactor)) {
return ConfigurationResult::NOT_ENOUGH_WORKERS;
}
if (region.satelliteTLogReplicationFactor > 0 && region.priority >= 0) {
int totalSatelliteProcesses = 0;
for (auto& sat : region.satellites) {
totalSatelliteProcesses += dcId_zoneIds[sat.dcId].size();
}
if (totalSatelliteProcesses < region.satelliteTLogReplicationFactor) {
return ConfigurationResult::NOT_ENOUGH_WORKERS;
}
}
}
} else {
std::set<Optional<Key>> zoneIds;
for (auto& it : fWorkers.get()) {
if (it.processClass.machineClassFitness(ProcessClass::Storage) <= ProcessClass::WorstFit) {
zoneIds.insert(it.locality.zoneId());
}
}
if (zoneIds.size() < std::max(newConfig.storageTeamSize, newConfig.tLogReplicationFactor)) {
return ConfigurationResult::NOT_ENOUGH_WORKERS;
}
}
if (newConfig.storageServerStoreType != oldConfig.storageServerStoreType &&
newConfig.storageMigrationType == StorageMigrationType::DISABLED) {
return ConfigurationResult::STORAGE_MIGRATION_DISABLED;
} else if (newConfig.storageMigrationType == StorageMigrationType::GRADUAL &&
newConfig.perpetualStorageWiggleSpeed == 0) {
warnPPWGradual = true;
}
}
}
if (creating) {
tr->setOption(FDBTransactionOptions::INITIALIZE_NEW_DATABASE);
tr->addReadConflictRange(singleKeyRange(initIdKey));
} else if (m.size()) {
// might be used in an emergency transaction, so make sure it is retry-self-conflicting and
// CAUSAL_WRITE_RISKY
tr->setOption(FDBTransactionOptions::CAUSAL_WRITE_RISKY);
tr->addReadConflictRange(singleKeyRange(m.begin()->first));
}
if (locked.present()) {
ASSERT(creating);
tr->atomicOp(databaseLockedKey,
BinaryWriter::toValue(locked.get(), Unversioned())
.withPrefix(LiteralStringRef("0123456789"))
.withSuffix(LiteralStringRef("\x00\x00\x00\x00")),
MutationRef::SetVersionstampedValue);
}
for (auto i = m.begin(); i != m.end(); ++i) {
tr->set(StringRef(i->first), StringRef(i->second));
}
tr->addReadConflictRange(singleKeyRange(moveKeysLockOwnerKey));
tr->set(moveKeysLockOwnerKey, versionKey);
wait(safeThreadFutureToFuture(tr->commit()));
break;
} catch (Error& e) {
state Error e1(e);
if ((e.code() == error_code_not_committed || e.code() == error_code_transaction_too_old) && creating) {
// The database now exists. Determine whether we created it ourselves or whether it already
// existed (i.e. was created by someone else); the latter is an error.
tr->reset();
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
state typename DB::TransactionT::template FutureT<Optional<Value>> vF = tr->get(initIdKey);
Optional<Value> v = wait(safeThreadFutureToFuture(vF));
if (v != m[initIdKey.toString()])
return ConfigurationResult::DATABASE_ALREADY_CREATED;
else
return ConfigurationResult::DATABASE_CREATED;
} catch (Error& e2) {
wait(safeThreadFutureToFuture(tr->onError(e2)));
}
}
}
wait(safeThreadFutureToFuture(tr->onError(e1)));
}
}
if (warnPPWGradual) {
return ConfigurationResult::SUCCESS_WARN_PPW_GRADUAL;
} else {
return ConfigurationResult::SUCCESS;
}
}
ACTOR template <class DB>
Future<ConfigurationResult> autoConfig(Reference<DB> db, ConfigureAutoResult conf) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
state Key versionKey = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned());
if (!conf.address_class.size())
return ConfigurationResult::INCOMPLETE_CONFIGURATION; // FIXME: correct return type
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
state typename DB::TransactionT::template FutureT<RangeResult> processClassesF;
state typename DB::TransactionT::template FutureT<RangeResult> processDataF;
std::vector<ProcessData> workers = wait(getWorkers(tr, processClassesF, processDataF));
std::map<NetworkAddress, Optional<Standalone<StringRef>>> address_processId;
for (auto& w : workers) {
address_processId[w.address] = w.locality.processId();
}
for (auto& it : conf.address_class) {
if (it.second.classSource() == ProcessClass::CommandLineSource) {
tr->clear(processClassKeyFor(address_processId[it.first].get()));
} else {
tr->set(processClassKeyFor(address_processId[it.first].get()), processClassValue(it.second));
}
}
if (conf.address_class.size())
tr->set(processClassChangeKey, deterministicRandom()->randomUniqueID().toString());
if (conf.auto_logs != conf.old_logs)
tr->set(configKeysPrefix.toString() + "auto_logs", format("%d", conf.auto_logs));
if (conf.auto_commit_proxies != conf.old_commit_proxies)
tr->set(configKeysPrefix.toString() + "auto_commit_proxies", format("%d", conf.auto_commit_proxies));
if (conf.auto_grv_proxies != conf.old_grv_proxies)
tr->set(configKeysPrefix.toString() + "auto_grv_proxies", format("%d", conf.auto_grv_proxies));
if (conf.auto_resolvers != conf.old_resolvers)
tr->set(configKeysPrefix.toString() + "auto_resolvers", format("%d", conf.auto_resolvers));
if (conf.auto_replication != conf.old_replication) {
std::vector<StringRef> modes;
modes.push_back(conf.auto_replication);
std::map<std::string, std::string> m;
auto r = buildConfiguration(modes, m);
if (r != ConfigurationResult::SUCCESS)
return r;
for (auto& kv : m)
tr->set(kv.first, kv.second);
}
tr->addReadConflictRange(singleKeyRange(moveKeysLockOwnerKey));
tr->set(moveKeysLockOwnerKey, versionKey);
wait(safeThreadFutureToFuture(tr->commit()));
return ConfigurationResult::SUCCESS;
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
// Accepts tokens separated by spaces in a single string
template <class DB>
Future<ConfigurationResult> changeConfig(Reference<DB> db, std::string const& modes, bool force) {
TraceEvent("ChangeConfig").detail("Mode", modes);
std::map<std::string, std::string> m;
auto r = buildConfiguration(modes, m);
if (r != ConfigurationResult::SUCCESS)
return r;
return changeConfig(db, m, force);
}
// Accepts a vector of configuration tokens
template <class DB>
Future<ConfigurationResult> changeConfig(Reference<DB> db,
std::vector<StringRef> const& modes,
Optional<ConfigureAutoResult> const& conf,
bool force) {
if (modes.size() && modes[0] == LiteralStringRef("auto") && conf.present()) {
return autoConfig(db, conf.get());
}
std::map<std::string, std::string> m;
auto r = buildConfiguration(modes, m);
if (r != ConfigurationResult::SUCCESS)
return r;
return changeConfig(db, m, force);
}
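// Usage sketch (illustrative only; `db` is an assumed database reference and "triple ssd" is
// just an example mode string):
//
//   ConfigurationResult r = wait(changeConfig(db, "triple ssd", /*force=*/false));
//   if (r == ConfigurationResult::SUCCESS_WARN_PPW_GRADUAL) {
//       // applied, but perpetual storage wiggle is off while a gradual migration is configured
//   }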
// return the corresponding error message for the CoordinatorsResult
// used by special keys and fdbcli
std::string generateErrorMessage(const CoordinatorsResult& res);
} // namespace ManagementAPI
#include "flow/unactorcompiler.h"
#endif


@ -54,7 +54,7 @@ FDB_DEFINE_BOOLEAN_PARAM(ConnectionStringNeedsPersisted);
// Returns the connection string currently held in this object. This may not match the stored record if it hasn't
// been persisted or if the persistent storage for the record has been modified externally.
ClusterConnectionString const& IClusterConnectionRecord::getConnectionString() const {
ClusterConnectionString& IClusterConnectionRecord::getConnectionString() {
return cs;
}
@ -77,6 +77,18 @@ void IClusterConnectionRecord::setPersisted() {
connectionStringNeedsPersisted = false;
}
bool IClusterConnectionRecord::hasUnresolvedHostnames() const {
return cs.hasUnresolvedHostnames;
}
Future<Void> IClusterConnectionRecord::resolveHostnames() {
return cs.resolveHostnames();
}
void IClusterConnectionRecord::resolveHostnamesBlocking() {
cs.resolveHostnamesBlocking();
}
std::string ClusterConnectionString::getErrorString(std::string const& source, Error const& e) {
if (e.code() == error_code_connection_string_invalid) {
return format("Invalid connection string `%s: %d %s", source.c_str(), e.code(), e.what());
@ -85,28 +97,109 @@ std::string ClusterConnectionString::getErrorString(std::string const& source, E
}
}
ClusterConnectionString::ClusterConnectionString(std::string const& connectionString) {
auto trimmed = trim(connectionString);
// Split on '@' into key@addrs
int pAt = trimmed.find_first_of('@');
if (pAt == trimmed.npos)
throw connection_string_invalid();
std::string key = trimmed.substr(0, pAt);
std::string addrs = trimmed.substr(pAt + 1);
parseKey(key);
coord = NetworkAddress::parseList(addrs);
ASSERT(coord.size() > 0); // parseList() always returns at least one address if it doesn't throw
std::sort(coord.begin(), coord.end());
// Check that there are no duplicate addresses
if (std::unique(coord.begin(), coord.end()) != coord.end())
ACTOR Future<Void> resolveHostnamesImpl(ClusterConnectionString* self) {
std::vector<Future<Void>> fs;
for (auto const& hostName : self->hostnames) {
fs.push_back(map(INetworkConnections::net()->resolveTCPEndpoint(hostName.host, hostName.service),
[=](std::vector<NetworkAddress> const& addresses) -> Void {
NetworkAddress addr = addresses[deterministicRandom()->randomInt(0, addresses.size())];
addr.flags = 0; // Reset the parsed address to public
addr.fromHostname = NetworkAddressFromHostname::True;
if (hostName.isTLS) {
addr.flags |= NetworkAddress::FLAG_TLS;
}
self->addResolved(hostName, addr);
return Void();
}));
}
wait(waitForAll(fs));
std::sort(self->coords.begin(), self->coords.end());
if (std::unique(self->coords.begin(), self->coords.end()) != self->coords.end()) {
throw connection_string_invalid();
}
self->hasUnresolvedHostnames = false;
return Void();
}
TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/basic") {
Future<Void> ClusterConnectionString::resolveHostnames() {
if (!hasUnresolvedHostnames) {
return Void();
} else {
return resolveHostnamesImpl(this);
}
}
void ClusterConnectionString::resolveHostnamesBlocking() {
if (hasUnresolvedHostnames) {
for (auto const& hostname : hostnames) {
std::vector<NetworkAddress> addresses =
INetworkConnections::net()->resolveTCPEndpointBlocking(hostname.host, hostname.service);
NetworkAddress address = addresses[deterministicRandom()->randomInt(0, addresses.size())];
address.flags = 0; // Reset the parsed address to public
address.fromHostname = NetworkAddressFromHostname::True;
if (hostname.isTLS) {
address.flags |= NetworkAddress::FLAG_TLS;
}
coords.push_back(address);
networkAddressToHostname.emplace(address, hostname);
}
std::sort(coords.begin(), coords.end());
if (std::unique(coords.begin(), coords.end()) != coords.end()) {
throw connection_string_invalid();
}
hasUnresolvedHostnames = false;
}
}
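// Usage sketch (illustrative only): callers are expected to resolve hostnames before they
// consume the coordinator list, e.g.
//
//   if (cs.hasUnresolvedHostnames) {
//       wait(cs.resolveHostnames());
//   }
//   std::vector<NetworkAddress> coordinators = cs.coordinators();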
void ClusterConnectionString::resetToUnresolved() {
if (hostnames.size() > 0) {
coords.clear();
hostnames.clear();
networkAddressToHostname.clear();
hasUnresolvedHostnames = true;
parseConnString();
}
}
void ClusterConnectionString::parseConnString() {
// Split on '@' into key@addrs
int pAt = connectionString.find_first_of('@');
if (pAt == connectionString.npos) {
throw connection_string_invalid();
}
std::string key = connectionString.substr(0, pAt);
std::string addrs = connectionString.substr(pAt + 1);
parseKey(key);
std::string curAddr;
for (int p = 0; p <= addrs.size();) {
int pComma = addrs.find_first_of(',', p);
if (pComma == addrs.npos)
pComma = addrs.size();
curAddr = addrs.substr(p, pComma - p);
if (Hostname::isHostname(curAddr)) {
hostnames.push_back(Hostname::parse(curAddr));
} else {
coords.push_back(NetworkAddress::parse(curAddr));
}
p = pComma + 1;
}
hasUnresolvedHostnames = hostnames.size() > 0;
ASSERT((coords.size() + hostnames.size()) > 0);
std::sort(coords.begin(), coords.end());
// Check that there are no duplicate addresses
if (std::unique(coords.begin(), coords.end()) != coords.end()) {
throw connection_string_invalid();
}
}
ClusterConnectionString::ClusterConnectionString(const std::string& connStr) {
connectionString = trim(connStr);
parseConnString();
}
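// For reference, parseConnString() accepts literal addresses and hostnames mixed in one
// list, e.g. (hypothetical values):
//
//   "TestCluster:0@127.0.0.1:4500,host-name.example.com:4501:tls"
//
// Literal addresses land in `coords` immediately; hostnames stay in `hostnames` until
// resolveHostnames() or resolveHostnamesBlocking() runs.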
TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/addresses") {
std::string input;
{
@ -157,6 +250,97 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/basic") {
return Void();
}
TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/hostnames") {
std::string input;
{
input = "asdf:2345@localhost:1234";
ClusterConnectionString cs(input);
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.hostnames.size() == 1);
ASSERT(input == cs.toString());
}
{
input = "0xxdeadbeef:100100100@localhost:34534,host-name:23443";
ClusterConnectionString cs(input);
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.hostnames.size() == 2);
ASSERT(input == cs.toString());
}
{
input = "0xxdeadbeef:100100100@localhost:34534,host-name:23443";
std::string commented("#start of comment\n");
commented += input;
commented += "\n";
commented += "# asdfasdf ##";
ClusterConnectionString cs(commented);
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.hostnames.size() == 2);
ASSERT(input == cs.toString());
}
{
input = "0xxdeadbeef:100100100@localhost:34534,host-name_part1.host-name_part2:1234:tls";
std::string commented("#start of comment\n");
commented += input;
commented += "\n";
commented += "# asdfasdf ##";
ClusterConnectionString cs(commented);
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.hostnames.size() == 2);
ASSERT(input == cs.toString());
}
return Void();
}
TEST_CASE("/fdbclient/MonitorLeader/ConnectionString") {
state std::string connectionString = "TestCluster:0@localhost:1234,host-name:5678";
std::string hn1 = "localhost", port1 = "1234";
state std::string hn2 = "host-name";
state std::string port2 = "5678";
state std::vector<Hostname> hostnames;
hostnames.push_back(Hostname::parse(hn1 + ":" + port1));
hostnames.push_back(Hostname::parse(hn2 + ":" + port2));
NetworkAddress address1 = NetworkAddress::parse("127.0.0.0:1234");
NetworkAddress address2 = NetworkAddress::parse("127.0.0.1:5678");
INetworkConnections::net()->addMockTCPEndpoint(hn1, port1, { address1 });
INetworkConnections::net()->addMockTCPEndpoint(hn2, port2, { address2 });
state ClusterConnectionString cs(hostnames, LiteralStringRef("TestCluster:0"));
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.hostnames.size() == 2);
ASSERT(cs.coordinators().size() == 0);
wait(cs.resolveHostnames());
ASSERT(!cs.hasUnresolvedHostnames);
ASSERT(cs.hostnames.size() == 2);
ASSERT(cs.coordinators().size() == 2);
ASSERT(cs.toString() == connectionString);
cs.resetToUnresolved();
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.hostnames.size() == 2);
ASSERT(cs.coordinators().size() == 0);
ASSERT(cs.toString() == connectionString);
INetworkConnections::net()->removeMockTCPEndpoint(hn2, port2);
NetworkAddress address3 = NetworkAddress::parse("127.0.0.0:5678");
INetworkConnections::net()->addMockTCPEndpoint(hn2, port2, { address3 });
try {
wait(cs.resolveHostnames());
} catch (Error& e) {
ASSERT(e.code() == error_code_connection_string_invalid);
}
return Void();
}
TEST_CASE("/flow/FlatBuffers/LeaderInfo") {
{
LeaderInfo in;
@ -237,29 +421,56 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/fuzz") {
return Void();
}
ClusterConnectionString::ClusterConnectionString(std::vector<NetworkAddress> servers, Key key) : coord(servers) {
parseKey(key.toString());
ClusterConnectionString::ClusterConnectionString(const std::vector<NetworkAddress>& servers, Key key)
: coords(servers) {
std::string keyString = key.toString();
parseKey(keyString);
connectionString = keyString + "@";
for (int i = 0; i < coords.size(); i++) {
if (i) {
connectionString += ',';
}
connectionString += coords[i].toString();
}
}
void ClusterConnectionString::parseKey(std::string const& key) {
ClusterConnectionString::ClusterConnectionString(const std::vector<Hostname>& hosts, Key key)
: hasUnresolvedHostnames(true), hostnames(hosts) {
std::string keyString = key.toString();
parseKey(keyString);
connectionString = keyString + "@";
for (int i = 0; i < hostnames.size(); i++) {
if (i) {
connectionString += ',';
}
connectionString += hostnames[i].toString();
}
}
void ClusterConnectionString::parseKey(const std::string& key) {
// Check the structure of the given key, and fill in this->key and this->keyDesc
// The key must contain one (and only one) : character
int colon = key.find_first_of(':');
if (colon == key.npos)
if (colon == key.npos) {
throw connection_string_invalid();
}
std::string desc = key.substr(0, colon);
std::string id = key.substr(colon + 1);
// Check that description contains only allowed characters (a-z, A-Z, 0-9, _)
for (auto c = desc.begin(); c != desc.end(); ++c)
if (!(isalnum(*c) || *c == '_'))
for (auto c = desc.begin(); c != desc.end(); ++c) {
if (!(isalnum(*c) || *c == '_')) {
throw connection_string_invalid();
}
}
// Check that ID contains only allowed characters (a-z, A-Z, 0-9)
for (auto c = id.begin(); c != id.end(); ++c)
if (!isalnum(*c))
for (auto c = id.begin(); c != id.end(); ++c) {
if (!isalnum(*c)) {
throw connection_string_invalid();
}
}
this->key = StringRef(key);
this->keyDesc = StringRef(desc);
@ -268,11 +479,19 @@ void ClusterConnectionString::parseKey(std::string const& key) {
std::string ClusterConnectionString::toString() const {
std::string s = key.toString();
s += '@';
for (int i = 0; i < coord.size(); i++) {
if (i) {
for (int i = 0; i < coords.size(); i++) {
if (networkAddressToHostname.find(coords[i]) == networkAddressToHostname.end()) {
if (s.find('@') != s.length() - 1) {
s += ',';
}
s += coord[i].toString();
s += coords[i].toString();
}
}
for (auto const& host : hostnames) {
if (s.find('@') != s.length() - 1) {
s += ',';
}
s += host.toString();
}
return s;
}
@ -426,7 +645,7 @@ ACTOR Future<MonitorLeaderInfo> monitorLeaderOneGeneration(Reference<IClusterCon
.detail("CurrentConnectionString",
info.intermediateConnRecord->getConnectionString().toString());
}
connRecord->setConnectionString(info.intermediateConnRecord->getConnectionString());
connRecord->setAndPersistConnectionString(info.intermediateConnRecord->getConnectionString());
info.intermediateConnRecord = connRecord;
}
@ -669,7 +888,7 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
Key traceLogGroup) {
state ClusterConnectionString cs = info.intermediateConnRecord->getConnectionString();
state std::vector<NetworkAddress> addrs = cs.coordinators();
state int idx = 0;
state int index = 0;
state int successIndex = 0;
state Optional<double> incorrectTime;
state std::vector<UID> lastCommitProxyUIDs;
@ -679,7 +898,7 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
deterministicRandom()->randomShuffle(addrs);
loop {
state ClientLeaderRegInterface clientLeaderServer(addrs[idx]);
state ClientLeaderRegInterface clientLeaderServer(addrs[index]);
state OpenDatabaseCoordRequest req;
coordinator->set(clientLeaderServer);
@ -732,7 +951,7 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
.detail("CurrentConnectionString",
info.intermediateConnRecord->getConnectionString().toString());
}
connRecord->setConnectionString(info.intermediateConnRecord->getConnectionString());
connRecord->setAndPersistConnectionString(info.intermediateConnRecord->getConnectionString());
info.intermediateConnRecord = connRecord;
}
@ -742,11 +961,11 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
auto& ni = rep.get().mutate();
shrinkProxyList(ni, lastCommitProxyUIDs, lastCommitProxies, lastGrvProxyUIDs, lastGrvProxies);
clientInfo->set(ni);
successIndex = idx;
successIndex = index;
} else {
TEST(rep.getError().code() == error_code_failed_to_progress); // Coordinator can't talk to cluster controller
idx = (idx + 1) % addrs.size();
if (idx == successIndex) {
index = (index + 1) % addrs.size();
if (index == successIndex) {
wait(delay(CLIENT_KNOBS->COORDINATOR_RECONNECTION_DELAY));
}
}


@ -2529,12 +2529,12 @@ THREAD_FUNC runSingleAssignmentVarTest(void* arg) {
if (deterministicRandom()->coinflip()) {
if (deterministicRandom()->coinflip()) {
threads.push_back(g_network->startThread(releaseMem, tfp));
threads.push_back(g_network->startThread(releaseMem, tfp, 0, "fdb-release-mem"));
}
threads.push_back(g_network->startThread(cancel, tfp));
threads.push_back(g_network->startThread(cancel, tfp, 0, "fdb-cancel"));
undestroyed.push_back((ThreadSingleAssignmentVar<int>*)tfp);
} else {
threads.push_back(g_network->startThread(destroy, tfp));
threads.push_back(g_network->startThread(destroy, tfp, 0, "fdb-destroy"));
}
}
@ -2568,7 +2568,7 @@ struct AbortableTest {
if (!abort->isReady() && deterministicRandom()->coinflip()) {
ASSERT_EQ(abort->status, ThreadSingleAssignmentVarBase::Unset);
newFuture.threads.push_back(g_network->startThread(setAbort, abort));
newFuture.threads.push_back(g_network->startThread(setAbort, abort, 0, "fdb-abort"));
}
newFuture.legalErrors.insert(error_code_cluster_version_changed);


@ -732,15 +732,12 @@ Future<Void> attemptGRVFromOldProxies(std::vector<GrvProxyInterface> oldProxies,
ACTOR static Future<Void> monitorClientDBInfoChange(DatabaseContext* cx,
Reference<AsyncVar<ClientDBInfo> const> clientDBInfo,
AsyncTrigger* proxyChangeTrigger,
AsyncTrigger* clientLibChangeTrigger) {
AsyncTrigger* proxyChangeTrigger) {
state std::vector<CommitProxyInterface> curCommitProxies;
state std::vector<GrvProxyInterface> curGrvProxies;
state ActorCollection actors(false);
state uint64_t curClientLibChangeCounter;
curCommitProxies = clientDBInfo->get().commitProxies;
curGrvProxies = clientDBInfo->get().grvProxies;
curClientLibChangeCounter = clientDBInfo->get().clientLibChangeCounter;
loop {
choose {
@ -763,9 +760,6 @@ ACTOR static Future<Void> monitorClientDBInfoChange(DatabaseContext* cx,
curGrvProxies = clientDBInfo->get().grvProxies;
proxyChangeTrigger->trigger();
}
if (curClientLibChangeCounter != clientDBInfo->get().clientLibChangeCounter) {
clientLibChangeTrigger->trigger();
}
}
when(wait(actors.getResult())) { UNSTOPPABLE_ASSERT(false); }
}
@ -1255,7 +1249,7 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<IClusterConnection
getValueSubmitted.init(LiteralStringRef("NativeAPI.GetValueSubmitted"));
getValueCompleted.init(LiteralStringRef("NativeAPI.GetValueCompleted"));
clientDBInfoMonitor = monitorClientDBInfoChange(this, clientInfo, &proxiesChangeTrigger, &clientLibChangeTrigger);
clientDBInfoMonitor = monitorClientDBInfoChange(this, clientInfo, &proxiesChangeTrigger);
tssMismatchHandler = handleTssMismatches(this);
clientStatusUpdater.actor = clientStatusUpdateActor(this);
cacheListMonitor = monitorCacheList(this);
@ -1606,10 +1600,6 @@ Future<Void> DatabaseContext::onProxiesChanged() const {
return this->proxiesChangeTrigger.onTrigger();
}
Future<Void> DatabaseContext::onClientLibStatusChanged() const {
return this->clientLibChangeTrigger.onTrigger();
}
bool DatabaseContext::sampleReadTags() const {
double sampleRate = GlobalConfig::globalConfig().get(transactionTagSampleRate, CLIENT_KNOBS->READ_TAG_SAMPLE_RATE);
return sampleRate > 0 && deterministicRandom()->random01() <= sampleRate;
@ -1791,7 +1781,7 @@ void DatabaseContext::expireThrottles() {
}
}
extern IPAddress determinePublicIPAutomatically(ClusterConnectionString const& ccs);
extern IPAddress determinePublicIPAutomatically(ClusterConnectionString& ccs);
// Creates a database object that represents a connection to a cluster
// This constructor uses a preallocated DatabaseContext that may have been created
@ -1804,6 +1794,8 @@ Database Database::createDatabase(Reference<IClusterConnectionRecord> connRecord
if (!g_network)
throw network_not_setup();
ASSERT(TraceEvent::isNetworkThread());
platform::ImageInfo imageInfo = platform::getImageInfo();
if (connRecord) {
@ -1815,6 +1807,12 @@ Database Database::createDatabase(Reference<IClusterConnectionRecord> connRecord
auto publicIP = determinePublicIPAutomatically(connRecord->getConnectionString());
selectTraceFormatter(networkOptions.traceFormat);
selectTraceClockSource(networkOptions.traceClockSource);
addUniversalTraceField("ClientDescription",
format("%s-%s-%" PRIu64,
networkOptions.primaryClient ? "primary" : "external",
FDB_VT_VERSION,
getTraceThreadId()));
openTraceFile(NetworkAddress(publicIP, ::getpid()),
networkOptions.traceRollSize,
networkOptions.traceMaxLogsSize,
@ -2626,6 +2624,42 @@ ACTOR Future<Void> warmRange_impl(Reference<TransactionState> trState, KeyRange
return Void();
}
SpanID generateSpanID(bool transactionTracingSample, SpanID parentContext = SpanID()) {
uint64_t txnId = deterministicRandom()->randomUInt64();
if (parentContext.isValid()) {
if (parentContext.first() > 0) {
txnId = parentContext.first();
}
uint64_t tokenId = parentContext.second() > 0 ? deterministicRandom()->randomUInt64() : 0;
return SpanID(txnId, tokenId);
} else if (transactionTracingSample) {
uint64_t tokenId = deterministicRandom()->random01() <= FLOW_KNOBS->TRACING_SAMPLE_RATE
? deterministicRandom()->randomUInt64()
: 0;
return SpanID(txnId, tokenId);
} else {
return SpanID(txnId, 0);
}
}
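// Behavior notes: with a valid parent context the transaction ID is inherited from the
// parent, and a nonzero token (a traced span) is generated only if the parent itself was
// traced; without a parent, roughly TRACING_SAMPLE_RATE of transactions receive a nonzero
// token when sampling is enabled.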
Reference<TransactionState> TransactionState::cloneAndReset(Reference<TransactionLogInfo> newTrLogInfo,
bool generateNewSpan) const {
SpanID newSpanID = generateNewSpan ? generateSpanID(cx->transactionTracingSample) : spanID;
Reference<TransactionState> newState = makeReference<TransactionState>(cx, cx->taskID, newSpanID, newTrLogInfo);
if (!cx->apiVersionAtLeast(16)) {
newState->options = options;
}
newState->numErrors = numErrors;
newState->startTime = startTime;
newState->committedVersion = committedVersion;
newState->conflictingKeys = conflictingKeys;
return newState;
}
Future<Void> Transaction::warmRange(KeyRange keys) {
return warmRange_impl(trState, keys);
}
@ -4273,24 +4307,6 @@ void debugAddTags(Reference<TransactionState> trState) {
}
}
SpanID generateSpanID(bool transactionTracingSample, SpanID parentContext = SpanID()) {
uint64_t txnId = deterministicRandom()->randomUInt64();
if (parentContext.isValid()) {
if (parentContext.first() > 0) {
txnId = parentContext.first();
}
uint64_t tokenId = parentContext.second() > 0 ? deterministicRandom()->randomUInt64() : 0;
return SpanID(txnId, tokenId);
} else if (transactionTracingSample) {
uint64_t tokenId = deterministicRandom()->random01() <= FLOW_KNOBS->TRACING_SAMPLE_RATE
? deterministicRandom()->randomUInt64()
: 0;
return SpanID(txnId, tokenId);
} else {
return SpanID(txnId, 0);
}
}
Transaction::Transaction()
: trState(makeReference<TransactionState>(TaskPriority::DefaultEndpoint, generateSpanID(false))) {}
@ -4923,28 +4939,24 @@ void TransactionOptions::reset(Database const& cx) {
}
}
void Transaction::reset() {
void Transaction::resetImpl(bool generateNewSpan) {
flushTrLogsIfEnabled();
trState = trState->cloneAndReset(createTrLogInfoProbabilistically(trState->cx), generateNewSpan);
tr = CommitTransactionRequest(trState->spanID);
readVersion = Future<Version>();
metadataVersion = Promise<Optional<Key>>();
extraConflictRanges.clear();
commitResult = Promise<Void>();
committing = Future<Void>();
flushTrLogsIfEnabled();
trState->versionstampPromise = Promise<Standalone<StringRef>>();
trState->taskID = trState->cx->taskID;
trState->debugID = Optional<UID>();
trState->trLogInfo = Reference<TransactionLogInfo>(createTrLogInfoProbabilistically(trState->cx));
cancelWatches();
if (apiVersionAtLeast(16)) {
trState->options.reset(trState->cx);
}
void Transaction::reset() {
resetImpl(false);
}
void Transaction::fullReset() {
trState->spanID = generateSpanID(trState->cx->transactionTracingSample);
reset();
resetImpl(true);
span = Span(trState->spanID, "Transaction"_loc);
backoff = CLIENT_KNOBS->DEFAULT_BACKOFF;
}
@ -7260,6 +7272,7 @@ ACTOR Future<KeyRange> getChangeFeedRange(Reference<DatabaseContext> db, Databas
loop {
try {
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Version readVer = wait(tr.getReadVersion());
if (readVer < begin) {
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
@ -7531,6 +7544,7 @@ ACTOR static Future<Void> popChangeFeedBackup(Database cx, Key rangeID, Version
state Transaction tr(cx);
loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix);
Optional<Value> val = wait(tr.get(rangeIDKey));
if (val.present()) {


@ -258,6 +258,8 @@ struct TransactionState : ReferenceCounted<TransactionState> {
TransactionState(Database cx, TaskPriority taskID, SpanID spanID, Reference<TransactionLogInfo> trLogInfo)
: cx(cx), trLogInfo(trLogInfo), options(cx), taskID(taskID), spanID(spanID) {}
Reference<TransactionState> cloneAndReset(Reference<TransactionLogInfo> newTrLogInfo, bool generateNewSpan) const;
};
class Transaction : NonCopyable {
@ -441,6 +443,8 @@ private:
Snapshot snapshot,
Reverse reverse);
void resetImpl(bool generateNewSpan);
double backoff;
CommitTransactionRequest tr;
Future<Version> readVersion;


@ -57,15 +57,16 @@ class CommitQuorum {
ConfigGeneration generation,
ConfigTransactionInterface cti) {
try {
wait(retryBrokenPromise(cti.commit, self->getCommitRequest(generation)));
wait(timeoutError(cti.commit.getReply(self->getCommitRequest(generation)),
CLIENT_KNOBS->COMMIT_QUORUM_TIMEOUT));
++self->successful;
} catch (Error& e) {
// self might be destroyed if this actor is canceled
// self might be destroyed if this actor is cancelled
if (e.code() == error_code_actor_cancelled) {
throw;
}
if (e.code() == error_code_not_committed) {
if (e.code() == error_code_not_committed || e.code() == error_code_timed_out) {
++self->failed;
} else {
++self->maybeCommitted;
@ -117,12 +118,16 @@ class GetGenerationQuorum {
Future<ConfigGeneration> getGenerationFuture;
ACTOR static Future<Void> addRequestActor(GetGenerationQuorum* self, ConfigTransactionInterface cti) {
ConfigTransactionGetGenerationReply reply = wait(
retryBrokenPromise(cti.getGeneration, ConfigTransactionGetGenerationRequest{ self->lastSeenLiveVersion }));
loop {
try {
ConfigTransactionGetGenerationReply reply = wait(timeoutError(
cti.getGeneration.getReply(ConfigTransactionGetGenerationRequest{ self->lastSeenLiveVersion }),
CLIENT_KNOBS->GET_GENERATION_QUORUM_TIMEOUT));
++self->totalRepliesReceived;
auto gen = reply.generation;
self->lastSeenLiveVersion = std::max(gen.liveVersion, self->lastSeenLiveVersion.orDefault(::invalidVersion));
self->lastSeenLiveVersion =
std::max(gen.liveVersion, self->lastSeenLiveVersion.orDefault(::invalidVersion));
auto& replicas = self->seenGenerations[gen];
replicas.push_back(cti);
self->maxAgreement = std::max(replicas.size(), self->maxAgreement);
@ -134,6 +139,22 @@ class GetGenerationQuorum {
self->result.sendError(failed_to_reach_quorum());
}
}
break;
} catch (Error& e) {
if (e.code() == error_code_broken_promise) {
continue;
} else if (e.code() == error_code_timed_out) {
++self->totalRepliesReceived;
if (self->totalRepliesReceived == self->ctis.size() && self->result.canBeSet() &&
!self->result.isError()) {
self->result.sendError(failed_to_reach_quorum());
}
break;
} else {
throw;
}
}
}
return Void();
}
@ -151,9 +172,11 @@ class GetGenerationQuorum {
} catch (Error& e) {
if (e.code() == error_code_failed_to_reach_quorum) {
TEST(true); // Failed to reach quorum getting generation
wait(delayJittered(0.01 * (1 << retries)));
wait(delayJittered(
std::clamp(0.005 * (1 << retries), 0.0, CLIENT_KNOBS->TIMEOUT_RETRY_UPPER_BOUND)));
++retries;
self->actors.clear(false);
self->seenGenerations.clear();
self->result.reset();
self->totalRepliesReceived = 0;
self->maxAgreement = 0;
@ -198,15 +221,26 @@ class PaxosConfigTransactionImpl {
ACTOR static Future<Optional<Value>> get(PaxosConfigTransactionImpl* self, Key key) {
state ConfigKey configKey = ConfigKey::decodeKey(key);
loop {
try {
ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration());
// TODO: Load balance
ConfigTransactionGetReply reply = wait(retryBrokenPromise(
self->getGenerationQuorum.getReadReplicas()[0].get, ConfigTransactionGetRequest{ generation, configKey }));
ConfigTransactionGetReply reply =
wait(timeoutError(self->getGenerationQuorum.getReadReplicas()[0].get.getReply(
ConfigTransactionGetRequest{ generation, configKey }),
CLIENT_KNOBS->GET_KNOB_TIMEOUT));
if (reply.value.present()) {
return reply.value.get().toValue();
} else {
return Optional<Value>{};
}
} catch (Error& e) {
if (e.code() != error_code_timed_out && e.code() != error_code_broken_promise) {
throw;
}
self->reset();
}
}
}
ACTOR static Future<RangeResult> getConfigClasses(PaxosConfigTransactionImpl* self) {
@ -248,7 +282,9 @@ class PaxosConfigTransactionImpl {
// TODO: Improve this:
TraceEvent("ConfigIncrementOnError").error(e).detail("NumRetries", self->numRetries);
if (e.code() == error_code_transaction_too_old || e.code() == error_code_not_committed) {
wait(delay((1 << self->numRetries++) * 0.01 * deterministicRandom()->random01()));
wait(delay(std::clamp((1 << self->numRetries++) * 0.01 * deterministicRandom()->random01(),
0.0,
CLIENT_KNOBS->TIMEOUT_RETRY_UPPER_BOUND)));
self->reset();
return Void();
}


@ -24,9 +24,15 @@
#include "fdbclient/libb64/encode.h"
#include "fdbclient/sha1/SHA1.h"
#include <time.h>
#include <iomanip>
#include <openssl/sha.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string.hpp>
#include "fdbrpc/IAsyncFile.h"
#include "flow/UnitTest.h"
#include "fdbclient/rapidxml/rapidxml.hpp"
#include "flow/actorcompiler.h" // has to be last include
@ -246,8 +252,9 @@ Reference<S3BlobStoreEndpoint> S3BlobStoreEndpoint::fromString(std::string const
if (cred.present()) {
StringRef c(cred.get());
StringRef key = c.eat(":");
StringRef secret = c.eat();
creds = S3BlobStoreEndpoint::Credentials{ key.toString(), secret.toString() };
StringRef secret = c.eat(":");
StringRef securityToken = c.eat();
creds = S3BlobStoreEndpoint::Credentials{ key.toString(), secret.toString(), securityToken.toString() };
}
return makeReference<S3BlobStoreEndpoint>(host.toString(), service.toString(), creds, knobs, extraHeaders);
@ -274,10 +281,18 @@ std::string S3BlobStoreEndpoint::getResourceURL(std::string resource, std::strin
// If secret isn't being looked up from credentials files then it was passed explicitly in the URL so show it here.
std::string credsString;
if (credentials.present()) {
if (!lookupKey) {
credsString = credentials.get().key;
}
if (!lookupSecret) {
credsString += ":" + credentials.get().secret;
}
if (!lookupSecret) {
credsString +=
credentials.get().securityToken.empty()
? std::string(":") + credentials.get().secret
: std::string(":") + credentials.get().secret + std::string(":") + credentials.get().securityToken;
}
credsString += "@";
}
@ -506,7 +521,7 @@ ACTOR Future<Void> updateSecret_impl(Reference<S3BlobStoreEndpoint> b) {
wait(waitForAll(reads));
std::string accessKey = b->credentials.get().key;
std::string accessKey = b->lookupKey ? "" : b->credentials.get().key;
std::string credentialsFileKey = accessKey + "@" + b->host;
int invalid = 0;
@ -523,12 +538,26 @@ ACTOR Future<Void> updateSecret_impl(Reference<S3BlobStoreEndpoint> b) {
JSONDoc accounts(doc.last().get_obj());
if (accounts.has(credentialsFileKey, false) && accounts.last().type() == json_spirit::obj_type) {
JSONDoc account(accounts.last());
std::string secret;
// Once we find a matching account, use it.
if (account.tryGet("secret", secret)) {
b->credentials = S3BlobStoreEndpoint::Credentials{ accessKey, secret };
return Void();
S3BlobStoreEndpoint::Credentials creds;
if (b->lookupKey) {
std::string apiKey;
if (account.tryGet("api_key", apiKey))
creds.key = apiKey;
else
continue;
}
if (b->lookupSecret) {
std::string secret;
if (account.tryGet("secret", secret))
creds.secret = secret;
else
continue;
}
std::string token;
if (account.tryGet("token", token))
creds.securityToken = token;
b->credentials = creds;
return Void();
}
}
}
@ -572,7 +601,7 @@ ACTOR Future<S3BlobStoreEndpoint::ReusableConnection> connect_impl(Reference<S3B
.detail("RemoteEndpoint", conn->getPeerAddress())
.detail("ExpiresIn", b->knobs.max_connection_life);
if (b->lookupSecret)
if (b->lookupKey || b->lookupSecret)
wait(b->updateSecret());
return S3BlobStoreEndpoint::ReusableConnection({ conn, now() + b->knobs.max_connection_life });
@ -660,7 +689,13 @@ ACTOR Future<Reference<HTTP::Response>> doRequest_impl(Reference<S3BlobStoreEndp
// Finish/update the request headers (which includes Date header)
// This must be done AFTER the connection is ready because if credentials are coming from disk they are
// refreshed when a new connection is established and setAuthHeaders() would need the updated secret.
if (bstore->credentials.present() && !bstore->credentials.get().securityToken.empty())
headers["x-amz-security-token"] = bstore->credentials.get().securityToken;
if (CLIENT_KNOBS->HTTP_REQUEST_AWS_V4_HEADER) {
bstore->setV4AuthHeaders(verb, resource, headers);
} else {
bstore->setAuthHeaders(verb, resource, headers);
}
remoteAddress = rconn.conn->getPeerAddress();
wait(bstore->requestRate->getAllowance(1));
@ -1084,6 +1119,160 @@ std::string S3BlobStoreEndpoint::hmac_sha1(Credentials const& creds, std::string
return SHA1::from_string(kopad);
}
std::string sha256_hex(std::string str) {
unsigned char hash[SHA256_DIGEST_LENGTH];
SHA256_CTX sha256;
SHA256_Init(&sha256);
SHA256_Update(&sha256, str.c_str(), str.size());
SHA256_Final(hash, &sha256);
std::stringstream ss;
for (int i = 0; i < SHA256_DIGEST_LENGTH; i++) {
ss << std::hex << std::setw(2) << std::setfill('0') << (int)hash[i];
}
return ss.str();
}
std::string hmac_sha256_hex(std::string key, std::string msg) {
unsigned char hash[32];
HMAC_CTX* hmac = HMAC_CTX_new();
HMAC_Init_ex(hmac, &key[0], key.length(), EVP_sha256(), NULL);
HMAC_Update(hmac, (unsigned char*)&msg[0], msg.length());
unsigned int len = 32;
HMAC_Final(hmac, hash, &len);
HMAC_CTX_free(hmac);
std::stringstream ss;
ss << std::hex << std::setfill('0');
for (int i = 0; i < len; i++) {
ss << std::hex << std::setw(2) << (unsigned int)hash[i];
}
return (ss.str());
}
std::string hmac_sha256(std::string key, std::string msg) {
unsigned char hash[32];
HMAC_CTX* hmac = HMAC_CTX_new();
HMAC_Init_ex(hmac, &key[0], key.length(), EVP_sha256(), NULL);
HMAC_Update(hmac, (unsigned char*)&msg[0], msg.length());
unsigned int len = 32;
HMAC_Final(hmac, hash, &len);
HMAC_CTX_free(hmac);
std::stringstream ss;
ss << std::setfill('0');
for (int i = 0; i < len; i++) {
ss << hash[i];
}
return (ss.str());
}
// Date and Time parameters are used for unit testing
void S3BlobStoreEndpoint::setV4AuthHeaders(std::string const& verb,
std::string const& resource,
HTTP::Headers& headers,
std::string date,
std::string datestamp) {
if (!credentials.present()) {
return;
}
Credentials creds = credentials.get();
std::string accessKey = creds.key;
std::string secretKey = creds.secret;
// Create a date for headers and the credential string
std::string amzDate;
std::string dateStamp;
if (date.empty() || datestamp.empty()) {
time_t ts;
time(&ts);
char dateBuf[20];
// ISO 8601 format YYYYMMDD'T'HHMMSS'Z'
strftime(dateBuf, 20, "%Y%m%dT%H%M%SZ", gmtime(&ts));
amzDate = dateBuf;
strftime(dateBuf, 20, "%Y%m%d", gmtime(&ts));
dateStamp = dateBuf;
} else {
amzDate = date;
dateStamp = datestamp;
}
// Extract service and region
StringRef hostRef(host);
std::string service = hostRef.eat(".").toString();
std::string region = hostRef.eat(".").toString();
// ************* TASK 1: CREATE A CANONICAL REQUEST *************
// Create the canonical URI--the part of the URI from domain to query string (use '/' if no path)
StringRef resourceRef(resource);
resourceRef.eat("/");
std::string canonicalURI("/" + resourceRef.toString());
size_t q = canonicalURI.find_last_of('?');
if (q != canonicalURI.npos)
canonicalURI.resize(q);
canonicalURI = HTTP::awsV4URIEncode(canonicalURI, false);
// Create the canonical query string
std::string queryString;
q = resource.find_last_of('?');
if (q != resource.npos)
queryString = resource.substr(q + 1);
std::vector<std::string> queryParameters;
StringRef qStr(queryString);
StringRef queryParameter;
while ((queryParameter = qStr.eat("&")) != StringRef()) {
StringRef param = queryParameter.eat("=");
StringRef value = queryParameter.eat();
queryParameters.push_back(HTTP::awsV4URIEncode(param.toString(), true) + "=" +
HTTP::awsV4URIEncode(value.toString(), true));
}
std::sort(queryParameters.begin(), queryParameters.end());
std::string canonicalQueryString = boost::algorithm::join(queryParameters, "&");
using namespace boost::algorithm;
// Create the canonical headers and signed headers
ASSERT(!headers["Host"].empty());
// Using unsigned payload here and adding content-md5 to the signed headers. It may be better to also include sha256
// sum for added security.
headers["x-amz-content-sha256"] = "UNSIGNED-PAYLOAD";
headers["x-amz-date"] = amzDate;
std::vector<std::pair<std::string, std::string>> headersList;
headersList.push_back({ "host", trim_copy(headers["Host"]) + "\n" });
if (headers.find("Content-Type") != headers.end())
headersList.push_back({ "content-type", trim_copy(headers["Content-Type"]) + "\n" });
if (headers.find("Content-MD5") != headers.end())
headersList.push_back({ "content-md5", trim_copy(headers["Content-MD5"]) + "\n" });
for (auto h : headers) {
if (StringRef(h.first).startsWith(LiteralStringRef("x-amz")))
headersList.push_back({ to_lower_copy(h.first), trim_copy(h.second) + "\n" });
}
std::sort(headersList.begin(), headersList.end());
std::string canonicalHeaders;
std::string signedHeaders;
for (auto& i : headersList) {
canonicalHeaders += i.first + ":" + i.second;
signedHeaders += i.first + ";";
}
signedHeaders.pop_back();
std::string canonicalRequest = verb + "\n" + canonicalURI + "\n" + canonicalQueryString + "\n" + canonicalHeaders +
"\n" + signedHeaders + "\n" + headers["x-amz-content-sha256"];
// ************* TASK 2: CREATE THE STRING TO SIGN *************
std::string algorithm = "AWS4-HMAC-SHA256";
std::string credentialScope = dateStamp + "/" + region + "/" + service + "/" + "aws4_request";
std::string stringToSign =
algorithm + "\n" + amzDate + "\n" + credentialScope + "\n" + sha256_hex(canonicalRequest);
// ************* TASK 3: CALCULATE THE SIGNATURE *************
// Create the signing key using the function defined above.
std::string signingKey = hmac_sha256(
hmac_sha256(hmac_sha256(hmac_sha256("AWS4" + secretKey, dateStamp), region), service), "aws4_request");
// Sign the string_to_sign using the signing_key
std::string signature = hmac_sha256_hex(signingKey, stringToSign);
// ************* TASK 4: ADD SIGNING INFORMATION TO THE HEADER *************
std::string authorizationHeader = algorithm + " " + "Credential=" + accessKey + "/" + credentialScope + ", " +
"SignedHeaders=" + signedHeaders + ", " + "Signature=" + signature;
headers["Authorization"] = authorizationHeader;
}
void S3BlobStoreEndpoint::setAuthHeaders(std::string const& verb, std::string const& resource, HTTP::Headers& headers) {
if (!credentials.present()) {
return;
@ -1173,6 +1362,8 @@ ACTOR Future<Void> writeEntireFileFromBuffer_impl(Reference<S3BlobStoreEndpoint>
HTTP::Headers headers;
// Send MD5 sum for content so blobstore can verify it
headers["Content-MD5"] = contentMD5;
if (!CLIENT_KNOBS->BLOBSTORE_ENCRYPTION_TYPE.empty())
headers["x-amz-server-side-encryption"] = CLIENT_KNOBS->BLOBSTORE_ENCRYPTION_TYPE;
state Reference<HTTP::Response> r =
wait(bstore->doRequest("PUT", resource, headers, pContent, contentLen, { 200 }));
@ -1266,6 +1457,8 @@ ACTOR static Future<std::string> beginMultiPartUpload_impl(Reference<S3BlobStore
std::string resource = std::string("/") + bucket + "/" + object + "?uploads";
HTTP::Headers headers;
if (!CLIENT_KNOBS->BLOBSTORE_ENCRYPTION_TYPE.empty())
headers["x-amz-server-side-encryption"] = CLIENT_KNOBS->BLOBSTORE_ENCRYPTION_TYPE;
Reference<HTTP::Response> r = wait(bstore->doRequest("POST", resource, headers, nullptr, 0, { 200 }));
try {
@ -1376,3 +1569,56 @@ Future<Void> S3BlobStoreEndpoint::finishMultiPartUpload(std::string const& bucke
MultiPartSetT const& parts) {
return finishMultiPartUpload_impl(Reference<S3BlobStoreEndpoint>::addRef(this), bucket, object, uploadID, parts);
}
TEST_CASE("/backup/s3/v4headers") {
S3BlobStoreEndpoint::Credentials creds{ "AKIAIOSFODNN7EXAMPLE", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", "" };
// GET without query parameters
{
S3BlobStoreEndpoint s3("s3.amazonaws.com", "s3", creds);
std::string verb("GET");
std::string resource("/test.txt");
HTTP::Headers headers;
headers["Host"] = "s3.amazonaws.com";
s3.setV4AuthHeaders(verb, resource, headers, "20130524T000000Z", "20130524");
ASSERT(headers["Authorization"] ==
"AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20130524/amazonaws/s3/aws4_request, "
"SignedHeaders=host;x-amz-content-sha256;x-amz-date, "
"Signature=c6037f4b174f2019d02d7085a611cef8adfe1efe583e220954dc85d59cd31ba3");
ASSERT(headers["x-amz-date"] == "20130524T000000Z");
}
// GET with query parameters
{
S3BlobStoreEndpoint s3("s3.amazonaws.com", "s3", creds);
std::string verb("GET");
std::string resource("/test/examplebucket?Action=DescribeRegions&Version=2013-10-15");
HTTP::Headers headers;
headers["Host"] = "s3.amazonaws.com";
s3.setV4AuthHeaders(verb, resource, headers, "20130524T000000Z", "20130524");
ASSERT(headers["Authorization"] ==
"AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20130524/amazonaws/s3/aws4_request, "
"SignedHeaders=host;x-amz-content-sha256;x-amz-date, "
"Signature=426f04e71e191fbc30096c306fe1b11ce8f026a7be374541862bbee320cce71c");
ASSERT(headers["x-amz-date"] == "20130524T000000Z");
}
// POST
{
S3BlobStoreEndpoint s3("s3.us-west-2.amazonaws.com", "s3", creds);
std::string verb("POST");
std::string resource("/simple.json");
HTTP::Headers headers;
headers["Host"] = "s3.us-west-2.amazonaws.com";
headers["Content-Type"] = "Application/x-amz-json-1.0";
s3.setV4AuthHeaders(verb, resource, headers, "20130524T000000Z", "20130524");
ASSERT(headers["Authorization"] ==
"AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20130524/us-west-2/s3/aws4_request, "
"SignedHeaders=content-type;host;x-amz-content-sha256;x-amz-date, "
"Signature=cf095e36bed9cd3139c2e8b3e20c296a79d8540987711bf3a0d816b19ae00314");
ASSERT(headers["x-amz-date"] == "20130524T000000Z");
ASSERT(headers["Host"] == "s3.us-west-2.amazonaws.com");
ASSERT(headers["Content-Type"] == "Application/x-amz-json-1.0");
}
return Void();
}


@ -49,6 +49,7 @@ public:
struct Credentials {
std::string key;
std::string secret;
std::string securityToken;
};
struct BlobKnobs {
@ -100,8 +101,9 @@ public:
Optional<Credentials> const& creds,
BlobKnobs const& knobs = BlobKnobs(),
HTTP::Headers extraHeaders = HTTP::Headers())
: host(host), service(service), credentials(creds), lookupSecret(creds.present() && creds.get().secret.empty()),
knobs(knobs), extraHeaders(extraHeaders), requestRate(new SpeedLimit(knobs.requests_per_second, 1)),
: host(host), service(service), credentials(creds), lookupKey(creds.present() && creds.get().key.empty()),
lookupSecret(creds.present() && creds.get().secret.empty()), knobs(knobs), extraHeaders(extraHeaders),
requestRate(new SpeedLimit(knobs.requests_per_second, 1)),
requestRateList(new SpeedLimit(knobs.list_requests_per_second, 1)),
requestRateWrite(new SpeedLimit(knobs.write_requests_per_second, 1)),
requestRateRead(new SpeedLimit(knobs.read_requests_per_second, 1)),
@ -118,7 +120,8 @@ public:
const char* resource = "";
if (withResource)
resource = "<name>";
return format("blobstore://[<api_key>:<secret>@]<host>[:<port>]/%s[?<param>=<value>[&<param>=<value>]...]",
return format(
"blobstore://<api_key>:<secret>:<security_token>@<host>[:<port>]/%s[?<param>=<value>[&<param>=<value>]...]",
resource);
}
@ -147,6 +150,7 @@ public:
std::string host;
std::string service;
Optional<Credentials> credentials;
bool lookupKey;
bool lookupSecret;
BlobKnobs knobs;
HTTP::Headers extraHeaders;
@ -171,6 +175,13 @@ public:
// Sets headers needed for Authorization (including Date which will be overwritten if present)
void setAuthHeaders(std::string const& verb, std::string const& resource, HTTP::Headers& headers);
// Set headers in the AWS V4 authorization format. $date and $datestamp are used for unit testing
void setV4AuthHeaders(const std::string& verb,
const std::string& resource,
HTTP::Headers& headers,
std::string date = "",
std::string datestamp = "");
// Prepend the HTTP request header to the given PacketBuffer, returning the new head of the buffer chain
static PacketBuffer* writeRequestHeader(std::string const& request,
HTTP::Headers const& headers,


@ -24,6 +24,36 @@
const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
{
"cluster":{
"storage_wiggler": {
"primary": {
"last_round_start_datetime": "Wed Feb 4 09:36:37 2022 +0000",
"last_round_start_timestamp": 63811229797,
"last_round_finish_datetime": "Thu Jan 1 00:00:00 1970 +0000",
"last_round_finish_timestamp": 0,
"smoothed_round_seconds": 1,
"finished_round": 1,
"last_wiggle_start_datetime": "Wed Feb 4 09:36:37 2022 +0000",
"last_wiggle_start_timestamp": 63811229797,
"last_wiggle_finish_datetime": "Thu Jan 1 00:00:00 1970 +0000",
"last_wiggle_finish_timestamp": 0,
"smoothed_wiggle_seconds": 1,
"finished_wiggle": 1
},
"remote": {
"last_round_start_datetime": "Wed Feb 4 09:36:37 2022 +0000",
"last_round_start_timestamp": 63811229797,
"last_round_finish_datetime": "Thu Jan 1 00:00:00 1970 +0000",
"last_round_finish_timestamp": 0,
"smoothed_round_seconds": 1,
"finished_round": 1,
"last_wiggle_start_datetime": "Wed Feb 4 09:36:37 2022 +0000",
"last_wiggle_start_timestamp": 63811229797,
"last_wiggle_finish_datetime": "Thu Jan 1 00:00:00 1970 +0000",
"last_wiggle_finish_timestamp": 0,
"smoothed_wiggle_seconds": 1,
"finished_wiggle": 1
}
},
"layers":{
"_valid":true,
"_error":"some error description"
@ -97,11 +127,16 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"ratekeeper",
"blob_manager",
"blob_worker",
"encrypt_key_proxy",
"storage_cache",
"router",
"coordinator"
]
},
"storage_metadata":{
"created_time_datetime":"Thu Jan 1 00:00:00 1970 +0000",
"created_time_timestamp": 0
},
"data_version":12341234,
"durable_version":12341234,
"data_lag": {
@ -497,6 +532,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"unreachable_dataDistributor_worker",
"unreachable_ratekeeper_worker",
"unreachable_blobManager_worker",
"unreachable_encryptKeyProxy_worker",
"unreadable_configuration",
"full_replication_timeout",
"client_issues",
@ -1041,19 +1077,3 @@ const KeyRef JSONSchemas::managementApiErrorSchema = LiteralStringRef(R"""(
"message": "The reason of the error"
}
)""");
const KeyRef JSONSchemas::clientLibMetadataSchema = LiteralStringRef(R"""(
{
"platform": "x86_64-linux",
"version": "7.1.0",
"githash": "e28fef6264d05ab0c9488238022d1ee885a30bea",
"type": "debug",
"checksum": "fcef53fb4ae86d2c4fff4dc17c7e5d08",
"checksumalg": "md5",
"apiversion": 710,
"protocol": "fdb00b07001001",
"filename": "libfdb_c.7.1.0.so",
"size" : 19467552,
"chunkcount" : 2377,
"status": "available"
})""");


@ -35,7 +35,6 @@ struct JSONSchemas {
static const KeyRef storageHealthSchema;
static const KeyRef aggregateHealthSchema;
static const KeyRef managementApiErrorSchema;
static const KeyRef clientLibMetadataSchema;
};
#endif /* FDBCLIENT_SCHEMAS_H */


@ -148,7 +148,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( RETRY_RELOCATESHARD_DELAY, 0.1 );
init( DATA_DISTRIBUTION_FAILURE_REACTION_TIME, 60.0 ); if( randomize && BUGGIFY ) DATA_DISTRIBUTION_FAILURE_REACTION_TIME = 1.0;
bool buggifySmallShards = randomize && BUGGIFY;
init( MIN_SHARD_BYTES, 200000 ); if( buggifySmallShards ) MIN_SHARD_BYTES = 40000; //FIXME: data distribution tracker (specifically StorageMetrics) relies on this number being larger than the maximum size of a key value pair
bool simulationMediumShards = !buggifySmallShards && randomize && !BUGGIFY; // prefer smaller shards in simulation
init( MIN_SHARD_BYTES, 50000000 ); if( buggifySmallShards ) MIN_SHARD_BYTES = 40000; if (simulationMediumShards) MIN_SHARD_BYTES = 200000; //FIXME: data distribution tracker (specifically StorageMetrics) relies on this number being larger than the maximum size of a key value pair
init( SHARD_BYTES_RATIO, 4 );
init( SHARD_BYTES_PER_SQRT_BYTES, 45 ); if( buggifySmallShards ) SHARD_BYTES_PER_SQRT_BYTES = 0;//Approximately 10000 bytes per shard
init( MAX_SHARD_BYTES, 500000000 );
@ -355,6 +356,13 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_FETCH_QUEUE_HARD_MAX, 100 );
init( ROCKSDB_FETCH_QUEUE_SOFT_MAX, 50 );
init( ROCKSDB_HISTOGRAMS_SAMPLE_RATE, 0.001 ); if( randomize && BUGGIFY ) ROCKSDB_HISTOGRAMS_SAMPLE_RATE = 0;
init( ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME, 30.0 ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME = 0.1;
init( ROCKSDB_READ_RANGE_REUSE_ITERATORS, true ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_REUSE_ITERATORS = deterministicRandom()->coinflip();
// Set to 0 to disable rocksdb write rate limiting. Rate limiter unit: bytes per second.
init( ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC, 0 );
// If true, enables dynamic adjustment of ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC according to the recent demand of background IO.
init( ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE, true );
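// Standalone sketch of how these two knobs could map onto RocksDB's generic
// rate limiter when opening a RocksDB instance (assuming the stock
// rocksdb::NewGenericRateLimiter API; refill period and fairness are the
// library defaults):
void applyWriteRateLimiterKnobs(rocksdb::Options& options, int64_t bytesPerSec, bool autoTune) {
	if (bytesPerSec > 0) { // 0 leaves the limiter uninstalled, i.e. rate limiting disabled
		options.rate_limiter.reset(rocksdb::NewGenericRateLimiter(
		    bytesPerSec,
		    100 * 1000, // refill_period_us
		    10, // fairness
		    rocksdb::RateLimiter::Mode::kWritesOnly,
		    autoTune));
	}
}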
// Leader election
bool longLeaderElection = randomize && BUGGIFY;
@ -471,6 +479,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 1.0 );
init( WAIT_FOR_RATEKEEPER_JOIN_DELAY, 1.0 );
init( WAIT_FOR_BLOB_MANAGER_JOIN_DELAY, 1.0 );
init( WAIT_FOR_ENCRYPT_KEY_PROXY_JOIN_DELAY, 1.0 );
init( WORKER_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) WORKER_FAILURE_TIME = 10.0;
init( CHECK_OUTSTANDING_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) CHECK_OUTSTANDING_INTERVAL = 0.001;
init( VERSION_LAG_METRIC_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) VERSION_LAG_METRIC_INTERVAL = 10.0;
@ -488,6 +497,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( CC_WORKER_HEALTH_CHECKING_INTERVAL, 60.0 );
init( CC_DEGRADED_LINK_EXPIRATION_INTERVAL, 300.0 );
init( CC_MIN_DEGRADATION_INTERVAL, 120.0 );
init( ENCRYPT_KEY_PROXY_FAILURE_TIME, 0.1 );
init( CC_DEGRADED_PEER_DEGREE_TO_EXCLUDE, 3 );
init( CC_MAX_EXCLUSION_DUE_TO_HEALTH, 2 );
init( CC_HEALTH_TRIGGER_RECOVERY, false );
@ -536,6 +546,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( METRIC_UPDATE_RATE, .1 ); if( slowRatekeeper ) METRIC_UPDATE_RATE = 0.5;
init( DETAILED_METRIC_UPDATE_RATE, 5.0 );
init (RATEKEEPER_DEFAULT_LIMIT, 1e6 ); if( randomize && BUGGIFY ) RATEKEEPER_DEFAULT_LIMIT = 0;
init( RATEKEEPER_LIMIT_REASON_SAMPLE_RATE, 0.1 );
init( RATEKEEPER_PRINT_LIMIT_REASON, false ); if( randomize && BUGGIFY ) RATEKEEPER_PRINT_LIMIT_REASON = true;
bool smallStorageTarget = randomize && BUGGIFY;
init( TARGET_BYTES_PER_STORAGE_SERVER, 1000e6 ); if( smallStorageTarget ) TARGET_BYTES_PER_STORAGE_SERVER = 3000e3;
@ -686,7 +698,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( COORDINATOR_LEADER_CONNECTION_TIMEOUT, 20.0 );
// Dynamic Knobs (implementation)
init( GET_COMMITTED_VERSION_TIMEOUT, 3.0 ); // Maximum time the consumer should wait for a response from a ConfigNode when asking for the latest committed version.
init( UPDATE_NODE_TIMEOUT, 3.0 );
init( GET_COMMITTED_VERSION_TIMEOUT, 3.0 );
init( GET_SNAPSHOT_AND_CHANGES_TIMEOUT, 3.0 );
init( FETCH_CHANGES_TIMEOUT, 3.0 );
// Buggification
init( BUGGIFIED_EVENTUAL_CONSISTENCY, 1.0 );
@ -760,7 +775,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( FASTRESTORE_RATE_UPDATE_SECONDS, 1.0 ); if( randomize && BUGGIFY ) { FASTRESTORE_RATE_UPDATE_SECONDS = deterministicRandom()->random01() < 0.5 ? 0.1 : 2;}
init( FASTRESTORE_DUMP_INSERT_RANGE_VERSION, false );
init( REDWOOD_DEFAULT_PAGE_SIZE, 4096 );
init( REDWOOD_DEFAULT_PAGE_SIZE, 8192 );
init( REDWOOD_DEFAULT_EXTENT_SIZE, 32 * 1024 * 1024 );
init( REDWOOD_DEFAULT_EXTENT_READ_SIZE, 1024 * 1024 );
init( REDWOOD_EXTENT_CONCURRENT_READS, 4 );
@ -784,6 +799,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Cluster recovery
init ( CLUSTER_RECOVERY_EVENT_NAME_PREFIX, "Master");
// encrypt key proxy
init( ENABLE_ENCRYPT_KEY_PROXY, false );
// Blob granules
init( BG_URL, "" ); // TODO: store in system key space, eventually
init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( randomize && BUGGIFY ) { deterministicRandom()->random01() < 0.1 ? BG_SNAPSHOT_FILE_TARGET_BYTES /= 100 : BG_SNAPSHOT_FILE_TARGET_BYTES /= 10; }


@ -289,6 +289,10 @@ public:
// These histograms are in read and write path which can cause performance overhead.
// Set to 0 to disable histograms.
double ROCKSDB_HISTOGRAMS_SAMPLE_RATE;
double ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME;
bool ROCKSDB_READ_RANGE_REUSE_ITERATORS;
int64_t ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC;
bool ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE;
// Leader election
int MAX_NOTIFICATIONS;
@ -396,6 +400,7 @@ public:
double WAIT_FOR_DISTRIBUTOR_JOIN_DELAY;
double WAIT_FOR_RATEKEEPER_JOIN_DELAY;
double WAIT_FOR_BLOB_MANAGER_JOIN_DELAY;
double WAIT_FOR_ENCRYPT_KEY_PROXY_JOIN_DELAY;
double WORKER_FAILURE_TIME;
double CHECK_OUTSTANDING_INTERVAL;
double INCOMPATIBLE_PEERS_LOGGING_INTERVAL;
@ -418,6 +423,7 @@ public:
// degraded server is considered healthy.
double CC_MIN_DEGRADATION_INTERVAL; // The minimum interval that a server is reported as degraded to be considered
// as degraded by Cluster Controller.
double ENCRYPT_KEY_PROXY_FAILURE_TIME;
int CC_DEGRADED_PEER_DEGREE_TO_EXCLUDE; // The maximum number of degraded peers when excluding a server. When the
// number of degraded peers is more than this value, we will not exclude
// this server since it may because of server overload.
@ -476,6 +482,8 @@ public:
double DETAILED_METRIC_UPDATE_RATE;
double LAST_LIMITED_RATIO;
double RATEKEEPER_DEFAULT_LIMIT;
double RATEKEEPER_LIMIT_REASON_SAMPLE_RATE;
bool RATEKEEPER_PRINT_LIMIT_REASON;
int64_t TARGET_BYTES_PER_STORAGE_SERVER;
int64_t SPRING_BYTES_STORAGE_SERVER;
@ -631,7 +639,10 @@ public:
double COORDINATOR_LEADER_CONNECTION_TIMEOUT;
// Dynamic Knobs (implementation)
double UPDATE_NODE_TIMEOUT;
double GET_COMMITTED_VERSION_TIMEOUT;
double GET_SNAPSHOT_AND_CHANGES_TIMEOUT;
double FETCH_CHANGES_TIMEOUT;
// Buggification
double BUGGIFIED_EVENTUAL_CONSISTENCY;
@ -739,6 +750,9 @@ public:
// Cluster recovery
std::string CLUSTER_RECOVERY_EVENT_NAME_PREFIX;
// encrypt key proxy
bool ENABLE_ENCRYPT_KEY_PROXY;
// blob granule stuff
// FIXME: configure url with database configuration instead of knob eventually
std::string BG_URL;


@ -2541,7 +2541,7 @@ public:
#define SI_NoCase SI_GenericNoCase
#include <wchar.h>
#include "ConvertUTF.h"
#include "fdbclient/ConvertUTF.h"
/**
* Converts UTF-8 to a wchar_t (or equivalent) using the Unicode reference


@ -823,6 +823,7 @@ ACTOR Future<RangeResult> rwModuleWithMappingGetRangeActor(ReadYourWritesTransac
ExcludeServersRangeImpl::ExcludeServersRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> ExcludeServersRangeImpl::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const {
ryw->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
return rwModuleWithMappingGetRangeActor(ryw, this, kr);
}
@ -1001,6 +1002,7 @@ void includeServers(ReadYourWritesTransaction* ryw) {
ryw->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
ryw->setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
ryw->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
// includeServers might be used in an emergency transaction, so make sure it is retry-self-conflicting and
// CAUSAL_WRITE_RISKY
ryw->setOption(FDBTransactionOptions::CAUSAL_WRITE_RISKY);
@ -1062,6 +1064,7 @@ Future<Optional<std::string>> ExcludeServersRangeImpl::commit(ReadYourWritesTran
FailedServersRangeImpl::FailedServersRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> FailedServersRangeImpl::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const {
ryw->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
return rwModuleWithMappingGetRangeActor(ryw, this, kr);
}
@ -1146,6 +1149,7 @@ Future<RangeResult> ExclusionInProgressRangeImpl::getRange(ReadYourWritesTransac
}
ACTOR Future<RangeResult> getProcessClassActor(ReadYourWritesTransaction* ryw, KeyRef prefix, KeyRangeRef kr) {
ryw->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
std::vector<ProcessData> _workers = wait(getWorkers(&ryw->getTransaction()));
auto workers = _workers; // strip const
// Note: sorting by string is counterintuitive, e.g. 1.1.1.1:11 < 1.1.1.1:5
@ -1169,6 +1173,7 @@ ACTOR Future<Optional<std::string>> processClassCommitActor(ReadYourWritesTransa
ryw->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
ryw->setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
ryw->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
std::vector<ProcessData> workers = wait(
getWorkers(&ryw->getTransaction())); // make sure we use the Transaction object to avoid used_during_commit()
@ -1288,6 +1293,7 @@ Future<RangeResult> ProcessClassSourceRangeImpl::getRange(ReadYourWritesTransact
ACTOR Future<RangeResult> getLockedKeyActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Optional<Value> val = wait(ryw->getTransaction().get(databaseLockedKey));
RangeResult result;
if (val.present()) {
@ -1318,6 +1324,7 @@ Future<RangeResult> LockDatabaseImpl::getRange(ReadYourWritesTransaction* ryw, K
ACTOR Future<Optional<std::string>> lockDatabaseCommitActor(ReadYourWritesTransaction* ryw, UID uid) {
state Optional<std::string> msg;
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
Optional<Value> val = wait(ryw->getTransaction().get(databaseLockedKey));
if (val.present() && BinaryReader::fromStringRef<UID>(val.get().substr(10), Unversioned()) != uid) {
@ -1339,6 +1346,7 @@ ACTOR Future<Optional<std::string>> lockDatabaseCommitActor(ReadYourWritesTransa
ACTOR Future<Optional<std::string>> unlockDatabaseCommitActor(ReadYourWritesTransaction* ryw) {
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
Optional<Value> val = wait(ryw->getTransaction().get(databaseLockedKey));
if (val.present()) {
ryw->getTransaction().clear(singleKeyRange(databaseLockedKey));
@ -1365,6 +1373,7 @@ Future<Optional<std::string>> LockDatabaseImpl::commit(ReadYourWritesTransaction
ACTOR Future<RangeResult> getConsistencyCheckKeyActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
ryw->getTransaction().setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> val = wait(ryw->getTransaction().get(fdbShouldConsistencyCheckBeSuspended));
bool ccSuspendSetting = val.present() ? BinaryReader::fromStringRef<bool>(val.get(), Unversioned()) : false;
@ -1398,6 +1407,7 @@ Future<Optional<std::string>> ConsistencyCheckImpl::commit(ReadYourWritesTransac
ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandPrefix("consistencycheck")].second;
ryw->getTransaction().setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
ryw->getTransaction().set(fdbShouldConsistencyCheckBeSuspended,
BinaryWriter::toValue(entry.present(), Unversioned()));
return Optional<std::string>();
@ -1454,6 +1464,7 @@ void GlobalConfigImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, co
ACTOR Future<Optional<std::string>> globalConfigCommitActor(GlobalConfigImpl* globalConfig,
ReadYourWritesTransaction* ryw) {
state Transaction& tr = ryw->getTransaction();
ryw->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
// History should only contain three most recent updates. If it currently
// has three items, remove the oldest to make room for a new item.
@ -1724,6 +1735,7 @@ ACTOR static Future<RangeResult> CoordinatorsAutoImplActor(ReadYourWritesTransac
state Transaction& tr = ryw->getTransaction();
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> currentKey = wait(tr.get(coordinatorsKey));
@ -1768,6 +1780,7 @@ Future<RangeResult> CoordinatorsAutoImpl::getRange(ReadYourWritesTransaction* ry
ACTOR static Future<RangeResult> getMinCommitVersionActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Optional<Value> val = wait(ryw->getTransaction().get(minRequiredCommitVersionKey));
RangeResult result;
if (val.present()) {
@ -1798,6 +1811,7 @@ Future<RangeResult> AdvanceVersionImpl::getRange(ReadYourWritesTransaction* ryw,
ACTOR static Future<Optional<std::string>> advanceVersionCommitActor(ReadYourWritesTransaction* ryw, Version v) {
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
TraceEvent(SevDebug, "AdvanceVersion").detail("MaxAllowedVersion", maxAllowedVerion);
if (v > maxAllowedVerion) {
return ManagementAPIError::toJsonString(
@ -1816,6 +1830,7 @@ ACTOR static Future<Optional<std::string>> advanceVersionCommitActor(ReadYourWri
}
Future<Optional<std::string>> AdvanceVersionImpl::commit(ReadYourWritesTransaction* ryw) {
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
auto minCommitVersion =
ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandPrefix("advanceversion")].second;
if (minCommitVersion.present()) {
@ -1841,6 +1856,9 @@ ACTOR static Future<RangeResult> ClientProfilingGetRangeActor(ReadYourWritesTran
state RangeResult result;
// client_txn_sample_rate
state Key sampleRateKey = LiteralStringRef("client_txn_sample_rate").withPrefix(prefix);
ryw->getTransaction().setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
if (kr.contains(sampleRateKey)) {
auto entry = ryw->getSpecialKeySpaceWriteMap()[sampleRateKey];
if (!ryw->readYourWritesDisabled() && entry.first) {
@ -1886,6 +1904,8 @@ Future<RangeResult> ClientProfilingImpl::getRange(ReadYourWritesTransaction* ryw
}
Future<Optional<std::string>> ClientProfilingImpl::commit(ReadYourWritesTransaction* ryw) {
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
// client_txn_sample_rate
Key sampleRateKey = LiteralStringRef("client_txn_sample_rate").withPrefix(getKeyRange().begin);
auto rateEntry = ryw->getSpecialKeySpaceWriteMap()[sampleRateKey];
@ -2248,6 +2268,7 @@ ACTOR static Future<RangeResult> MaintenanceGetRangeActor(ReadYourWritesTransact
state RangeResult result;
// zoneId
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Optional<Value> val = wait(ryw->getTransaction().get(healthyZoneKey));
if (val.present()) {
auto healthyZone = decodeHealthyZoneValue(val.get());
@ -2279,6 +2300,7 @@ Future<RangeResult> MaintenanceImpl::getRange(ReadYourWritesTransaction* ryw, Ke
ACTOR static Future<Optional<std::string>> maintenanceCommitActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
// read
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
ryw->getTransaction().setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> val = wait(ryw->getTransaction().get(healthyZoneKey));
Optional<std::pair<Key, Version>> healthyZone =
@ -2342,6 +2364,9 @@ ACTOR static Future<RangeResult> DataDistributionGetRangeActor(ReadYourWritesTra
state RangeResult result;
// dataDistributionModeKey
state Key modeKey = LiteralStringRef("mode").withPrefix(prefix);
ryw->getTransaction().setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
if (kr.contains(modeKey)) {
auto entry = ryw->getSpecialKeySpaceWriteMap()[modeKey];
if (ryw->readYourWritesDisabled() || !entry.first) {
@ -2375,6 +2400,8 @@ Future<Optional<std::string>> DataDistributionImpl::commit(ReadYourWritesTransac
// there are two valid keys in the range
// <prefix>/mode -> dataDistributionModeKey, the value is only allowed to be set as "0"(disable) or "1"(enable)
// <prefix>/rebalance_ignored -> rebalanceDDIgnoreKey, value is unused thus empty
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
Optional<std::string> msg;
KeyRangeRef kr = getKeyRange();
Key modeKey = LiteralStringRef("mode").withPrefix(kr.begin);
@ -2442,6 +2469,7 @@ Future<Optional<std::string>> DataDistributionImpl::commit(ReadYourWritesTransac
void includeLocalities(ReadYourWritesTransaction* ryw) {
ryw->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
ryw->setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
ryw->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
// includeLocalities might be used in an emergency transaction, so make sure it is retry-self-conflicting and
// CAUSAL_WRITE_RISKY
@ -2522,6 +2550,9 @@ ACTOR Future<Optional<std::string>> excludeLocalityCommitActor(ReadYourWritesTra
state std::unordered_set<std::string> localities;
state std::vector<AddressExclusion> addresses;
state std::set<AddressExclusion> exclusions;
ryw->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state std::vector<ProcessData> workers = wait(getWorkers(&ryw->getTransaction()));
if (!parseLocalitiesFromKeys(ryw, failed, localities, addresses, exclusions, workers, result))
return result;
@ -2544,6 +2575,7 @@ ACTOR Future<Optional<std::string>> excludeLocalityCommitActor(ReadYourWritesTra
ExcludedLocalitiesRangeImpl::ExcludedLocalitiesRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> ExcludedLocalitiesRangeImpl::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const {
ryw->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
return rwModuleWithMappingGetRangeActor(ryw, this, kr);
}
@ -2570,6 +2602,7 @@ Future<Optional<std::string>> ExcludedLocalitiesRangeImpl::commit(ReadYourWrites
FailedLocalitiesRangeImpl::FailedLocalitiesRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> FailedLocalitiesRangeImpl::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const {
ryw->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
return rwModuleWithMappingGetRangeActor(ryw, this, kr);
}

View File

@ -228,9 +228,9 @@ const Key storageCacheServerKey(UID id) {
}
const Value storageCacheServerValue(const StorageServerInterface& ssi) {
BinaryWriter wr(IncludeVersion());
wr << ssi;
return wr.toValue();
auto protocolVersion = currentProtocolVersion;
protocolVersion.addObjectSerializerFlag();
return ObjectWriter::toValue(ssi, IncludeVersion(protocolVersion));
}
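// Hedged sketch of the symmetric read path: decoding a value produced by
// ObjectWriter::toValue above. The ObjectReader usage pattern is assumed from
// similar decode helpers; IncludeVersion() reads the protocol version the
// writer prepended.
StorageServerInterface decodeStorageCacheServerValue(ValueRef const& value) {
	StorageServerInterface ssi;
	ObjectReader reader(value.begin(), IncludeVersion());
	reader.deserialize(ssi);
	return ssi;
}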
const KeyRangeRef ddStatsRange = KeyRangeRef(LiteralStringRef("\xff\xff/metrics/data_distribution_stats/"),
@ -369,6 +369,9 @@ UID decodeTssQuarantineKey(KeyRef const& key) {
const KeyRangeRef tssMismatchKeys(LiteralStringRef("\xff/tssMismatch/"), LiteralStringRef("\xff/tssMismatch0"));
const KeyRangeRef serverMetadataKeys(LiteralStringRef("\xff/serverMetadata/"),
LiteralStringRef("\xff/serverMetadata0"));
const KeyRangeRef serverTagKeys(LiteralStringRef("\xff/serverTag/"), LiteralStringRef("\xff/serverTag0"));
const KeyRef serverTagPrefix = serverTagKeys.begin;
@ -633,7 +636,10 @@ const KeyRef configKeysPrefix = configKeys.begin;
const KeyRef perpetualStorageWiggleKey(LiteralStringRef("\xff/conf/perpetual_storage_wiggle"));
const KeyRef perpetualStorageWiggleLocalityKey(LiteralStringRef("\xff/conf/perpetual_storage_wiggle_locality"));
const KeyRef wigglingStorageServerKey(LiteralStringRef("\xff/storageWigglePID"));
const KeyRef perpetualStorageWiggleIDPrefix(
LiteralStringRef("\xff/storageWiggleID/")); // withSuffix /primary or /remote
const KeyRef perpetualStorageWiggleStatsPrefix(
LiteralStringRef("\xff/storageWiggleStats/")); // withSuffix /primary or /remote
const KeyRef triggerDDTeamInfoPrintKey(LiteralStringRef("\xff/triggerDDTeamInfoPrint"));
@ -1027,16 +1033,6 @@ std::pair<Key, Version> decodeHealthyZoneValue(ValueRef const& value) {
return std::make_pair(zoneId, version);
}
const KeyRangeRef clientLibMetadataKeys(LiteralStringRef("\xff\x02/clientlib/meta/"),
LiteralStringRef("\xff\x02/clientlib/meta0"));
const KeyRef clientLibMetadataPrefix = clientLibMetadataKeys.begin;
const KeyRangeRef clientLibBinaryKeys(LiteralStringRef("\xff\x02/clientlib/bin/"),
LiteralStringRef("\xff\x02/clientlib/bin0"));
const KeyRef clientLibBinaryPrefix = clientLibBinaryKeys.begin;
const KeyRef clientLibChangeCounterKey = "\xff\x02/clientlib/changeCounter"_sr;
const KeyRangeRef testOnlyTxnStateStorePrefixRange(LiteralStringRef("\xff/TESTONLYtxnStateStore/"),
LiteralStringRef("\xff/TESTONLYtxnStateStore0"));


@ -131,6 +131,10 @@ UID decodeTssQuarantineKey(KeyRef const&);
// For recording tss mismatch details in the system keyspace
extern const KeyRangeRef tssMismatchKeys;
// \xff/serverMetadata/[[storageInterfaceUID]] = [[StorageMetadataType]]
// Note: storageInterfaceUID is the one stated in the file name
extern const KeyRangeRef serverMetadataKeys;
// "\xff/serverTag/[[serverID]]" = "[[Tag]]"
// Provides the Tag for the given serverID. Used to access a
// storage server's corresponding TLog in order to apply mutations.
@ -214,7 +218,9 @@ extern const KeyRef configKeysPrefix;
extern const KeyRef perpetualStorageWiggleKey;
extern const KeyRef perpetualStorageWiggleLocalityKey;
extern const KeyRef wigglingStorageServerKey;
extern const KeyRef perpetualStorageWiggleIDPrefix;
extern const KeyRef perpetualStorageWiggleStatsPrefix;
// Change the value of this key to anything and that will trigger detailed data distribution team info log.
extern const KeyRef triggerDDTeamInfoPrintKey;
@ -326,7 +332,7 @@ extern const KeyRef backupPausedKey;
extern const KeyRef coordinatorsKey;
// "\xff/logs" = "[[LogsValue]]"
// Used during master recovery in order to communicate
// Used during cluster recovery in order to communicate
// and store info about the logs system.
extern const KeyRef logsKey;
@ -482,16 +488,6 @@ extern const KeyRef rebalanceDDIgnoreKey;
const Value healthyZoneValue(StringRef const& zoneId, Version version);
std::pair<Key, Version> decodeHealthyZoneValue(ValueRef const&);
// Key ranges reserved for storing client library binaries and respective
// json documents with the metadata describing the libraries
extern const KeyRangeRef clientLibMetadataKeys;
extern const KeyRef clientLibMetadataPrefix;
extern const KeyRangeRef clientLibBinaryKeys;
extern const KeyRef clientLibBinaryPrefix;
extern const KeyRef clientLibChangeCounterKey;
// All mutations done to this range are blindly copied into txnStateStore.
// Used to create artificially large txnStateStore instances in testing.
extern const KeyRangeRef testOnlyTxnStateStorePrefixRange;


@ -189,7 +189,7 @@ description is not currently required but encouraged.
description="The read version will be committed, and usually will be the latest committed, but might not be the latest committed in the event of a simultaneous fault and misbehaving clock."
defaultFor="20"/>
<Option name="transaction_include_port_in_address" code="505"
description="Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect."
description="Deprecated. Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect."
defaultFor="23"/>
<Option name="transaction_bypass_unreadable" code="700"
description="Allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. This sets the ``bypass_unreadable`` option of each transaction created by this database. See the transaction option description for more information."


@ -1,4 +1,4 @@
set(FDBMONITOR_SRCS ConvertUTF.h SimpleIni.h fdbmonitor.cpp)
set(FDBMONITOR_SRCS fdbmonitor.cpp)
add_executable(fdbmonitor ${FDBMONITOR_SRCS})
strip_debug_symbols(fdbmonitor)


@ -76,8 +76,8 @@
#include <grp.h>
#include "flow/SimpleOpt.h"
#include "SimpleIni.h"
#include "fdbclient/SimpleIni.h"
#include "fdbclient/versions.h"
#ifdef __linux__


@ -50,7 +50,8 @@ public:
state unsigned char* encrypted = new (arena) unsigned char[FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE];
int bytes = wait(
self->file->read(encrypted, FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE, FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE * block));
DecryptionStreamCipher decryptor(StreamCipher::Key::getKey(), self->getIV(block));
StreamCipherKey const* cipherKey = StreamCipherKey::getGlobalCipherKey();
DecryptionStreamCipher decryptor(cipherKey, self->getIV(block));
auto decrypted = decryptor.decrypt(encrypted, bytes, arena);
return Standalone<StringRef>(decrypted, arena);
}
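// The write path mirrors the read path above; a minimal sketch assuming a
// symmetric StreamCipher API (final-block handling omitted, names illustrative):
Standalone<StringRef> encryptBlock(const unsigned char* plaintext, int bytes, const StreamCipher::IV& iv, Arena& arena) {
	EncryptionStreamCipher encryptor(StreamCipherKey::getGlobalCipherKey(), iv);
	auto ciphertext = encryptor.encrypt(plaintext, bytes, arena);
	return Standalone<StringRef>(ciphertext, arena);
}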
@ -114,7 +115,7 @@ public:
self->offsetInBlock = 0;
ASSERT_LT(self->currentBlock, std::numeric_limits<uint32_t>::max());
++self->currentBlock;
self->encryptor = std::make_unique<EncryptionStreamCipher>(StreamCipher::Key::getKey(),
self->encryptor = std::make_unique<EncryptionStreamCipher>(StreamCipherKey::getGlobalCipherKey(),
self->getIV(self->currentBlock));
}
}
@ -143,7 +144,8 @@ AsyncFileEncrypted::AsyncFileEncrypted(Reference<IAsyncFile> file, Mode mode)
: file(file), mode(mode), readBuffers(FLOW_KNOBS->MAX_DECRYPTED_BLOCKS), currentBlock(0) {
firstBlockIV = AsyncFileEncryptedImpl::getFirstBlockIV(file->getFilename());
if (mode == Mode::APPEND_ONLY) {
encryptor = std::make_unique<EncryptionStreamCipher>(StreamCipher::Key::getKey(), getIV(currentBlock));
encryptor =
std::make_unique<EncryptionStreamCipher>(StreamCipherKey::getGlobalCipherKey(), getIV(currentBlock));
writeBuffer = std::vector<unsigned char>(FLOW_KNOBS->ENCRYPTION_BLOCK_SIZE, 0);
}
}
@ -259,7 +261,7 @@ TEST_CASE("fdbrpc/AsyncFileEncrypted") {
generateRandomData(&writeBuffer.front(), bytes);
state std::vector<unsigned char> readBuffer(bytes, 0);
ASSERT(g_network->isSimulated());
StreamCipher::Key::initializeRandomTestKey();
StreamCipherKey::initializeGlobalRandomTestKey();
int flags = IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_CREATE | IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE |
IAsyncFile::OPEN_UNBUFFERED | IAsyncFile::OPEN_ENCRYPTED | IAsyncFile::OPEN_UNCACHED |
IAsyncFile::OPEN_NO_AIO;


@ -117,7 +117,7 @@ const Endpoint& EndpointMap::insert(NetworkAddressList localAddresses,
int adjacentFree = 0;
int adjacentStart = -1;
firstFree = -1;
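// Slots below wellKnownEndpointCount are permanently assigned to well-known
// endpoints, so the scan for reusable free slots starts past them.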
for (int i = 0; i < data.size(); i++) {
for (int i = wellKnownEndpointCount; i < data.size(); i++) {
if (data[i].receiver) {
adjacentFree = 0;
} else {
@ -1205,8 +1205,8 @@ ACTOR static Future<Void> connectionReader(TransportData* transport,
FLOW_KNOBS->CONNECTION_REJECTED_MESSAGE_DELAY) {
TraceEvent(SevWarn, "ConnectionRejected", conn->getDebugID())
.detail("Reason", "IncompatibleProtocolVersion")
.detail("LocalVersion", g_network->protocolVersion().version())
.detail("RejectedVersion", pkt.protocolVersion.version())
.detail("LocalVersion", g_network->protocolVersion())
.detail("RejectedVersion", pkt.protocolVersion)
.detail("Peer",
pkt.canonicalRemotePort
? NetworkAddress(pkt.canonicalRemoteIp(), pkt.canonicalRemotePort)


@ -272,6 +272,24 @@ ProcessClass::Fitness ProcessClass::machineClassFitness(ClusterRole role) const
default:
return ProcessClass::NeverAssign;
}
case ProcessClass::EncryptKeyProxy:
switch (_class) {
case ProcessClass::EncryptKeyProxyClass:
return ProcessClass::BestFit;
case ProcessClass::StatelessClass:
return ProcessClass::GoodFit;
case ProcessClass::UnsetClass:
return ProcessClass::UnsetFit;
case ProcessClass::MasterClass:
return ProcessClass::OkayFit;
case ProcessClass::CoordinatorClass:
case ProcessClass::TesterClass:
case ProcessClass::StorageCacheClass:
case ProcessClass::BlobWorkerClass:
return ProcessClass::NeverAssign;
default:
return ProcessClass::WorstFit;
}
default:
return ProcessClass::NeverAssign;
}


@ -48,6 +48,7 @@ struct ProcessClass {
GrvProxyClass,
BlobManagerClass,
BlobWorkerClass,
EncryptKeyProxyClass,
InvalidClass = -1
};
@ -75,6 +76,7 @@ struct ProcessClass {
BlobWorker,
StorageCache,
Backup,
EncryptKeyProxy,
Worker, // used for actor lineage tracking
NoRole
};
@ -112,6 +114,7 @@ public:
else if (s=="blob_worker") _class = BlobWorkerClass;
else if (s=="storage_cache") _class = StorageCacheClass;
else if (s=="backup") _class = BackupClass;
else if (s=="encrypt_key_proxy") _class = EncryptKeyProxyClass;
else _class = InvalidClass;
}
@ -141,6 +144,7 @@ public:
else if (classStr=="blob_worker") _class = BlobWorkerClass;
else if (classStr=="storage_cache") _class = StorageCacheClass;
else if (classStr=="backup") _class = BackupClass;
else if (classStr=="encrypt_key_proxy") _class = EncryptKeyProxyClass;
else _class = InvalidClass;
if (sourceStr=="command_line") _source = CommandLineSource;
@ -180,6 +184,7 @@ public:
case BlobWorkerClass: return "blob_worker";
case StorageCacheClass: return "storage_cache";
case BackupClass: return "backup";
case EncryptKeyProxyClass: return "encrypt_key_proxy";
default: return "invalid";
}
}


@ -108,6 +108,56 @@ void MockDNS::clearMockTCPEndpoints() {
hostnameToAddresses.clear();
}
std::string MockDNS::toString() {
std::string ret;
for (auto it = hostnameToAddresses.begin(); it != hostnameToAddresses.end(); ++it) {
if (it != hostnameToAddresses.begin()) {
ret += ';';
}
ret += it->first + ',';
const std::vector<NetworkAddress>& addresses = it->second;
for (int i = 0; i < addresses.size(); ++i) {
ret += addresses[i].toString();
if (i != addresses.size() - 1) {
ret += ',';
}
}
}
return ret;
}
MockDNS MockDNS::parseFromString(const std::string& s) {
std::map<std::string, std::vector<NetworkAddress>> mockDNS;
for (int p = 0; p < s.length();) {
int pSemiColumn = s.find_first_of(';', p);
if (pSemiColumn == s.npos) {
pSemiColumn = s.length();
}
std::string oneMapping = s.substr(p, pSemiColumn - p);
std::string hostname;
std::vector<NetworkAddress> addresses;
for (int i = 0; i < oneMapping.length();) {
int pComma = oneMapping.find_first_of(',', i);
if (pComma == oneMapping.npos) {
pComma = oneMapping.length();
}
if (!i) {
// The first part is hostname
hostname = oneMapping.substr(i, pComma - i);
} else {
addresses.push_back(NetworkAddress::parse(oneMapping.substr(i, pComma - i)));
}
i = pComma + 1;
}
mockDNS[hostname] = addresses;
p = pSemiColumn + 1;
}
return MockDNS(mockDNS);
}
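// Usage sketch: round-tripping a mock table through the string form
// (hostnames, services, and addresses illustrative).
void mockDNSRoundTripExample() {
	MockDNS dns = MockDNS::parseFromString("host1:port1,1.2.3.4:4500;host2:port2,[::1]:4800");
	std::string s = dns.toString(); // yields the same canonical string back
}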
void SimExternalConnection::close() {
socket.close();
}
@ -171,8 +221,8 @@ UID SimExternalConnection::getDebugID() const {
return dbgid;
}
ACTOR static Future<std::vector<NetworkAddress>> resolveTCPEndpointImpl(std::string host, std::string service) {
wait(delayJittered(0.1));
std::vector<NetworkAddress> SimExternalConnection::resolveTCPEndpointBlocking(const std::string& host,
const std::string& service) {
ip::tcp::resolver resolver(ios);
ip::tcp::resolver::query query(host, service);
auto iter = resolver.resolve(query);
@ -191,6 +241,11 @@ ACTOR static Future<std::vector<NetworkAddress>> resolveTCPEndpointImpl(std::str
return addrs;
}
ACTOR static Future<std::vector<NetworkAddress>> resolveTCPEndpointImpl(std::string host, std::string service) {
wait(delayJittered(0.1));
return SimExternalConnection::resolveTCPEndpointBlocking(host, service);
}
Future<std::vector<NetworkAddress>> SimExternalConnection::resolveTCPEndpoint(const std::string& host,
const std::string& service) {
return resolveTCPEndpointImpl(host, service);
@ -253,17 +308,17 @@ TEST_CASE("fdbrpc/SimExternalClient") {
return Void();
}
TEST_CASE("fdbrpc/MockTCPEndpoints") {
TEST_CASE("fdbrpc/MockDNS") {
state MockDNS mockDNS;
state std::vector<NetworkAddress> networkAddresses;
state NetworkAddress address1(IPAddress(0x13131313), 1);
state NetworkAddress address2(IPAddress(0x14141414), 2);
networkAddresses.push_back(address1);
networkAddresses.push_back(address2);
mockDNS.addMockTCPEndpoint("testhost1", "testport1", networkAddresses);
ASSERT(mockDNS.findMockTCPEndpoint("testhost1", "testport1"));
ASSERT(mockDNS.findMockTCPEndpoint("testhost1", "testport2") == false);
std::vector<NetworkAddress> resolvedNetworkAddresses = mockDNS.getTCPEndpoint("testhost1", "testport1");
mockDNS.addMockTCPEndpoint("testhost1", "port1", networkAddresses);
ASSERT(mockDNS.findMockTCPEndpoint("testhost1", "port1"));
ASSERT(!mockDNS.findMockTCPEndpoint("testhost1", "port2"));
std::vector<NetworkAddress> resolvedNetworkAddresses = mockDNS.getTCPEndpoint("testhost1", "port1");
ASSERT(resolvedNetworkAddresses.size() == 2);
ASSERT(std::find(resolvedNetworkAddresses.begin(), resolvedNetworkAddresses.end(), address1) !=
resolvedNetworkAddresses.end());
@ -271,26 +326,26 @@ TEST_CASE("fdbrpc/MockTCPEndpoints") {
resolvedNetworkAddresses.end());
// Adding a hostname twice should fail.
try {
mockDNS.addMockTCPEndpoint("testhost1", "testport1", networkAddresses);
mockDNS.addMockTCPEndpoint("testhost1", "port1", networkAddresses);
} catch (Error& e) {
ASSERT(e.code() == error_code_operation_failed);
}
// Updating a nonexistent hostname should fail.
try {
mockDNS.updateMockTCPEndpoint("testhost2", "testport2", networkAddresses);
mockDNS.updateMockTCPEndpoint("testhost2", "port2", networkAddresses);
} catch (Error& e) {
ASSERT(e.code() == error_code_operation_failed);
}
// Removing a nonexistent hostname should fail.
try {
mockDNS.removeMockTCPEndpoint("testhost2", "testport2");
mockDNS.removeMockTCPEndpoint("testhost2", "port2");
} catch (Error& e) {
ASSERT(e.code() == error_code_operation_failed);
}
mockDNS.clearMockTCPEndpoints();
// Updating any hostname right after clearing endpoints should fail.
try {
mockDNS.updateMockTCPEndpoint("testhost1", "testport1", networkAddresses);
mockDNS.updateMockTCPEndpoint("testhost1", "port1", networkAddresses);
} catch (Error& e) {
ASSERT(e.code() == error_code_operation_failed);
}
@ -298,4 +353,55 @@ TEST_CASE("fdbrpc/MockTCPEndpoints") {
return Void();
}
TEST_CASE("fdbrpc/MockTCPEndpoints") {
state std::vector<NetworkAddress> networkAddresses;
state NetworkAddress address1(IPAddress(0x13131313), 1);
networkAddresses.push_back(address1);
INetworkConnections::net()->addMockTCPEndpoint("testhost1", "port1", networkAddresses);
state std::vector<NetworkAddress> resolvedNetworkAddresses =
wait(INetworkConnections::net()->resolveTCPEndpoint("testhost1", "port1"));
ASSERT(resolvedNetworkAddresses.size() == 1);
ASSERT(std::find(resolvedNetworkAddresses.begin(), resolvedNetworkAddresses.end(), address1) !=
resolvedNetworkAddresses.end());
// Adding a hostname twice should fail.
try {
INetworkConnections::net()->addMockTCPEndpoint("testhost1", "port1", networkAddresses);
} catch (Error& e) {
ASSERT(e.code() == error_code_operation_failed);
}
// Removing a nonexistent hostname should fail.
try {
INetworkConnections::net()->removeMockTCPEndpoint("testhost2", "port2");
} catch (Error& e) {
ASSERT(e.code() == error_code_operation_failed);
}
INetworkConnections::net()->removeMockTCPEndpoint("testhost1", "port1");
state NetworkAddress address2(IPAddress(0x14141414), 2);
networkAddresses.push_back(address2);
INetworkConnections::net()->addMockTCPEndpoint("testhost1", "port1", networkAddresses);
wait(store(resolvedNetworkAddresses, INetworkConnections::net()->resolveTCPEndpoint("testhost1", "port1")));
ASSERT(resolvedNetworkAddresses.size() == 2);
ASSERT(std::find(resolvedNetworkAddresses.begin(), resolvedNetworkAddresses.end(), address2) !=
resolvedNetworkAddresses.end());
return Void();
}
TEST_CASE("fdbrpc/MockDNSParsing") {
std::string mockDNSString;
INetworkConnections::net()->parseMockDNSFromString(mockDNSString);
ASSERT(INetworkConnections::net()->convertMockDNSToString() == mockDNSString);
mockDNSString = "testhost1:port1,[::1]:4800:tls(fromHostname)";
INetworkConnections::net()->parseMockDNSFromString(mockDNSString);
ASSERT(INetworkConnections::net()->convertMockDNSToString() == mockDNSString);
mockDNSString = "testhost1:port1,[::1]:4800,[2001:db8:85a3::8a2e:370:7334]:4800;testhost2:port2,[2001:"
"db8:85a3::8a2e:370:7334]:4800:tls(fromHostname),8.8.8.8:12";
INetworkConnections::net()->parseMockDNSFromString(mockDNSString);
ASSERT(INetworkConnections::net()->convertMockDNSToString() == mockDNSString);
return Void();
}
void forceLinkSimExternalConnectionTests() {}


@ -31,6 +31,10 @@
// MockDNS is a class maintaining a <hostname, vector<NetworkAddress>> mapping, mocking a DNS in simulation.
class MockDNS {
public:
MockDNS() {}
explicit MockDNS(const std::map<std::string, std::vector<NetworkAddress>>& mockDNS)
: hostnameToAddresses(mockDNS) {}
bool findMockTCPEndpoint(const std::string& host, const std::string& service);
void addMockTCPEndpoint(const std::string& host,
const std::string& service,
@ -42,6 +46,12 @@ public:
void clearMockTCPEndpoints();
std::vector<NetworkAddress> getTCPEndpoint(const std::string& host, const std::string& service);
void operator=(MockDNS const& rhs) { hostnameToAddresses = rhs.hostnameToAddresses; }
// Convert hostnameToAddresses to string. The format is:
// hostname1,host1Address1,host1Address2;hostname2,host2Address1,host2Address2...
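// e.g. "host1:port1,1.2.3.4:4500,1.2.3.5:4500;host2:port2,[::1]:4800" (values illustrative)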
std::string toString();
static MockDNS parseFromString(const std::string& s);
private:
std::map<std::string, std::vector<NetworkAddress>> hostnameToAddresses;
};
@ -67,6 +77,7 @@ public:
NetworkAddress getPeerAddress() const override;
UID getDebugID() const override;
static Future<std::vector<NetworkAddress>> resolveTCPEndpoint(const std::string& host, const std::string& service);
static std::vector<NetworkAddress> resolveTCPEndpointBlocking(const std::string& host, const std::string& service);
static Future<Reference<IConnection>> connect(NetworkAddress toAddr);
};


@ -945,6 +945,13 @@ public:
const std::vector<NetworkAddress>& addresses) override {
mockDNS.addMockTCPEndpoint(host, service, addresses);
}
void removeMockTCPEndpoint(const std::string& host, const std::string& service) override {
mockDNS.removeMockTCPEndpoint(host, service);
}
// Convert hostnameToAddresses from/to string. The format is:
// hostname1,host1Address1,host1Address2;hostname2,host2Address1,host2Address2...
void parseMockDNSFromString(const std::string& s) override { mockDNS = MockDNS::parseFromString(s); }
std::string convertMockDNSToString() override { return mockDNS.toString(); }
Future<std::vector<NetworkAddress>> resolveTCPEndpoint(const std::string& host,
const std::string& service) override {
// If a <hostname, vector<NetworkAddress>> pair was injected to mock DNS, use it.
@ -953,6 +960,14 @@ public:
}
return SimExternalConnection::resolveTCPEndpoint(host, service);
}
std::vector<NetworkAddress> resolveTCPEndpointBlocking(const std::string& host,
const std::string& service) override {
// If a <hostname, vector<NetworkAddress>> pair was injected to mock DNS, use it.
if (mockDNS.findMockTCPEndpoint(host, service)) {
return mockDNS.getTCPEndpoint(host, service);
}
return SimExternalConnection::resolveTCPEndpointBlocking(host, service);
}
ACTOR static Future<Reference<IConnection>> onConnect(Future<Void> ready, Reference<Sim2Conn> conn) {
wait(ready);
if (conn->isPeerGone()) {


@ -161,6 +161,8 @@ public:
return false;
case ProcessClass::BackupClass:
return false;
case ProcessClass::EncryptKeyProxyClass:
return false;
default:
return false;
}


@ -31,7 +31,11 @@ set(FDBSERVER_SRCS
DataDistributionTracker.actor.cpp
DataDistributorInterface.h
DBCoreState.h
DDTeamCollection.actor.cpp
DDTeamCollection.h
DiskQueue.actor.cpp
EncryptKeyProxyInterface.h
EncryptKeyProxy.actor.cpp
fdbserver.actor.cpp
FDBExecHelper.actor.cpp
FDBExecHelper.actor.h
@ -127,6 +131,8 @@ set(FDBSERVER_SRCS
TagPartitionedLogSystem.actor.cpp
TagPartitionedLogSystem.actor.h
template_fdb.h
TCInfo.actor.cpp
TCInfo.h
tester.actor.cpp
TesterInterface.actor.h
TLogInterface.h
@ -168,7 +174,6 @@ set(FDBSERVER_SRCS
workloads/Cache.actor.cpp
workloads/ChangeConfig.actor.cpp
workloads/ClearSingleRange.actor.cpp
workloads/ClientLibManagementWorkload.actor.cpp
workloads/ClientTransactionProfileCorrectness.actor.cpp
workloads/TriggerRecovery.actor.cpp
workloads/SuspendProcesses.actor.cpp
@ -188,6 +193,7 @@ set(FDBSERVER_SRCS
workloads/DiskDurabilityTest.actor.cpp
workloads/DiskFailureInjection.actor.cpp
workloads/DummyWorkload.actor.cpp
workloads/EncryptionOps.actor.cpp
workloads/ExternalWorkload.actor.cpp
workloads/FastTriggeredWatches.actor.cpp
workloads/FileSystem.actor.cpp
@ -195,6 +201,7 @@ set(FDBSERVER_SRCS
workloads/FuzzApiCorrectness.actor.cpp
workloads/GetRangeStream.actor.cpp
workloads/HealthMetricsApi.actor.cpp
workloads/HighContentionPrefixAllocatorWorkload.actor.cpp
workloads/IncrementalBackup.actor.cpp
workloads/Increment.actor.cpp
workloads/IndexScan.actor.cpp
@ -291,14 +298,15 @@ add_library(fdb_sqlite STATIC
if (WITH_ROCKSDB_EXPERIMENTAL)
add_definitions(-DSSD_ROCKSDB_EXPERIMENTAL)
# Set this to 0 if you want to compile RocksDB with `-march=native`.
set(PORTABLE_ROCKSDB 1)
include(CompileRocksDB)
# CompileRocksDB sets `lz4_LIBRARIES` to be the shared lib, we want to link
# statically, so find the static library here.
find_library(lz4_STATIC_LIBRARIES
NAMES liblz4.a REQUIRED)
if (WITH_LIBURING)
find_package(uring)
endif()
endif()
# Suppress warnings in sqlite since it's third party
@ -318,8 +326,15 @@ target_include_directories(fdbserver PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/workloads)
if (WITH_ROCKSDB_EXPERIMENTAL)
add_dependencies(fdbserver rocksdb)
if(WITH_LIBURING)
target_include_directories(fdbserver PRIVATE ${ROCKSDB_INCLUDE_DIR} ${uring_INCLUDE_DIR})
target_link_libraries(fdbserver PRIVATE fdbclient fdb_sqlite ${ROCKSDB_LIBRARIES} ${uring_LIBRARIES} ${lz4_STATIC_LIBRARIES})
target_compile_definitions(fdbserver PRIVATE BOOST_ASIO_HAS_IO_URING=1 BOOST_ASIO_DISABLE_EPOLL=1)
else()
target_include_directories(fdbserver PRIVATE ${ROCKSDB_INCLUDE_DIR})
target_link_libraries(fdbserver PRIVATE fdbclient fdb_sqlite ${ROCKSDB_LIBRARIES} ${lz4_STATIC_LIBRARIES})
target_compile_definitions(fdbserver PRIVATE)
endif()
else()
target_link_libraries(fdbserver PRIVATE fdbclient fdb_sqlite)
endif()

Some files were not shown because too many files have changed in this diff.