Merge remote-tracking branch 'origin/main' into expose-txn-cost

This commit is contained in:
sfc-gh-tclinkenbeard 2022-10-30 09:36:37 -07:00
commit 0eb1598afa
143 changed files with 4499 additions and 2178 deletions

View File

@ -442,7 +442,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-sanitizer
DEPENDS ${IMPLIBSO_SRC} fdb_c
COMMENT "Generating source code for C shim library")
add_library(fdb_c_shim STATIC ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp)
add_library(fdb_c_shim SHARED ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp)
target_link_options(fdb_c_shim PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.map,-z,nodelete,-z,noexecstack")
target_link_libraries(fdb_c_shim PUBLIC dl)
target_include_directories(fdb_c_shim PUBLIC

View File

@ -59,6 +59,8 @@
#include "shm.hpp"
#include "stats.hpp"
#include "time.hpp"
#include "rapidjson/document.h"
#include "rapidjson/error/en.h"
namespace mako {
@ -88,14 +90,29 @@ Transaction createNewTransaction(Database db, Arguments const& args, int id = -1
}
// Create Tenant Transaction
int tenant_id = (id == -1) ? urand(0, args.active_tenants - 1) : id;
Transaction tr;
std::string tenantStr;
// If provided tenants array, use it
if (tenants) {
return tenants[tenant_id].createTransaction();
tr = tenants[tenant_id].createTransaction();
} else {
tenantStr = "tenant" + std::to_string(tenant_id);
BytesRef tenant_name = toBytesRef(tenantStr);
Tenant t = db.openTenant(tenant_name);
tr = t.createTransaction();
}
std::string tenantStr = "tenant" + std::to_string(tenant_id);
BytesRef tenant_name = toBytesRef(tenantStr);
Tenant t = db.openTenant(tenant_name);
return t.createTransaction();
if (!args.authorization_tokens.empty()) {
// lookup token based on tenant name and, if found, set authz token to transaction
if (tenantStr.empty())
tenantStr = "tenant" + std::to_string(tenant_id);
auto tokenMapItr = args.authorization_tokens.find(tenantStr);
if (tokenMapItr != args.authorization_tokens.end()) {
tr.setOption(FDB_TR_OPTION_AUTHORIZATION_TOKEN, tokenMapItr->second);
} else {
logr.warn("Authorization token map is not empty, but could not find token for tenant '{}'", tenantStr);
}
}
return tr;
}
uint64_t byteswapHelper(uint64_t input) {
@ -815,6 +832,18 @@ int workerProcessMain(Arguments const& args, int worker_id, shared_memory::Acces
logr.error("network::setOption(FDB_NET_OPTION_DISTRIBUTED_CLIENT_TRACER): {}", err.what());
}
if (args.tls_certificate_file.has_value()) {
network::setOption(FDB_NET_OPTION_TLS_CERT_PATH, args.tls_certificate_file.value());
}
if (args.tls_key_file.has_value()) {
network::setOption(FDB_NET_OPTION_TLS_KEY_PATH, args.tls_key_file.value());
}
if (args.tls_ca_file.has_value()) {
network::setOption(FDB_NET_OPTION_TLS_CA_PATH, args.tls_ca_file.value());
}
/* enable flatbuffers if specified */
if (args.flatbuffers) {
#ifdef FDB_NET_OPTION_USE_FLATBUFFERS
@ -982,57 +1011,55 @@ int workerProcessMain(Arguments const& args, int worker_id, shared_memory::Acces
}
/* initialize the parameters with default values */
int initArguments(Arguments& args) {
memset(&args, 0, sizeof(Arguments)); /* zero-out everything */
args.num_fdb_clusters = 0;
args.num_databases = 1;
args.api_version = maxApiVersion();
args.json = 0;
args.num_processes = 1;
args.num_threads = 1;
args.async_xacts = 0;
args.mode = MODE_INVALID;
args.rows = 100000;
args.load_factor = 1.0;
args.row_digits = digits(args.rows);
args.seconds = 30;
args.iteration = 0;
args.tpsmax = 0;
args.tpsmin = -1;
args.tpsinterval = 10;
args.tpschange = TPS_SIN;
args.sampling = 1000;
args.key_length = 32;
args.value_length = 16;
args.active_tenants = 0;
args.total_tenants = 0;
args.tenant_batch_size = 10000;
args.zipf = 0;
args.commit_get = 0;
args.verbose = 1;
args.flatbuffers = 0; /* internal */
args.knobs[0] = '\0';
args.log_group[0] = '\0';
args.prefixpadding = 0;
args.trace = 0;
args.tracepath[0] = '\0';
args.traceformat = 0; /* default to client's default (XML) */
args.streaming_mode = FDB_STREAMING_MODE_WANT_ALL;
args.txntrace = 0;
args.txntagging = 0;
memset(args.txntagging_prefix, 0, TAGPREFIXLENGTH_MAX);
Arguments::Arguments() {
num_fdb_clusters = 0;
num_databases = 1;
api_version = maxApiVersion();
json = 0;
num_processes = 1;
num_threads = 1;
async_xacts = 0;
mode = MODE_INVALID;
rows = 100000;
load_factor = 1.0;
row_digits = digits(rows);
seconds = 30;
iteration = 0;
tpsmax = 0;
tpsmin = -1;
tpsinterval = 10;
tpschange = TPS_SIN;
sampling = 1000;
key_length = 32;
value_length = 16;
active_tenants = 0;
total_tenants = 0;
tenant_batch_size = 10000;
zipf = 0;
commit_get = 0;
verbose = 1;
flatbuffers = 0; /* internal */
knobs[0] = '\0';
log_group[0] = '\0';
prefixpadding = 0;
trace = 0;
tracepath[0] = '\0';
traceformat = 0; /* default to client's default (XML) */
streaming_mode = FDB_STREAMING_MODE_WANT_ALL;
txntrace = 0;
txntagging = 0;
memset(txntagging_prefix, 0, TAGPREFIXLENGTH_MAX);
for (auto i = 0; i < MAX_OP; i++) {
args.txnspec.ops[i][OP_COUNT] = 0;
txnspec.ops[i][OP_COUNT] = 0;
}
args.client_threads_per_version = 0;
args.disable_client_bypass = false;
args.disable_ryw = 0;
args.json_output_path[0] = '\0';
args.stats_export_path[0] = '\0';
args.bg_materialize_files = false;
args.bg_file_path[0] = '\0';
args.distributed_tracer_client = 0;
return 0;
client_threads_per_version = 0;
disable_client_bypass = false;
disable_ryw = 0;
json_output_path[0] = '\0';
stats_export_path[0] = '\0';
bg_materialize_files = false;
bg_file_path[0] = '\0';
distributed_tracer_client = 0;
}
/* parse transaction specification */
@ -1279,6 +1306,10 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
{ "bg_file_path", required_argument, NULL, ARG_BG_FILE_PATH },
{ "stats_export_path", optional_argument, NULL, ARG_EXPORT_PATH },
{ "distributed_tracer_client", required_argument, NULL, ARG_DISTRIBUTED_TRACER_CLIENT },
{ "tls_certificate_file", required_argument, NULL, ARG_TLS_CERTIFICATE_FILE },
{ "tls_key_file", required_argument, NULL, ARG_TLS_KEY_FILE },
{ "tls_ca_file", required_argument, NULL, ARG_TLS_CA_FILE },
{ "authorization_token_file", required_argument, NULL, ARG_AUTHORIZATION_TOKEN_FILE },
{ NULL, 0, NULL, 0 }
};
idx = 0;
@ -1515,6 +1546,45 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
args.distributed_tracer_client = -1;
}
break;
case ARG_TLS_CERTIFICATE_FILE:
args.tls_certificate_file = std::string(optarg);
break;
case ARG_TLS_KEY_FILE:
args.tls_key_file = std::string(optarg);
break;
case ARG_TLS_CA_FILE:
args.tls_ca_file = std::string(optarg);
break;
case ARG_AUTHORIZATION_TOKEN_FILE: {
std::string tokenFilename(optarg);
std::ifstream ifs(tokenFilename);
std::ostringstream oss;
oss << ifs.rdbuf();
rapidjson::Document d;
d.Parse(oss.str().c_str());
if (d.HasParseError()) {
logr.error("Failed to parse authorization token JSON file '{}': {} at offset {}",
tokenFilename,
GetParseError_En(d.GetParseError()),
d.GetErrorOffset());
return -1;
} else if (!d.IsObject()) {
logr.error("Authorization token JSON file '{}' must contain a JSON object", tokenFilename);
return -1;
}
for (auto itr = d.MemberBegin(); itr != d.MemberEnd(); ++itr) {
if (!itr->value.IsString()) {
logr.error("Token '{}' is not a string", itr->name.GetString());
return -1;
}
args.authorization_tokens.insert_or_assign(
std::string(itr->name.GetString(), itr->name.GetStringLength()),
std::string(itr->value.GetString(), itr->value.GetStringLength()));
}
logr.info("Added {} tenant authorization tokens to map from file '{}'",
args.authorization_tokens.size(),
tokenFilename);
} break;
}
}
@ -1525,93 +1595,97 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
return 0;
}
int validateArguments(Arguments const& args) {
if (args.mode == MODE_INVALID) {
int Arguments::validate() {
if (mode == MODE_INVALID) {
logr.error("--mode has to be set");
return -1;
}
if (args.verbose < VERBOSE_NONE || args.verbose > VERBOSE_DEBUG) {
if (verbose < VERBOSE_NONE || verbose > VERBOSE_DEBUG) {
logr.error("--verbose must be between 0 and 3");
return -1;
}
if (args.rows <= 0) {
if (rows <= 0) {
logr.error("--rows must be a positive integer");
return -1;
}
if (args.load_factor <= 0 || args.load_factor > 1) {
if (load_factor <= 0 || load_factor > 1) {
logr.error("--load_factor must be in range (0, 1]");
return -1;
}
if (args.key_length < 0) {
if (key_length < 0) {
logr.error("--keylen must be a positive integer");
return -1;
}
if (args.value_length < 0) {
if (value_length < 0) {
logr.error("--vallen must be a positive integer");
return -1;
}
if (args.num_fdb_clusters > NUM_CLUSTERS_MAX) {
if (num_fdb_clusters > NUM_CLUSTERS_MAX) {
logr.error("Mako is not supported to do work to more than {} clusters", NUM_CLUSTERS_MAX);
return -1;
}
if (args.num_databases > NUM_DATABASES_MAX) {
if (num_databases > NUM_DATABASES_MAX) {
logr.error("Mako is not supported to do work to more than {} databases", NUM_DATABASES_MAX);
return -1;
}
if (args.num_databases < args.num_fdb_clusters) {
logr.error("--num_databases ({}) must be >= number of clusters({})", args.num_databases, args.num_fdb_clusters);
if (num_databases < num_fdb_clusters) {
logr.error("--num_databases ({}) must be >= number of clusters({})", num_databases, num_fdb_clusters);
return -1;
}
if (args.num_threads < args.num_databases) {
logr.error("--threads ({}) must be >= number of databases ({})", args.num_threads, args.num_databases);
if (num_threads < num_databases) {
logr.error("--threads ({}) must be >= number of databases ({})", num_threads, num_databases);
return -1;
}
if (args.key_length < 4 /* "mako" */ + args.row_digits) {
if (key_length < 4 /* "mako" */ + row_digits) {
logr.error("--keylen must be larger than {} to store \"mako\" prefix "
"and maximum row number",
4 + args.row_digits);
4 + row_digits);
return -1;
}
if (args.active_tenants > args.total_tenants) {
if (active_tenants > total_tenants) {
logr.error("--active_tenants must be less than or equal to --total_tenants");
return -1;
}
if (args.tenant_batch_size < 1) {
if (tenant_batch_size < 1) {
logr.error("--tenant_batch_size must be at least 1");
return -1;
}
if (args.mode == MODE_RUN) {
if ((args.seconds > 0) && (args.iteration > 0)) {
if (mode == MODE_RUN) {
if ((seconds > 0) && (iteration > 0)) {
logr.error("Cannot specify seconds and iteration together");
return -1;
}
if ((args.seconds == 0) && (args.iteration == 0)) {
if ((seconds == 0) && (iteration == 0)) {
logr.error("Must specify either seconds or iteration");
return -1;
}
if (args.txntagging < 0) {
if (txntagging < 0) {
logr.error("--txntagging must be a non-negative integer");
return -1;
}
}
// ensure that all of the files provided to mako are valid and exist
if (args.mode == MODE_REPORT) {
if (!args.num_report_files) {
if (mode == MODE_REPORT) {
if (!num_report_files) {
logr.error("No files to merge");
}
for (int i = 0; i < args.num_report_files; i++) {
for (int i = 0; i < num_report_files; i++) {
struct stat buffer;
if (stat(args.report_files[i], &buffer) != 0) {
logr.error("Couldn't open file {}", args.report_files[i]);
if (stat(report_files[i], &buffer) != 0) {
logr.error("Couldn't open file {}", report_files[i]);
return -1;
}
}
}
if (args.distributed_tracer_client < 0) {
logr.error("--disibuted_tracer_client must specify either (disabled, network_lossy, log_file)");
if (distributed_tracer_client < 0) {
logr.error("--distributed_tracer_client must specify either (disabled, network_lossy, log_file)");
return -1;
}
if (!authorization_tokens.empty() && !tls_ca_file.has_value()) {
logr.warn("Authorization tokens are being used without explicit TLS CA file configured");
}
return 0;
}
@ -2262,11 +2336,6 @@ int main(int argc, char* argv[]) {
auto rc = int{};
auto args = Arguments{};
rc = initArguments(args);
if (rc < 0) {
logr.error("initArguments failed");
return -1;
}
rc = parseArguments(argc, argv, args);
if (rc < 0) {
/* usage printed */
@ -2282,7 +2351,7 @@ int main(int argc, char* argv[]) {
args.total_tenants = args.active_tenants;
}
rc = validateArguments(args);
rc = args.validate();
if (rc < 0)
return -1;
logr.setVerbosity(args.verbose);

View File

@ -30,6 +30,7 @@
#include <cassert>
#include <chrono>
#include <list>
#include <map>
#include <vector>
#include <string_view>
#include <fdb_api.hpp>
@ -79,7 +80,11 @@ enum ArgKind {
ARG_JSON_REPORT,
ARG_BG_FILE_PATH, // if blob granule files are stored locally, mako will read and materialize them if this is set
ARG_EXPORT_PATH,
ARG_DISTRIBUTED_TRACER_CLIENT
ARG_DISTRIBUTED_TRACER_CLIENT,
ARG_TLS_CERTIFICATE_FILE,
ARG_TLS_KEY_FILE,
ARG_TLS_CA_FILE,
ARG_AUTHORIZATION_TOKEN_FILE,
};
constexpr const int OP_COUNT = 0;
@ -131,6 +136,9 @@ constexpr const int MAX_REPORT_FILES = 200;
/* benchmark parameters */
struct Arguments {
Arguments();
int validate();
int api_version;
int json;
int num_processes;
@ -180,6 +188,10 @@ struct Arguments {
char report_files[MAX_REPORT_FILES][PATH_MAX];
int num_report_files;
int distributed_tracer_client;
std::optional<std::string> tls_certificate_file;
std::optional<std::string> tls_key_file;
std::optional<std::string> tls_ca_file;
std::map<std::string, std::string> authorization_tokens; // maps tenant name to token string
};
} // namespace mako

View File

@ -38,7 +38,7 @@ Arguments
| - ``build``: Populate data
| - ``run``: Run the benchmark
- | ``-c | --cluster <cluster file>``
- | ``-c | --cluster <cluster_file>``
| FDB cluster files (Required, comma-separated)
- | ``-d | --num_databases <num_databases>``
@ -125,9 +125,21 @@ Arguments
| Disable snapshot read-your-writes
- | ``--json_report`` defaults to ``mako.json``
| ``--json_report=PATH``
| ``--json_report <path>``
| Output stats to the specified json file
- | ``--tls_certificate_file <path>``
| Use TLS certificate located in ``<path>``
- | ``--tls_key_file <path>``
| Use TLS key file located in ``<path>``
- | ``--tls_ca_file <path>``
| Use TLS CA file located in ``<path>``
- | ``--authorization_token_file <path>``
| Use authorization token JSON file located in ``<path>``
| Expected content is a JSON object where each key is a tenant name and the mapped value is a token string
Transaction Specification
=========================

View File

@ -76,38 +76,11 @@ function(generate_coverage_xml)
add_dependencies(coverage_${target_name} coveragetool)
endfunction()
# This function asserts that `versions.h` does not exist in the source
# directory. It does this in the prebuild phase of the target.
# This is an ugly hack that should make sure that cmake isn't used with
# a source directory in which FDB was previously built with `make`.
function(assert_no_version_h target)
message(STATUS "Check versions.h on ${target}")
set(target_name "${target}_versions_h_check")
if (DEFINED ENV{VERBOSE})
add_custom_target("${target_name}"
COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h"
-P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
COMMAND echo
"${CMAKE_COMMAND}" -P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
-DFILE="${CMAKE_SOURCE_DIR}/versions.h"
COMMENT "Check old build system wasn't used in source dir")
else()
add_custom_target("${target_name}"
COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h"
-P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
COMMENT "Check old build system wasn't used in source dir")
endif()
add_dependencies(${target} ${target_name})
endfunction()
add_custom_target(strip_targets)
add_dependencies(packages strip_targets)
function(strip_debug_symbols target)
if (WIN32)
if(WIN32)
return()
endif()
get_target_property(target_type ${target} TYPE)
@ -146,7 +119,7 @@ function(strip_debug_symbols target)
COMMAND objcopy --verbose --only-keep-debug $<TARGET_FILE:${target}> "${out_file}.debug"
COMMAND objcopy --verbose --add-gnu-debuglink="${out_file}.debug" "${out_file}"
COMMENT "Copy debug symbols to ${out_name}.debug")
add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
else()
add_custom_target(strip_${target})
add_dependencies(strip_${target} strip_only_${target})
@ -171,7 +144,7 @@ function(copy_headers)
foreach(f IN LISTS CP_SRCS)
is_prefix(bd "${CMAKE_CURRENT_BINARY_DIR}" "${f}")
is_prefix(sd "${CMAKE_CURRENT_SOURCE_DIR}" "${f}")
if (bd OR sd)
if(bd OR sd)
continue()
endif()
is_header(hdr "${f}")
@ -180,7 +153,7 @@ function(copy_headers)
endif()
get_filename_component(fname ${f} NAME)
get_filename_component(dname ${f} DIRECTORY)
if (dname)
if(dname)
make_directory(${incl_dir}/${dname})
endif()
set(fpath "${incl_dir}/${dname}/${fname}")
@ -309,9 +282,6 @@ function(add_flow_target)
add_custom_target(${AFT_NAME}_actors DEPENDS ${generated_files})
add_dependencies(${AFT_NAME} ${AFT_NAME}_actors)
if(NOT WIN32)
assert_no_version_h(${AFT_NAME}_actors)
endif()
generate_coverage_xml(${AFT_NAME})
if(strip_target)
strip_debug_symbols(${AFT_NAME})

View File

@ -8,40 +8,43 @@ endif()
include(ExternalProject)
ExternalProject_Add(awssdk_project
GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
GIT_CONFIG advice.detachedHead=false
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs
-DENABLE_TESTING=OFF
-DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries
-DSIMPLE_INSTALL=ON
-DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
-DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
-DBUILD_CURL=ON
-DBUILD_ZLIB=ON
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
TEST_COMMAND ""
GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
GIT_CONFIG advice.detachedHead=false
# it seems advice.detachedHead breaks something which causes aws sdk to always be rebuilt.
# This option forces cmake to build the aws sdk only once and never attempt to update it
UPDATE_DISCONNECTED ON
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs
-DENABLE_TESTING=OFF
-DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries
-DSIMPLE_INSTALL=ON
-DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
-DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
-DBUILD_CURL=ON
-DBUILD_ZLIB=ON
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
TEST_COMMAND ""
# the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must be linked in
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a"
)
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a"
)
add_library(awssdk_core STATIC IMPORTED)
add_dependencies(awssdk_core awssdk_project)

View File

@ -303,7 +303,6 @@ class TestRun:
self.stats: str | None = stats
self.expected_unseed: int | None = expected_unseed
self.use_valgrind: bool = config.use_valgrind
self.long_running: bool = config.long_running
self.old_binary_path: Path = config.old_binaries_path
self.buggify_enabled: bool = buggify_enabled
self.fault_injection_enabled: bool = True
@ -315,7 +314,7 @@ class TestRun:
# state for the run
self.retryable_error: bool = False
self.summary: Summary = Summary(binary, uid=self.uid, stats=self.stats, expected_unseed=self.expected_unseed,
will_restart=will_restart)
will_restart=will_restart, long_running=config.long_running)
self.run_time: int = 0
self.success = self.run()
@ -367,6 +366,11 @@ class TestRun:
command += ['-b', 'on']
if config.crash_on_error:
command.append('--crash')
if config.long_running:
# disable simulation speedup
command += ['--knob-sim-speedup-after-seconds=36000']
# disable traceTooManyLines Error MAX_TRACE_LINES
command += ['--knob-max-trace-lines=1000000000']
self.temp_path.mkdir(parents=True, exist_ok=True)
@ -376,7 +380,8 @@ class TestRun:
process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path,
text=True, env=env)
did_kill = False
timeout = 20 * config.kill_seconds if self.use_valgrind or self.long_running else config.kill_seconds
# No timeout for long running tests
timeout = 20 * config.kill_seconds if self.use_valgrind else (None if config.long_running else config.kill_seconds)
err_out: str
try:
_, err_out = process.communicate(timeout=timeout)

View File

@ -159,13 +159,20 @@ class Parser:
pass
class XmlParser(Parser, xml.sax.handler.ContentHandler):
class XmlParser(Parser, xml.sax.handler.ContentHandler, xml.sax.handler.ErrorHandler):
def __init__(self):
super().__init__()
self.handler: ParseHandler | None = None
def parse(self, file: TextIO, handler: ParseHandler) -> None:
xml.sax.parse(file, self)
self.handler = handler
xml.sax.parse(file, self, errorHandler=self)
def error(self, exception):
pass
def fatalError(self, exception):
pass
def startElement(self, name, attrs) -> None:
attributes: Dict[str, str] = {}
@ -276,6 +283,7 @@ class TraceFiles:
raise StopIteration
self.current += 1
return self.trace_files[self.current - 1]
return TraceFilesIterator(self)
@ -283,11 +291,12 @@ class Summary:
def __init__(self, binary: Path, runtime: float = 0, max_rss: int | None = None,
was_killed: bool = False, uid: uuid.UUID | None = None, expected_unseed: int | None = None,
exit_code: int = 0, valgrind_out_file: Path | None = None, stats: str | None = None,
error_out: str = None, will_restart: bool = False):
error_out: str = None, will_restart: bool = False, long_running: bool = False):
self.binary = binary
self.runtime: float = runtime
self.max_rss: int | None = max_rss
self.was_killed: bool = was_killed
self.long_running = long_running
self.expected_unseed: int | None = expected_unseed
self.exit_code: int = exit_code
self.out: SummaryTree = SummaryTree('Test')
@ -388,6 +397,10 @@ class Summary:
if self.was_killed:
child = SummaryTree('ExternalTimeout')
child.attributes['Severity'] = '40'
if self.long_running:
# debugging info for long-running tests
child.attributes['LongRunning'] = '1'
child.attributes['Runtime'] = str(self.runtime)
self.out.append(child)
self.error = True
if self.max_rss is not None:
@ -426,7 +439,8 @@ class Summary:
lines = self.error_out.splitlines()
stderr_bytes = 0
for line in lines:
if line.endswith("WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"):
if line.endswith(
"WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"):
# When running ASAN we expect to see this message. Boost coroutine should be using the correct asan annotations so that it shouldn't produce any false positives.
continue
if line.endswith("Warning: unimplemented fcntl command: 1036"):
@ -560,6 +574,9 @@ class Summary:
self.handler.add_handler(('Severity', '30'), parse_warning)
def parse_error(attrs: Dict[str, str]):
if 'ErrorIsInjectedFault' in attrs and attrs['ErrorIsInjectedFault'].lower() in ['1', 'true']:
# ignore injected errors. In newer fdb versions these will have a lower severity
return
self.errors += 1
self.error = True
if self.errors > config.max_errors:
@ -606,6 +623,7 @@ class Summary:
child.attributes['File'] = attrs['File']
child.attributes['Line'] = attrs['Line']
self.out.append(child)
self.handler.add_handler(('Type', 'BuggifySection'), buggify_section)
self.handler.add_handler(('Type', 'FaultInjected'), buggify_section)
@ -614,9 +632,11 @@ class Summary:
child.attributes['Name'] = attrs['Name']
child.attributes['File'] = attrs['File']
child.attributes['Line'] = attrs['Line']
self.handler.add_handler(('Type', 'RunningUnitTest'), running_unit_test)
def stderr_severity(attrs: Dict[str, str]):
if 'NewSeverity' in attrs:
self.stderr_severity = attrs['NewSeverity']
self.handler.add_handler(('Type', 'StderrSeverity'), stderr_severity)

View File

@ -55,6 +55,6 @@ if __name__ == '__main__':
summary.summarize_files(files)
summary.out.dump(sys.stdout)
else:
summary = Summary(Path('bin/fdbserver'), was_killed=True)
summary = Summary(Path('bin/fdbserver'), was_killed=True, long_running=config.long_running)
summary.summarize_files(files)
summary.out.dump(sys.stdout)

View File

@ -34,20 +34,25 @@ Commit proxies would combine idempotency IDs for transactions within a batch. Th
## Value format
```
${protocol_version}(${n (1 byte)}${idempotency_id (n bytes)}${low_order_byte_of_batch_index})*
${protocol_version}${timestamp}(${n (1 byte)}${idempotency_id (n bytes)}${low_order_byte_of_batch_index})*
```
The batch index for each idempotency id can be reconstructed from the high order byte and low order bytes stored in the key and value, respectively. This is necessary for an "unknown_committed" transaction to recover its full version stamp. Batch index is a `short int`, i.e. 2 bytes.
The timestamp is the unix epoch stored as a little-endian signed 64-bit integer.
# Cleaning up old idempotency ids
After learning the result of an attempt to commit a transaction with an
idempotency id, the client may inform the cluster that it's no longer interested
in that id and the cluster can reclaim the space used to store the idempotency
id. The happy-path reply to a CommitTransactionRequest will say which proxy this
request should be sent to, and all idempotency ids for a database key will be
sent to the same proxy so that it can clear the key once it receives all of
them. The first proxy will also periodically clean up the oldest idempotency ids, based on a policy determined by two knobs. One knob will control the minimum lifetime of an idempotency id (i.e. don't delete anything younger than 1 day), and the other will control the target byte size of the idempotency keys (e.g. keep 100 MB of idempotency keys around).
id. The commit proxy that committed a batch is responsible for cleaning all
idempotency kv pairs from that batch, so clients must tell that specific proxy
that they're done with the id. The first proxy will also periodically clean up
the oldest idempotency ids, based on a policy determined by two knobs. One knob
will control the minimum lifetime of an idempotency id (i.e. don't delete
anything younger than 1 day), and the other will control the target byte size of
the idempotency keys (e.g. keep 100 MB of idempotency keys around).
# Commit protocol

View File

@ -49,7 +49,7 @@ master_doc = 'index'
# General information about the project.
project = u'FoundationDB'
copyright = u'2013-2021 Apple, Inc and the FoundationDB project authors'
copyright = u'2013-2022 Apple, Inc and the FoundationDB project authors'
# Load the version information from 'versions.target'
import xml.etree.ElementTree as ET

View File

@ -2365,6 +2365,7 @@ ACTOR Future<Void> runRestore(Database db,
KeyRef(addPrefix),
KeyRef(removePrefix),
LockDB::True,
UnlockDB::True,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,

View File

@ -83,7 +83,7 @@ BlobCipherMetrics::BlobCipherMetrics()
CounterSet(cc, "Backup"),
CounterSet(cc, "Test") }) {
specialCounter(cc, "CacheSize", []() { return BlobCipherKeyCache::getInstance()->getSize(); });
traceFuture = traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, &cc);
traceFuture = cc.traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL);
}
std::string toString(BlobCipherMetrics::UsageType type) {

View File

@ -0,0 +1,109 @@
/*
* BlobMetadataUtils.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/BlobMetadataUtils.h"
#include "fmt/format.h"
#include "flow/IRandom.h"
#include "flow/flow.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/S3BlobStore.h"
std::string buildPartitionPath(const std::string& url, const std::string& partition) {
ASSERT(!partition.empty());
ASSERT(partition.front() != '/');
ASSERT(partition.back() == '/');
StringRef u(url);
if (u.startsWith("file://"_sr)) {
ASSERT(u.endsWith("/"_sr));
return url + partition;
} else if (u.startsWith("blobstore://"_sr)) {
std::string resource;
std::string lastOpenError;
S3BlobStoreEndpoint::ParametersT backupParams;
std::string urlCopy = url;
Reference<S3BlobStoreEndpoint> bstore =
S3BlobStoreEndpoint::fromString(url, {}, &resource, &lastOpenError, &backupParams);
ASSERT(!resource.empty());
ASSERT(resource.back() != '/');
size_t resourceStart = url.find(resource);
ASSERT(resourceStart != std::string::npos);
return urlCopy.insert(resourceStart + resource.size(), "/" + partition);
} else {
// FIXME: support azure
throw backup_invalid_url();
}
}
// FIXME: make this (more) deterministic outside of simulation for FDBPerfKmsConnector
// Builds a randomized Standalone<BlobMetadataDetailsRef> for the given domain, for use by
// simulated/test KMS connectors. Chooses one of three partitioning schemes at random and
// fills in base/partition paths derived from baseUrl. Also assigns refresh/expire times:
// either now()-relative (knob-scaled) or effectively "never" (double max).
// NOTE(review): the order of deterministicRandom() calls here matters for simulation
// reproducibility — do not reorder.
Standalone<BlobMetadataDetailsRef> createRandomTestBlobMetadata(const std::string& baseUrl,
BlobMetadataDomainId domainId,
BlobMetadataDomainName domainName) {
Standalone<BlobMetadataDetailsRef> metadata;
metadata.domainId = domainId;
// Keep domainName's backing memory alive for the lifetime of metadata.
metadata.arena().dependsOn(domainName.arena());
metadata.domainName = domainName;
// 0 == no partition, 1 == suffix partitioned, 2 == storage location partitioned
int type = deterministicRandom()->randomInt(0, 3);
int partitionCount = (type == 0) ? 0 : deterministicRandom()->randomInt(2, 12);
TraceEvent ev(SevDebug, "SimBlobMetadata");
ev.detail("DomainId", domainId).detail("TypeNum", type).detail("PartitionCount", partitionCount);
if (type == 0) {
// single storage location
std::string partition = std::to_string(domainId) + "/";
metadata.base = StringRef(metadata.arena(), buildPartitionPath(baseUrl, partition));
ev.detail("Base", metadata.base);
}
if (type == 1) {
// simulate hash prefixing in s3: one shared base, randomly-named partition suffixes
metadata.base = StringRef(metadata.arena(), baseUrl);
ev.detail("Base", metadata.base);
for (int i = 0; i < partitionCount; i++) {
metadata.partitions.push_back_deep(metadata.arena(),
deterministicRandom()->randomUniqueID().shortString() + "-" +
std::to_string(domainId) + "/");
ev.detail("P" + std::to_string(i), metadata.partitions.back());
}
}
if (type == 2) {
// simulate separate storage location per partition; no shared base path
for (int i = 0; i < partitionCount; i++) {
std::string partition = std::to_string(domainId) + "_" + std::to_string(i) + "/";
metadata.partitions.push_back_deep(metadata.arena(), buildPartitionPath(baseUrl, partition));
ev.detail("P" + std::to_string(i), metadata.partitions.back());
}
}
// set random refresh + expire time; expireAt is always >= refreshAt
if (deterministicRandom()->coinflip()) {
metadata.refreshAt = now() + deterministicRandom()->random01() * CLIENT_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
metadata.expireAt =
metadata.refreshAt + deterministicRandom()->random01() * CLIENT_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
} else {
// never refresh/expire
metadata.refreshAt = std::numeric_limits<double>::max();
metadata.expireAt = metadata.refreshAt;
}
return metadata;
}

View File

@ -273,6 +273,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( WRITE_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096;
init( READ_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096;
init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO, 5.0 );
init( PROXY_MAX_TAG_THROTTLE_DURATION, 5.0 ); if( randomize && BUGGIFY ) PROXY_MAX_TAG_THROTTLE_DURATION = 0.5;
// busyness reporting
init( BUSYNESS_SPIKE_START_THRESHOLD, 0.100 );
@ -281,6 +282,7 @@ void ClientKnobs::initialize(Randomize randomize) {
// Blob granules
init( BG_MAX_GRANULE_PARALLELISM, 10 );
init( BG_TOO_MANY_GRANULES, 10000 );
init( BLOB_METADATA_REFRESH_INTERVAL, 3600 ); if ( randomize && BUGGIFY ) { BLOB_METADATA_REFRESH_INTERVAL = deterministicRandom()->randomInt(5, 120); }
init( CHANGE_QUORUM_BAD_STATE_RETRY_TIMES, 3 );
init( CHANGE_QUORUM_BAD_STATE_RETRY_DELAY, 2.0 );

View File

@ -22,6 +22,16 @@
#include "fdbclient/Knobs.h"
#include "fdbclient/NativeAPI.actor.h"
// Strips a keyspace prefix from both endpoints of a range. An endpoint that does not
// carry the prefix is clamped to the corresponding end of the normal-key space
// (allKeys.begin / allKeys.end). An empty prefix returns the range unchanged.
KeyRangeRef toPrefixRelativeRange(KeyRangeRef range, KeyRef prefix) {
    if (prefix.empty()) {
        return range;
    }
    KeyRef newBegin = allKeys.begin;
    if (range.begin.startsWith(prefix)) {
        newBegin = range.begin.removePrefix(prefix);
    }
    KeyRef newEnd = allKeys.end;
    if (range.end.startsWith(prefix)) {
        newEnd = range.end.removePrefix(prefix);
    }
    return KeyRangeRef(newBegin, newEnd);
}
KeyRef keyBetween(const KeyRangeRef& keys) {
int pos = 0; // will be the position of the first difference between keys.begin and keys.end
int minSize = std::min(keys.begin.size(), keys.end.size());

View File

@ -167,6 +167,7 @@ public:
KeyBackedProperty<Key> removePrefix() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<bool> onlyApplyMutationLogs() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<bool> inconsistentSnapshotOnly() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<bool> unlockDBAfterRestore() { return configSpace.pack(__FUNCTION__sr); }
// XXX: Remove restoreRange() once it is safe to remove. It has been changed to restoreRanges
KeyBackedProperty<KeyRange> restoreRange() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<std::vector<KeyRange>> restoreRanges() { return configSpace.pack(__FUNCTION__sr); }
@ -591,12 +592,11 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
}
ACTOR static Future<StringRef> decryptImpl(Database cx,
StringRef headerS,
BlobCipherEncryptHeader header,
const uint8_t* dataP,
int64_t dataLen,
Arena* arena) {
Reference<AsyncVar<ClientDBInfo> const> dbInfo = cx->clientInfo;
state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS);
TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, header, BlobCipherMetrics::BACKUP));
ASSERT(cipherKeys.cipherHeaderKey.isValid() && cipherKeys.cipherTextKey.isValid());
validateEncryptionHeader(cipherKeys.cipherHeaderKey, cipherKeys.cipherTextKey, header);
@ -606,7 +606,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
}
static Future<StringRef> decrypt(Database cx,
StringRef headerS,
BlobCipherEncryptHeader headerS,
const uint8_t* dataP,
int64_t dataLen,
Arena* arena) {
@ -651,7 +651,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
}
ACTOR static Future<Void> updateEncryptionKeysCtx(EncryptedRangeFileWriter* self, KeyRef key) {
state std::pair<int64_t, TenantName> curTenantInfo = wait(getEncryptionDomainDetails(key, self));
state std::pair<int64_t, TenantName> curTenantInfo = wait(getEncryptionDomainDetails(key, self->tenantCache));
state Reference<AsyncVar<ClientDBInfo> const> dbInfo = self->cx->clientInfo;
// Get text and header cipher key
@ -693,12 +693,13 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
static bool isSystemKey(KeyRef key) { return key.size() && key[0] == systemKeys.begin[0]; }
ACTOR static Future<std::pair<int64_t, TenantName>>
getEncryptionDomainDetailsImpl(KeyRef key, Reference<TenantEntryCache<Void>> tenantCache, bool useTenantCache) {
ACTOR static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetailsImpl(
KeyRef key,
Reference<TenantEntryCache<Void>> tenantCache) {
if (isSystemKey(key)) {
return std::make_pair(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
if (key.size() < TENANT_PREFIX_SIZE || !useTenantCache) {
if (key.size() < TENANT_PREFIX_SIZE) {
return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
}
KeyRef tenantPrefix = KeyRef(key.begin(), TENANT_PREFIX_SIZE);
@ -710,21 +711,10 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
}
static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetails(KeyRef key,
EncryptedRangeFileWriter* self) {
// If tenants are disabled on a cluster then don't use the TenantEntryCache as it will result in alot of
// unnecessary cache misses. For a cluster configured in TenantMode::Optional, the backup performance may
// degrade if most of the mutations belong to an invalid tenant
TenantMode mode = self->cx->clientInfo->get().tenantMode;
bool useTenantCache = mode != TenantMode::DISABLED;
if (g_network->isSimulated() && mode == TenantMode::OPTIONAL_TENANT) {
// TODO: Currently simulation tests run with optional tenant mode but most data does not belong to any
// tenant. This results in many timeouts so disable using the tenant cache until optional tenant mode
// support with backups is more performant
useTenantCache = false;
}
CODE_PROBE(useTenantCache, "using tenant cache");
return getEncryptionDomainDetailsImpl(key, self->tenantCache, useTenantCache);
static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetails(
KeyRef key,
Reference<TenantEntryCache<Void>> tenantCache) {
return getEncryptionDomainDetailsImpl(key, tenantCache);
}
// Handles the first block and internal blocks. Ends current block if needed.
@ -816,6 +806,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
curKeyTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
endKey = StringRef(k.begin(), TENANT_PREFIX_SIZE);
}
state ValueRef newValue = StringRef();
self->lastKey = k;
self->lastValue = v;
@ -834,9 +825,9 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
if (self->lastKey.size() == 0 || k.size() == 0) {
return false;
}
state std::pair<int64_t, TenantName> curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self));
state std::pair<int64_t, TenantName> prevKeyTenantInfo = wait(getEncryptionDomainDetails(self->lastKey, self));
// crossing tenant boundaries so finish the current block using only the tenant prefix of the new key
state std::pair<int64_t, TenantName> curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self->tenantCache));
state std::pair<int64_t, TenantName> prevKeyTenantInfo =
wait(getEncryptionDomainDetails(self->lastKey, self->tenantCache));
if (curKeyTenantInfo.first != prevKeyTenantInfo.first) {
CODE_PROBE(true, "crossed tenant boundaries");
wait(handleTenantBondary(self, k, v, writeValue, curKeyTenantInfo));
@ -1040,11 +1031,18 @@ private:
Key lastValue;
};
void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>* results) {
ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
Standalone<VectorRef<KeyValueRef>>* results,
bool encryptedBlock,
Optional<Reference<TenantEntryCache<Void>>> tenantCache,
Optional<BlobCipherEncryptHeader> encryptHeader) {
// Read begin key, if this fails then block was invalid.
uint32_t kLen = reader->consumeNetworkUInt32();
const uint8_t* k = reader->consume(kLen);
state uint32_t kLen = reader->consumeNetworkUInt32();
state const uint8_t* k = reader->consume(kLen);
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
state KeyRef prevKey = KeyRef(k, kLen);
state bool done = false;
state Optional<std::pair<int64_t, TenantName>> prevTenantInfo;
// Read kv pairs and end key
while (1) {
@ -1052,6 +1050,35 @@ void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>*
kLen = reader->consumeNetworkUInt32();
k = reader->consume(kLen);
// make sure that all keys in a block belong to exactly one tenant,
// unless its the last key in which case it can be a truncated (different) tenant prefix
if (encryptedBlock && g_network && g_network->isSimulated()) {
ASSERT(tenantCache.present());
ASSERT(encryptHeader.present());
state KeyRef curKey = KeyRef(k, kLen);
if (!prevTenantInfo.present()) {
std::pair<int64_t, TenantName> tenantInfo =
wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(prevKey, tenantCache.get()));
prevTenantInfo = tenantInfo;
}
std::pair<int64_t, TenantName> curTenantInfo =
wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(curKey, tenantCache.get()));
if (!curKey.empty() && !prevKey.empty() && prevTenantInfo.get().first != curTenantInfo.first) {
ASSERT(!done);
if (curTenantInfo.first != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID &&
curTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(curKey.size() == TENANT_PREFIX_SIZE);
}
done = true;
}
// make sure that all keys (except possibly the last key) in a block are encrypted using the correct key
if (!prevKey.empty()) {
ASSERT(prevTenantInfo.get().first == encryptHeader.get().cipherTextDetails.encryptDomainId);
}
prevKey = curKey;
prevTenantInfo = curTenantInfo;
}
// If eof reached or first value len byte is 0xFF then a valid block end was reached.
if (reader->eof() || *reader->rptr == 0xFF) {
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
@ -1072,6 +1099,8 @@ void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>*
for (auto b : reader->remainder())
if (b != 0xFF)
throw restore_corrupted_data_padding();
return Void();
}
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
@ -1094,7 +1123,11 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
int32_t file_version = reader.consume<int32_t>();
if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
decodeKVPairs(&reader, &results);
wait(decodeKVPairs(&reader,
&results,
false,
Optional<Reference<TenantEntryCache<Void>>>(),
Optional<BlobCipherEncryptHeader>()));
} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
CODE_PROBE(true, "decoding encrypted block");
ASSERT(cx.present());
@ -1108,7 +1141,8 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// read encryption header
const uint8_t* headerStart = reader.consume(BlobCipherEncryptHeader::headerSize);
StringRef header = StringRef(headerStart, BlobCipherEncryptHeader::headerSize);
StringRef headerS = StringRef(headerStart, BlobCipherEncryptHeader::headerSize);
state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS);
const uint8_t* dataPayloadStart = headerStart + BlobCipherEncryptHeader::headerSize;
// calculate the total bytes read up to (and including) the header
int64_t bytesRead = sizeof(int32_t) + sizeof(uint32_t) + optionsLen + BlobCipherEncryptHeader::headerSize;
@ -1117,7 +1151,12 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
StringRef decryptedData =
wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena()));
reader = StringRefReader(decryptedData, restore_corrupted_data());
decodeKVPairs(&reader, &results);
state Optional<Reference<TenantEntryCache<Void>>> tenantCache;
if (g_network && g_simulator->isSimulated()) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx.get(), TenantEntryCacheRefreshMode::WATCH);
wait(tenantCache.get()->init());
}
wait(decodeKVPairs(&reader, &results, true, tenantCache, header));
} else {
throw restore_unsupported_file_version();
}
@ -1711,7 +1750,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
state bool done = false;
state int64_t nrKeys = 0;
state bool encryptionEnabled = false;
state Optional<bool> encryptionEnabled;
loop {
state RangeResultWithVersion values;
@ -1777,7 +1816,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
wait(taskBucket->keepRunning(tr, task) &&
storeOrThrow(snapshotBeginVersion, backup.snapshotBeginVersion().get(tr)) &&
storeOrThrow(encryptionEnabled, backup.enableSnapshotBackupEncryption().get(tr)) &&
store(encryptionEnabled, backup.enableSnapshotBackupEncryption().get(tr)) &&
store(snapshotRangeFileCount, backup.snapshotRangeFileCount().getD(tr)));
break;
@ -1790,9 +1829,10 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
wait(bc->writeRangeFile(snapshotBeginVersion, snapshotRangeFileCount, outVersion, blockSize));
outFile = f;
encryptionEnabled = encryptionEnabled && cx->clientInfo->get().isEncryptionEnabled;
const bool encrypted =
encryptionEnabled.present() && encryptionEnabled.get() && cx->clientInfo->get().isEncryptionEnabled;
// Initialize range file writer and write begin key
if (encryptionEnabled) {
if (encrypted) {
CODE_PROBE(true, "using encrypted snapshot file writer");
if (!tenantCache.isValid()) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);
@ -3398,6 +3438,8 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase {
state RestoreConfig restore(task);
restore.stateEnum().set(tr, ERestoreState::COMPLETED);
state bool unlockDB = wait(restore.unlockDBAfterRestore().getD(tr, Snapshot::False, true));
tr->atomicOp(metadataVersionKey, metadataVersionRequiredValue, MutationRef::SetVersionstampedValue);
// Clear the file map now since it could be huge.
restore.fileSet().clear(tr);
@ -3413,7 +3455,9 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase {
restore.clearApplyMutationsKeys(tr);
wait(taskBucket->finish(tr, task));
wait(unlockDatabase(tr, restore.getUid()));
if (unlockDB) {
wait(unlockDatabase(tr, restore.getUid()));
}
return Void();
}
@ -5172,6 +5216,7 @@ public:
Key addPrefix,
Key removePrefix,
LockDB lockDB,
UnlockDB unlockDB,
OnlyApplyMutationLogs onlyApplyMutationLogs,
InconsistentSnapshotOnly inconsistentSnapshotOnly,
Version beginVersion,
@ -5245,6 +5290,7 @@ public:
restore.onlyApplyMutationLogs().set(tr, onlyApplyMutationLogs);
restore.inconsistentSnapshotOnly().set(tr, inconsistentSnapshotOnly);
restore.beginVersion().set(tr, beginVersion);
restore.unlockDBAfterRestore().set(tr, unlockDB);
if (BUGGIFY && restoreRanges.size() == 1) {
restore.restoreRange().set(tr, restoreRanges[0]);
} else {
@ -5836,6 +5882,7 @@ public:
Key addPrefix,
Key removePrefix,
LockDB lockDB,
UnlockDB unlockDB,
OnlyApplyMutationLogs onlyApplyMutationLogs,
InconsistentSnapshotOnly inconsistentSnapshotOnly,
Version beginVersion,
@ -5892,6 +5939,7 @@ public:
addPrefix,
removePrefix,
lockDB,
unlockDB,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,
@ -6017,7 +6065,7 @@ public:
}
}
Reference<IBackupContainer> bc = wait(backupConfig.backupContainer().getOrThrow(cx.getReference()));
state Reference<IBackupContainer> bc = wait(backupConfig.backupContainer().getOrThrow(cx.getReference()));
if (fastRestore) {
TraceEvent("AtomicParallelRestoreStartRestore").log();
@ -6043,24 +6091,80 @@ public:
return -1;
} else {
TraceEvent("AS_StartRestore").log();
Version ver = wait(restore(backupAgent,
cx,
cx,
tagName,
KeyRef(bc->getURL()),
bc->getProxy(),
ranges,
WaitForComplete::True,
::invalidVersion,
Verbose::True,
addPrefix,
removePrefix,
LockDB::True,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
{},
randomUid));
state Standalone<VectorRef<KeyRangeRef>> restoreRange;
state Standalone<VectorRef<KeyRangeRef>> systemRestoreRange;
bool encryptionEnabled = cx->clientInfo->get().isEncryptionEnabled;
for (auto r : ranges) {
if (!encryptionEnabled || !r.intersects(getSystemBackupRanges())) {
restoreRange.push_back_deep(restoreRange.arena(), r);
} else {
KeyRangeRef normalKeyRange = r & normalKeys;
KeyRangeRef systemKeyRange = r & systemKeys;
if (!normalKeyRange.empty()) {
restoreRange.push_back_deep(restoreRange.arena(), normalKeyRange);
}
if (!systemKeyRange.empty()) {
systemRestoreRange.push_back_deep(systemRestoreRange.arena(), systemKeyRange);
}
}
}
if (!systemRestoreRange.empty()) {
// restore system keys
wait(success(restore(backupAgent,
cx,
cx,
"system_restore"_sr,
KeyRef(bc->getURL()),
bc->getProxy(),
systemRestoreRange,
WaitForComplete::True,
::invalidVersion,
Verbose::True,
addPrefix,
removePrefix,
LockDB::True,
UnlockDB::False,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
{},
randomUid)));
state Reference<ReadYourWritesTransaction> rywTransaction =
Reference<ReadYourWritesTransaction>(new ReadYourWritesTransaction(cx));
// clear old restore config associated with system keys
loop {
try {
rywTransaction->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
rywTransaction->setOption(FDBTransactionOptions::LOCK_AWARE);
state RestoreConfig oldRestore(randomUid);
oldRestore.clear(rywTransaction);
wait(rywTransaction->commit());
break;
} catch (Error& e) {
wait(rywTransaction->onError(e));
}
}
}
// restore user data
state Version ver = wait(restore(backupAgent,
cx,
cx,
tagName,
KeyRef(bc->getURL()),
bc->getProxy(),
restoreRange,
WaitForComplete::True,
::invalidVersion,
Verbose::True,
addPrefix,
removePrefix,
LockDB::True,
UnlockDB::True,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
{},
randomUid));
return ver;
}
}
@ -6120,6 +6224,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
Key addPrefix,
Key removePrefix,
LockDB lockDB,
UnlockDB unlockDB,
OnlyApplyMutationLogs onlyApplyMutationLogs,
InconsistentSnapshotOnly inconsistentSnapshotOnly,
Version beginVersion,
@ -6137,6 +6242,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
addPrefix,
removePrefix,
lockDB,
unlockDB,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,
@ -6178,6 +6284,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
addPrefix,
removePrefix,
lockDB,
UnlockDB::True,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,

View File

@ -1,5 +1,5 @@
/*
* IdempotencyId.cpp
* IdempotencyId.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
@ -18,9 +18,11 @@
* limitations under the License.
*/
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/SystemData.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // this has to be the last include
struct IdempotencyIdKVBuilderImpl {
Optional<Version> commitVersion;
@ -40,6 +42,7 @@ void IdempotencyIdKVBuilder::add(const IdempotencyIdRef& id, uint16_t batchIndex
ASSERT((batchIndex >> 8) == impl->batchIndexHighOrderByte.get());
} else {
impl->batchIndexHighOrderByte = batchIndex >> 8;
impl->value << int64_t(now());
}
StringRef s = id.asStringRefUnsafe();
impl->value << uint8_t(s.size());
@ -53,19 +56,17 @@ Optional<KeyValue> IdempotencyIdKVBuilder::buildAndClear() {
return {};
}
BinaryWriter key{ Unversioned() };
key.serializeBytes(idempotencyIdKeys.begin);
key << bigEndian64(impl->commitVersion.get());
key << impl->batchIndexHighOrderByte.get();
Value v = impl->value.toValue();
KeyRef key =
makeIdempotencySingleKeyRange(v.arena(), impl->commitVersion.get(), impl->batchIndexHighOrderByte.get()).begin;
impl->value = BinaryWriter(IncludeVersion());
impl->batchIndexHighOrderByte = Optional<uint8_t>();
Optional<KeyValue> result = KeyValue();
result.get().arena() = v.arena();
result.get().key = key.toValue(result.get().arena());
result.get().key = key;
result.get().value = v;
return result;
}
@ -86,6 +87,8 @@ Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const Idem
// Even if id is a substring of value, it may still not actually contain it.
BinaryReader reader(kv.value.begin(), kv.value.size(), IncludeVersion());
int64_t timestamp; // ignored
reader >> timestamp;
while (!reader.empty()) {
uint8_t length;
reader >> length;
@ -93,13 +96,9 @@ Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const Idem
uint8_t lowOrderBatchIndex;
reader >> lowOrderBatchIndex;
if (candidate == needle) {
BinaryReader reader(kv.key.begin(), kv.key.size(), Unversioned());
reader.readBytes(idempotencyIdKeys.begin.size());
Version commitVersion;
reader >> commitVersion;
commitVersion = bigEndian64(commitVersion);
uint8_t highOrderBatchIndex;
reader >> highOrderBatchIndex;
decodeIdempotencyKey(kv.key, commitVersion, highOrderBatchIndex);
return CommitResult{ commitVersion,
static_cast<uint16_t>((uint16_t(highOrderBatchIndex) << 8) |
uint16_t(lowOrderBatchIndex)) };
@ -172,4 +171,35 @@ TEST_CASE("/fdbclient/IdempotencyId/serialization") {
ASSERT(t == id);
}
return Void();
}
// Encodes the key for a single idempotency-id entry — subspace prefix, then the commit
// version in big-endian (so keys sort by version), then the high-order batch-index byte —
// and returns the single-key range [key, key + '\x00'). Both endpoints are allocated in
// one arena string; the begin key is the end key minus its trailing NUL.
KeyRangeRef makeIdempotencySingleKeyRange(Arena& arena, Version version, uint8_t highOrderBatchIndex) {
    static const auto totalSize =
        idempotencyIdKeys.begin.size() + sizeof(version) + sizeof(highOrderBatchIndex) + /*\x00*/ 1;
    StringRef endKey = makeString(totalSize, arena);
    auto* out = mutateString(endKey);
    memcpy(out, idempotencyIdKeys.begin.begin(), idempotencyIdKeys.begin.size());
    out += idempotencyIdKeys.begin.size();
    // Big-endian so lexicographic key order matches numeric version order.
    version = bigEndian64(version);
    memcpy(out, &version, sizeof(version));
    out += sizeof(version);
    *out++ = highOrderBatchIndex;
    *out++ = 0;
    ASSERT_EQ(out - endKey.begin(), totalSize);
    return KeyRangeRef(endKey.removeSuffix("\x00"_sr), endKey);
}
// Inverse of makeIdempotencySingleKeyRange's encoding: skips the idempotency subspace
// prefix, then extracts the big-endian commit version and the high-order batch-index
// byte from the key.
void decodeIdempotencyKey(KeyRef key, Version& commitVersion, uint8_t& highOrderBatchIndex) {
    BinaryReader rd(key, Unversioned());
    rd.readBytes(idempotencyIdKeys.begin.size()); // skip subspace prefix
    Version beVersion;
    rd >> beVersion;
    commitVersion = bigEndian64(beVersion); // stored big-endian for key ordering
    rd >> highOrderBatchIndex;
}

View File

@ -2639,7 +2639,8 @@ TEST_CASE("/ManagementAPI/AutoQuorumChange/checkLocality") {
ProcessClass(ProcessClass::CoordinatorClass, ProcessClass::CommandLineSource),
"",
"",
currentProtocolVersion());
currentProtocolVersion(),
false);
}
workers.push_back(data);

View File

@ -1888,6 +1888,9 @@ void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional
TraceEvent("UnknownDatabaseOption").detail("Option", option);
throw invalid_option();
}
if (itr->first == FDBDatabaseOptions::USE_CONFIG_DATABASE) {
dbState->isConfigDB = true;
}
int defaultFor = itr->second.defaultFor;
if (defaultFor >= 0) {
@ -1994,7 +1997,7 @@ ThreadFuture<ProtocolVersion> MultiVersionDatabase::getServerProtocol(Optional<P
MultiVersionDatabase::DatabaseState::DatabaseState(ClusterConnectionRecord const& connectionRecord,
Reference<IDatabase> versionMonitorDb)
: dbVar(new ThreadSafeAsyncVar<Reference<IDatabase>>(Reference<IDatabase>(nullptr))),
connectionRecord(connectionRecord), versionMonitorDb(versionMonitorDb), closed(false) {}
connectionRecord(connectionRecord), versionMonitorDb(versionMonitorDb), closed(false), isConfigDB(false) {}
// Adds a client (local or externally loaded) that can be used to connect to the cluster
void MultiVersionDatabase::DatabaseState::addClient(Reference<ClientInfo> client) {
@ -2192,8 +2195,12 @@ void MultiVersionDatabase::DatabaseState::updateDatabase(Reference<IDatabase> ne
.detail("ConnectionRecord", connectionRecord);
}
}
// Verify the database has the necessary functionality to update the shared
// state. Avoid updating the shared state if the database is a
// configuration database, because a configuration database does not have
// access to typical system keys and does not need to be updated.
if (db.isValid() && dbProtocolVersion.present() &&
MultiVersionApi::api->getApiVersion().hasClusterSharedStateMap()) {
MultiVersionApi::api->getApiVersion().hasClusterSharedStateMap() && !isConfigDB) {
Future<std::string> updateResult =
MultiVersionApi::api->updateClusterSharedStateMap(connectionRecord, dbProtocolVersion.get(), db);
sharedStateUpdater = map(errorOr(updateResult), [this](ErrorOr<std::string> result) {

View File

@ -1479,16 +1479,6 @@ Future<RangeResult> HealthMetricsRangeImpl::getRange(ReadYourWritesTransaction*
return healthMetricsGetRangeActor(ryw, kr);
}
KeyRangeRef toRelativeRange(KeyRangeRef range, KeyRef prefix) {
if (prefix.empty()) {
return range;
} else {
KeyRef begin = range.begin.startsWith(prefix) ? range.begin.removePrefix(prefix) : allKeys.begin;
KeyRef end = range.end.startsWith(prefix) ? range.end.removePrefix(prefix) : allKeys.end;
return KeyRangeRef(begin, end);
}
}
ACTOR Future<UID> getClusterId(Database db) {
while (!db->clientInfo->get().clusterId.isValid()) {
wait(db->clientInfo->onChange());
@ -1925,7 +1915,8 @@ Optional<KeyRangeLocationInfo> DatabaseContext::getCachedLocation(const Optional
auto range =
isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey);
if (range->value()) {
return KeyRangeLocationInfo(tenantEntry, toRelativeRange(range->range(), tenantEntry.prefix), range->value());
return KeyRangeLocationInfo(
tenantEntry, toPrefixRelativeRange(range->range(), tenantEntry.prefix), range->value());
}
return Optional<KeyRangeLocationInfo>();
@ -1962,7 +1953,8 @@ bool DatabaseContext::getCachedLocations(const Optional<TenantNameRef>& tenantNa
result.clear();
return false;
}
result.emplace_back(tenantEntry, toRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value());
result.emplace_back(
tenantEntry, toPrefixRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value());
if (result.size() == limit || begin == end) {
break;
}
@ -2978,7 +2970,7 @@ ACTOR Future<KeyRangeLocationInfo> getKeyLocation_internal(Database cx,
return KeyRangeLocationInfo(
rep.tenantEntry,
KeyRange(toRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena),
KeyRange(toPrefixRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena),
locationInfo);
}
}
@ -3123,7 +3115,7 @@ ACTOR Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations_internal(
// efficient to save the map pairs and insert them all at once.
results.emplace_back(
rep.tenantEntry,
(toRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys),
(toPrefixRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys),
cx->setCachedLocation(
tenant.name, rep.tenantEntry, rep.results[shard].first, rep.results[shard].second));
wait(yield());
@ -4025,6 +4017,7 @@ Future<RangeResultFamily> getExactRange(Reference<TransactionState> trState,
req.version = version;
req.begin = firstGreaterOrEqual(range.begin);
req.end = firstGreaterOrEqual(range.end);
setMatchIndex<GetKeyValuesFamilyRequest>(req, matchIndex);
req.spanContext = span.context;
trState->cx->getLatestCommitVersions(
@ -6158,6 +6151,7 @@ ACTOR static Future<Optional<CommitResult>> determineCommitStatus(Reference<Tran
IdempotencyIdRef idempotencyId) {
state Transaction tr(trState->cx);
state int retries = 0;
state Version expiredVersion;
state Span span("NAPI:determineCommitStatus"_loc, trState->spanContext);
tr.span.setParent(span.context);
loop {
@ -6167,11 +6161,19 @@ ACTOR static Future<Optional<CommitResult>> determineCommitStatus(Reference<Tran
tr.trState->authToken = trState->authToken;
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
KeyBackedObjectProperty<IdempotencyIdsExpiredVersion, _Unversioned> expiredKey(idempotencyIdsExpiredVersion,
Unversioned());
IdempotencyIdsExpiredVersion expiredVal = wait(expiredKey.getD(&tr));
expiredVersion = expiredVal.expired;
if (expiredVersion >= minPossibleCommitVersion) {
throw commit_unknown_result_fatal();
}
Version rv = wait(tr.getReadVersion());
TraceEvent("DetermineCommitStatusAttempt")
.detail("IdempotencyId", idempotencyId.asStringRefUnsafe())
.detail("Retries", retries)
.detail("ReadVersion", rv)
.detail("ExpiredVersion", expiredVersion)
.detail("MinPossibleCommitVersion", minPossibleCommitVersion)
.detail("MaxPossibleCommitVersion", maxPossibleCommitVersion);
KeyRange possibleRange =
@ -6415,6 +6417,12 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
req.debugID = commitID;
state Future<CommitID> reply;
// Only gets filled in in the happy path where we don't have to commit on the first proxy or use provisional
// proxies
state int alternativeChosen = -1;
// Only valid if alternativeChosen >= 0
state Reference<CommitProxyInfo> proxiesUsed;
if (trState->options.commitOnFirstProxy) {
if (trState->cx->clientInfo->get().firstCommitProxy.present()) {
reply = throwErrorOr(brokenPromiseToMaybeDelivered(
@ -6425,11 +6433,13 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
: Never();
}
} else {
reply = basicLoadBalance(trState->cx->getCommitProxies(trState->useProvisionalProxies),
proxiesUsed = trState->cx->getCommitProxies(trState->useProvisionalProxies);
reply = basicLoadBalance(proxiesUsed,
&CommitProxyInterface::commit,
req,
TaskPriority::DefaultPromiseEndpoint,
AtMostOnce::True);
AtMostOnce::True,
&alternativeChosen);
}
state double grvTime = now();
choose {
@ -6479,6 +6489,12 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
ci.version,
req,
trState->tenant()));
if (trState->automaticIdempotency && alternativeChosen >= 0) {
// Automatic idempotency means we're responsible for best effort idempotency id clean up
proxiesUsed->getInterface(alternativeChosen)
.expireIdempotencyId.send(ExpireIdempotencyIdRequest{
ci.version, uint8_t(ci.txnBatchId >> 8), trState->getTenantInfo() });
}
return Void();
} else {
// clear the RYW transaction which contains previous conflicting keys
@ -6566,7 +6582,7 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
e.code() != error_code_grv_proxy_memory_limit_exceeded &&
e.code() != error_code_batch_transaction_throttled && e.code() != error_code_tag_throttled &&
e.code() != error_code_process_behind && e.code() != error_code_future_version &&
e.code() != error_code_tenant_not_found) {
e.code() != error_code_tenant_not_found && e.code() != error_code_proxy_tag_throttled) {
TraceEvent(SevError, "TryCommitError").error(e);
}
if (trState->trLogInfo)
@ -6964,11 +6980,16 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional<Strin
throw e;
}
tr.idempotencyId = IdempotencyIdRef(tr.arena, IdempotencyIdRef(value.get()));
trState->automaticIdempotency = false;
break;
case FDBTransactionOptions::AUTOMATIC_IDEMPOTENCY:
validateOptionValueNotPresent(value);
tr.idempotencyId = IdempotencyIdRef(
tr.arena, IdempotencyIdRef(BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned())));
if (!tr.idempotencyId.valid()) {
tr.idempotencyId = IdempotencyIdRef(
tr.arena,
IdempotencyIdRef(BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned())));
}
trState->automaticIdempotency = true;
break;
default:
@ -7007,6 +7028,8 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanContext parentSpa
&GrvProxyInterface::getConsistentReadVersion,
req,
cx->taskID))) {
CODE_PROBE(v.proxyTagThrottledDuration > 0.0,
"getConsistentReadVersion received GetReadVersionReply delayed by proxy tag throttling");
if (tags.size() != 0) {
auto& priorityThrottledTags = cx->throttledTags[priority];
for (auto& tag : tags) {
@ -7041,7 +7064,7 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanContext parentSpa
}
} catch (Error& e) {
if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled &&
e.code() != error_code_grv_proxy_memory_limit_exceeded)
e.code() != error_code_grv_proxy_memory_limit_exceeded && e.code() != error_code_proxy_tag_throttled)
TraceEvent(SevError, "GetConsistentReadVersionError").error(e);
if (e.code() == error_code_batch_transaction_throttled && !cx->apiVersionAtLeast(630)) {
wait(delayJittered(5.0));
@ -7492,7 +7515,7 @@ Future<Void> Transaction::onError(Error const& e) {
e.code() == error_code_database_locked || e.code() == error_code_commit_proxy_memory_limit_exceeded ||
e.code() == error_code_grv_proxy_memory_limit_exceeded || e.code() == error_code_process_behind ||
e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled ||
e.code() == error_code_blob_granule_request_failed) {
e.code() == error_code_blob_granule_request_failed || e.code() == error_code_proxy_tag_throttled) {
if (e.code() == error_code_not_committed)
++trState->cx->transactionsNotCommitted;
else if (e.code() == error_code_commit_unknown_result)
@ -7732,6 +7755,35 @@ ACTOR Future<Standalone<VectorRef<ReadHotRangeWithMetrics>>> getReadHotRanges(Da
}
}
// Query storage servers for aggregated StorageMetrics over `keys`, using shard locations
// the caller has already resolved.
//
// `min`/`max` are forwarded in the WaitMetricsRequest and bound the metrics band the
// storage servers wait on before replying (see WaitMetricsRequest for exact semantics);
// `permittedError` is the slack allowed when summing across multiple shards.
//
// Returns the metrics on success, or an empty Optional when the failure indicates a stale
// shard map (wrong_shard_server / all_alternatives_failed) — the caller is expected to
// invalidate its location cache and retry. All other errors are rethrown.
ACTOR Future<Optional<StorageMetrics>> waitStorageMetricsWithLocation(TenantInfo tenantInfo,
                                                                      KeyRange keys,
                                                                      std::vector<KeyRangeLocationInfo> locations,
                                                                      StorageMetrics min,
                                                                      StorageMetrics max,
                                                                      StorageMetrics permittedError) {
	try {
		Future<StorageMetrics> fx;
		if (locations.size() > 1) {
			// Range spans multiple shards: fan out one request per shard and combine.
			fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
		} else {
			// Single shard: load-balance the request across that shard's replicas.
			WaitMetricsRequest req(tenantInfo, keys, min, max);
			fx = loadBalance(locations[0].locations->locations(),
			                 &StorageServerInterface::waitMetrics,
			                 req,
			                 TaskPriority::DataDistribution);
		}
		StorageMetrics x = wait(fx);
		return x;
	} catch (Error& e) {
		// Every failure is noted at debug level; the same event is escalated to SevError
		// (and rethrown) only when it is not a retryable stale-location error.
		TraceEvent(SevDebug, "WaitStorageMetricsError").error(e);
		if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
			TraceEvent(SevError, "WaitStorageMetricsError").error(e);
			throw;
		}
	}
	// Stale-location error: signal "retry with fresh locations" to the caller.
	return Optional<StorageMetrics>();
}
ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
Database cx,
KeyRange keys,
@ -7761,38 +7813,26 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
}
// SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better
// solution to this.
if (locations.size() < shardLimit) {
try {
Future<StorageMetrics> fx;
if (locations.size() > 1) {
fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
} else {
WaitMetricsRequest req(tenantInfo, keys, min, max);
fx = loadBalance(locations[0].locations->locations(),
&StorageServerInterface::waitMetrics,
req,
TaskPriority::DataDistribution);
}
StorageMetrics x = wait(fx);
return std::make_pair(x, -1);
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
} else {
// solution to this. How could this happen?
if (locations.size() >= shardLimit) {
TraceEvent(SevWarn, "WaitStorageMetricsPenalty")
.detail("Keys", keys)
.detail("Limit", CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT)
.detail("Limit", shardLimit)
.detail("LocationSize", locations.size())
.detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY);
wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
// make sure that the next getKeyRangeLocations() call will actually re-fetch the range
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
continue;
}
Optional<StorageMetrics> res =
wait(waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
if (res.present()) {
return std::make_pair(res, -1);
}
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
}
@ -8653,6 +8693,56 @@ Future<Void> DatabaseContext::splitStorageMetricsStream(const PromiseStream<Key>
resultStream, Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated, minSplitBytes);
}
// Ask each shard in `locations` (in key order) for split points so that `keys` is divided
// into chunks bounded by `limit` (with `estimated` total metrics and an optional
// `minSplitBytes` floor). The returned vector starts with keys.begin and, when the last
// shard covers keys.end, ends with keys.end.
//
// Returns an empty Optional on stale-location errors (wrong_shard_server /
// all_alternatives_failed), in which case the caller should invalidate its location cache
// and retry; other errors are traced at SevError and rethrown.
ACTOR Future<Optional<Standalone<VectorRef<KeyRef>>>> splitStorageMetricsWithLocations(
    std::vector<KeyRangeLocationInfo> locations,
    KeyRange keys,
    StorageMetrics limit,
    StorageMetrics estimated,
    Optional<int> minSplitBytes) {
	// `used` accumulates the metrics consumed by earlier shards so each SplitMetricsRequest
	// knows how much of the current chunk's budget is already spent.
	state StorageMetrics used;
	state Standalone<VectorRef<KeyRef>> results;
	results.push_back_deep(results.arena(), keys.begin);
	//TraceEvent("SplitStorageMetrics").detail("Locations", locations.size());
	try {
		state int i = 0;
		for (; i < locations.size(); i++) {
			// The final shard is flagged (isLastShard == true) so the server can treat the
			// trailing remainder differently.
			SplitMetricsRequest req(
			    locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
			SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
			                                         &StorageServerInterface::splitMetrics,
			                                         req,
			                                         TaskPriority::DataDistribution));
			if (res.splits.size() && res.splits[0] <= results.back()) { // split points are out of order, possibly
				// because of moving data, throw error to retry
				ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing
				throw all_alternatives_failed();
			}
			if (res.splits.size()) {
				// Adopt the reply's split points; dependsOn keeps their memory alive in our arena.
				results.append(results.arena(), res.splits.begin(), res.splits.size());
				results.arena().dependsOn(res.splits.arena());
			}
			used = res.used;

			//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
		}

		// If the tail chunk is disproportionately small (an "unfair" split), drop the last
		// split point so the tail merges into the previous chunk.
		if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) && results.size() > 1) {
			results.resize(results.arena(), results.size() - 1);
		}

		// Close the result with keys.end when the shard list actually reaches it.
		if (keys.end <= locations.back().range.end) {
			results.push_back_deep(results.arena(), keys.end);
		}
		return results;
	} catch (Error& e) {
		if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
			TraceEvent(SevError, "SplitStorageMetricsError").error(e);
			throw;
		}
	}
	// Stale-location error: signal "retry with fresh locations" to the caller.
	return Optional<Standalone<VectorRef<KeyRef>>>();
}
ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
KeyRange keys,
StorageMetrics limit,
@ -8671,61 +8761,24 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
Optional<UID>(),
UseProvisionalProxies::False,
latestVersion));
state StorageMetrics used;
state Standalone<VectorRef<KeyRef>> results;
// SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better
// solution to this.
if (locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) {
wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
cx->invalidateCache(Key(), keys);
} else {
results.push_back_deep(results.arena(), keys.begin);
try {
//TraceEvent("SplitStorageMetrics").detail("Locations", locations.size());
state int i = 0;
for (; i < locations.size(); i++) {
SplitMetricsRequest req(
locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
&StorageServerInterface::splitMetrics,
req,
TaskPriority::DataDistribution));
if (res.splits.size() &&
res.splits[0] <= results.back()) { // split points are out of order, possibly because of
// moving data, throw error to retry
ASSERT_WE_THINK(
false); // FIXME: This seems impossible and doesn't seem to be covered by testing
throw all_alternatives_failed();
}
if (res.splits.size()) {
results.append(results.arena(), res.splits.begin(), res.splits.size());
results.arena().dependsOn(res.splits.arena());
}
used = res.used;
//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
}
if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) &&
results.size() > 1) {
results.resize(results.arena(), results.size() - 1);
}
if (keys.end <= locations.back().range.end) {
results.push_back_deep(results.arena(), keys.end);
}
return results;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "SplitStorageMetricsError").error(e);
throw;
}
cx->invalidateCache(Key(), keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
continue;
}
Optional<Standalone<VectorRef<KeyRef>>> results =
wait(splitStorageMetricsWithLocations(locations, keys, limit, estimated, minSplitBytes));
if (results.present()) {
return results.get();
}
cx->invalidateCache(Key(), keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
}
@ -9312,7 +9365,7 @@ void handleTSSChangeFeedMismatch(const ChangeFeedStreamRequest& request,
mismatchEvent.detail("EndKey", request.range.end);
mismatchEvent.detail("CanReadPopped", request.canReadPopped);
mismatchEvent.detail("PopVersion", popVersion);
mismatchEvent.detail("DebugUID", request.debugUID);
mismatchEvent.detail("DebugUID", request.id);
// mismatch info
mismatchEvent.detail("MatchesFound", matchesFound);
@ -9338,7 +9391,7 @@ void handleTSSChangeFeedMismatch(const ChangeFeedStreamRequest& request,
"TSSMismatchChangeFeedStream");
summaryEvent.detail("TSSID", tssData.tssId)
.detail("MismatchId", mismatchUID)
.detail("FeedDebugUID", request.debugUID);
.detail("FeedDebugUID", request.id);
}
}
}
@ -9863,7 +9916,8 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
Version* begin,
Version end,
int replyBufferSize,
bool canReadPopped) {
bool canReadPopped,
ReadOptions readOptions) {
state std::vector<Future<Void>> fetchers(interfs.size());
state std::vector<Future<Void>> onErrors(interfs.size());
state std::vector<MutationAndVersionStream> streams(interfs.size());
@ -9891,10 +9945,11 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
if (replyBufferSize != -1 && req.replyBufferSize < CLIENT_KNOBS->CHANGE_FEED_STREAM_MIN_BYTES) {
req.replyBufferSize = CLIENT_KNOBS->CHANGE_FEED_STREAM_MIN_BYTES;
}
req.debugUID = deterministicRandom()->randomUniqueID();
debugUIDs.push_back(req.debugUID);
mergeCursorUID =
UID(mergeCursorUID.first() ^ req.debugUID.first(), mergeCursorUID.second() ^ req.debugUID.second());
req.options = readOptions;
req.id = deterministicRandom()->randomUniqueID();
debugUIDs.push_back(req.id);
mergeCursorUID = UID(mergeCursorUID.first() ^ req.id.first(), mergeCursorUID.second() ^ req.id.second());
results->streams.push_back(interfs[i].first.changeFeedStream.getReplyStream(req));
maybeDuplicateTSSChangeFeedStream(req,
@ -10097,7 +10152,8 @@ ACTOR Future<Void> singleChangeFeedStream(Reference<DatabaseContext> db,
Version* begin,
Version end,
int replyBufferSize,
bool canReadPopped) {
bool canReadPopped,
ReadOptions readOptions) {
state Database cx(db);
state ChangeFeedStreamRequest req;
state Optional<ChangeFeedTSSValidationData> tssData;
@ -10107,10 +10163,11 @@ ACTOR Future<Void> singleChangeFeedStream(Reference<DatabaseContext> db,
req.range = range;
req.canReadPopped = canReadPopped;
req.replyBufferSize = replyBufferSize;
req.debugUID = deterministicRandom()->randomUniqueID();
req.options = readOptions;
req.id = deterministicRandom()->randomUniqueID();
if (DEBUG_CF_CLIENT_TRACE) {
TraceEvent(SevDebug, "TraceChangeFeedClientSingleCursor", req.debugUID)
TraceEvent(SevDebug, "TraceChangeFeedClientSingleCursor", req.id)
.detail("FeedID", rangeID)
.detail("Range", range)
.detail("Begin", *begin)
@ -10150,7 +10207,8 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
Version end,
KeyRange range,
int replyBufferSize,
bool canReadPopped) {
bool canReadPopped,
ReadOptions readOptions) {
state Database cx(db);
state Span span("NAPI:GetChangeFeedStream"_loc);
db->usedAnyChangeFeeds = true;
@ -10240,14 +10298,22 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
}
CODE_PROBE(true, "Change feed merge cursor");
// TODO (jslocum): validate connectionFileChanged behavior
wait(
mergeChangeFeedStream(db, interfs, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
cx->connectionFileChanged());
wait(mergeChangeFeedStream(
db, interfs, results, rangeID, &begin, end, replyBufferSize, canReadPopped, readOptions) ||
cx->connectionFileChanged());
} else {
CODE_PROBE(true, "Change feed single cursor");
StorageServerInterface interf = locations[0].locations->getInterface(chosenLocations[0]);
wait(singleChangeFeedStream(
db, interf, range, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
wait(singleChangeFeedStream(db,
interf,
range,
results,
rangeID,
&begin,
end,
replyBufferSize,
canReadPopped,
readOptions) ||
cx->connectionFileChanged());
}
} catch (Error& e) {
@ -10314,9 +10380,17 @@ Future<Void> DatabaseContext::getChangeFeedStream(Reference<ChangeFeedData> resu
Version end,
KeyRange range,
int replyBufferSize,
bool canReadPopped) {
return getChangeFeedStreamActor(
Reference<DatabaseContext>::addRef(this), results, rangeID, begin, end, range, replyBufferSize, canReadPopped);
bool canReadPopped,
ReadOptions readOptions) {
return getChangeFeedStreamActor(Reference<DatabaseContext>::addRef(this),
results,
rangeID,
begin,
end,
range,
replyBufferSize,
canReadPopped,
readOptions);
}
Version OverlappingChangeFeedsInfo::getFeedMetadataVersion(const KeyRangeRef& range) const {
@ -10548,6 +10622,76 @@ Reference<DatabaseContext::TransactionT> DatabaseContext::createTransaction() {
}
// BlobGranule API.
// Collect the key ranges currently marked active ("blobbified") in the blob range map that
// intersect `range`, returning at most `batchLimit` ranges.
//
// The map is read via krmGetRangesUnaligned over blobRangeKeys: each returned row is a
// boundary whose value describes the state of the range beginning at that key, so row i and
// row i+1 together delimit one range. The scan proceeds in batches, resuming from the last
// boundary, and retries transaction errors through tr->onError().
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Transaction* tr, KeyRange range, int batchLimit) {
	state Standalone<VectorRef<KeyRangeRef>> blobRanges;
	state Key beginKey = range.begin;
	loop {
		try {
			// Blob range metadata lives in the system keyspace; re-set after onError resets.
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);

			// 2 * batchLimit + 2 rows suffice for batchLimit ranges plus bracketing boundaries.
			state RangeResult results = wait(
			    krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));

			blobRanges.arena().dependsOn(results.arena());
			for (int i = 0; i < results.size() - 1; i++) {
				// Value blobRangeActive at row i means [results[i].key, results[i+1].key) is blobbified.
				if (results[i].value == blobRangeActive) {
					blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
				}
				if (blobRanges.size() == batchLimit) {
					return blobRanges;
				}
			}

			if (!results.more) {
				return blobRanges;
			}
			// More data: resume the next batch from the last boundary seen.
			beginKey = results.back().key;
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
}
// Resolve the blobbified (active blob) ranges intersecting `range`, optionally scoped to a
// tenant. When `tenantName` is present, the tenant's prefix is applied to the query range
// before the lookup and stripped from every returned range; any blob range not fully
// contained in the tenant's keyspace is dropped with a throttled trace event.
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobbifiedRanges(Transaction* tr,
                                                                     KeyRange range,
                                                                     int rangeLimit,
                                                                     Optional<TenantName> tenantName) {
	state TenantMapEntry tenantEntry;

	// Resolve the tenant entry (retrying via onError) and rebase the query range onto the
	// tenant's prefix before consulting the blob range map.
	loop {
		try {
			if (tenantName.present()) {
				wait(store(tenantEntry, blobGranuleGetTenantEntry(tr, range.begin, tenantName)));
				range = range.withPrefix(tenantEntry.prefix);
			}
			break;
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}

	state Standalone<VectorRef<KeyRangeRef>> rawRanges = wait(getBlobRanges(tr, range, rangeLimit));
	if (!tenantName.present()) {
		return rawRanges;
	}

	// Tenant-scoped request: report ranges relative to the tenant by removing its prefix.
	state Standalone<VectorRef<KeyRangeRef>> tenantScopedRanges;
	for (auto& br : rawRanges) {
		bool insideTenant = br.begin.startsWith(tenantEntry.prefix) && br.end.startsWith(tenantEntry.prefix);
		if (insideTenant) {
			tenantScopedRanges.push_back_deep(tenantScopedRanges.arena(), br.removePrefix(tenantEntry.prefix));
		} else {
			// A blob range straddling the tenant boundary is unexpected; skip it but record it.
			TraceEvent("ListBlobbifiedRangeSpansTenants")
			    .suppressFor(/*seconds=*/5)
			    .detail("Tenant", tenantName.get())
			    .detail("Range", br);
		}
	}
	return tenantScopedRanges;
}
ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
KeyRange range,
Version purgeVersion,
@ -10590,10 +10734,13 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
}
// must be aligned to blob range(s)
state Future<Optional<Value>> beginPresent = tr.get(purgeRange.begin.withPrefix(blobRangeKeys.begin));
state Future<Optional<Value>> endPresent = tr.get(purgeRange.end.withPrefix(blobRangeKeys.begin));
wait(success(beginPresent) && success(endPresent));
if (!beginPresent.get().present() || !endPresent.get().present()) {
state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedBegin =
getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.begin, purgeRange.begin), 2, {});
state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedEnd =
getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.end, purgeRange.end), 2, {});
wait(success(blobbifiedBegin) && success(blobbifiedEnd));
if ((!blobbifiedBegin.get().empty() && blobbifiedBegin.get().front().begin < purgeRange.begin) ||
(!blobbifiedEnd.get().empty() && blobbifiedEnd.get().back().end > purgeRange.end)) {
TraceEvent("UnalignedPurge")
.detail("Range", range)
.detail("Version", purgeVersion)
@ -10670,39 +10817,6 @@ Future<Void> DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) {
return waitPurgeGranulesCompleteActor(Reference<DatabaseContext>::addRef(this), purgeKey);
}
// ReadYourWritesTransaction overload: collect the key ranges marked active ("blobbified")
// in the blob range map that intersect `range`, returning at most `batchLimit` ranges.
//
// Reads blobRangeKeys via krmGetRangesUnaligned; each returned row is a boundary whose
// value describes the range starting at that key, so rows i and i+1 delimit one range.
// Scans in batches, resuming from the last boundary, retrying errors through tr->onError().
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Reference<ReadYourWritesTransaction> tr,
                                                               KeyRange range,
                                                               int batchLimit) {
	state Standalone<VectorRef<KeyRangeRef>> blobRanges;
	state Key beginKey = range.begin;
	loop {
		try {
			// Blob range metadata lives in the system keyspace; re-set after onError resets.
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);

			// 2 * batchLimit + 2 rows cover batchLimit ranges plus bracketing boundaries.
			state RangeResult results = wait(
			    krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));

			blobRanges.arena().dependsOn(results.arena());
			for (int i = 0; i < results.size() - 1; i++) {
				// Value blobRangeActive at row i means [results[i].key, results[i+1].key) is blobbified.
				if (results[i].value == blobRangeActive) {
					blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
				}
				if (blobRanges.size() == batchLimit) {
					return blobRanges;
				}
			}

			if (!results.more) {
				return blobRanges;
			}
			// More data: resume the next batch from the last boundary seen.
			beginKey = results.back().key;
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
}
ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
KeyRange range,
bool active,
@ -10724,7 +10838,7 @@ ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
range = range.withPrefix(tenantEntry.prefix);
}
Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(tr, range, 1));
Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(&tr->getTransaction(), range, 1));
if (active) {
// Idempotent request.
@ -10772,47 +10886,19 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRangesActor(Refer
KeyRange range,
int rangeLimit,
Optional<TenantName> tenantName) {
state Database db(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
state TenantMapEntry tme;
state Transaction tr(db);
loop {
try {
if (tenantName.present()) {
wait(store(tme, blobGranuleGetTenantEntry(&tr->getTransaction(), range.begin, tenantName)));
range = range.withPrefix(tme.prefix);
}
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
Standalone<VectorRef<KeyRangeRef>> blobbifiedRanges = wait(getBlobbifiedRanges(&tr, range, rangeLimit, tenantName));
state Standalone<VectorRef<KeyRangeRef>> blobRanges = wait(getBlobRanges(tr, range, rangeLimit));
if (!tenantName.present()) {
return blobRanges;
}
// Strip tenant prefix out.
state Standalone<VectorRef<KeyRangeRef>> tenantBlobRanges;
for (auto& blobRange : blobRanges) {
// Filter out blob ranges that span tenants for some reason.
if (!blobRange.begin.startsWith(tme.prefix) || !blobRange.end.startsWith(tme.prefix)) {
TraceEvent("ListBlobbifiedRangeSpansTenants")
.suppressFor(/*seconds=*/5)
.detail("Tenant", tenantName.get())
.detail("Range", blobRange);
continue;
}
tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tme.prefix));
}
return tenantBlobRanges;
return blobbifiedRanges;
}
Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges(KeyRange range,
int rowLimit,
int rangeLimit,
Optional<TenantName> tenantName) {
return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rowLimit, tenantName);
return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rangeLimit, tenantName);
}
int64_t getMaxKeySize(KeyRef const& key) {

View File

@ -422,10 +422,11 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Enable this knob only for experminatal purpose, never enable this in production.
// If enabled, all the committed in-memory memtable writes are lost on a crash.
init( ROCKSDB_DISABLE_WAL_EXPERIMENTAL, false );
// If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable ENABLE_CLEAR_RANGE_EAGER_READS knob.
// If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS knob.
// These knobs have contrary functionality.
init( ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE, false ); if( randomize && BUGGIFY ) ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE = deterministicRandom()->coinflip() ? false : true;
init( ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, 200000 ); // 200KB
init( ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip() ? false : true;
// Can commit will delay ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD seconds for
// ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD times, if rocksdb overloaded.
// Set ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD to 0, to disable
@ -727,8 +728,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL, 30.0 ); if(randomize && BUGGIFY) TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL = 1.0;
init( AUTO_TAG_THROTTLING_ENABLED, true ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLING_ENABLED = false;
init( SS_THROTTLE_TAGS_TRACKED, 1 ); if(randomize && BUGGIFY) SS_THROTTLE_TAGS_TRACKED = deterministicRandom()->randomInt(1, 10);
init( GLOBAL_TAG_THROTTLING, false );
init( ENFORCE_TAG_THROTTLING_ON_PROXIES, false );
init( GLOBAL_TAG_THROTTLING, false ); if(isSimulated) GLOBAL_TAG_THROTTLING = deterministicRandom()->coinflip();
init( ENFORCE_TAG_THROTTLING_ON_PROXIES, GLOBAL_TAG_THROTTLING );
init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 );
init( GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 );
init( GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED, 10 );
@ -761,7 +762,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( FETCH_KEYS_LOWER_PRIORITY, 0 );
init( SERVE_FETCH_CHECKPOINT_PARALLELISM, 4 );
init( SERVE_AUDIT_STORAGE_PARALLELISM, 2 );
init( CHANGE_FEED_DISK_READS_PARALLELISM, 1000 ); if( randomize && BUGGIFY ) CHANGE_FEED_DISK_READS_PARALLELISM = 20;
init( BUGGIFY_BLOCK_BYTES, 10000 );
init( STORAGE_RECOVERY_VERSION_LAG_LIMIT, 2 * MAX_READ_TRANSACTION_LIFE_VERSIONS );
init( STORAGE_COMMIT_BYTES, 10000000 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_BYTES = 2000000;
@ -800,6 +800,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( QUICK_GET_KEY_VALUES_LIMIT, 2000 );
init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 );
init( STORAGE_FEED_QUERY_HARD_LIMIT, 100000 );
init( STORAGE_SERVER_READ_CONCURRENCY, 70 );
// Priorities which each ReadType maps to, in enumeration order
init( STORAGESERVER_READ_RANKS, "0,2,1,1,1" );
init( STORAGESERVER_READ_PRIORITIES, "48,32,8" );
//Wait Failure
init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;
@ -911,7 +915,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_DEFAULT_EXTENT_SIZE, 32 * 1024 * 1024 );
init( REDWOOD_DEFAULT_EXTENT_READ_SIZE, 1024 * 1024 );
init( REDWOOD_EXTENT_CONCURRENT_READS, 4 );
init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 );
init( REDWOOD_KVSTORE_RANGE_PREFETCH, true );
init( REDWOOD_PAGE_REBUILD_MAX_SLACK, 0.33 );
init( REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES, 10 );
@ -924,6 +927,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 );
init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; }
init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); }
init( REDWOOD_PRIORITY_LAUNCHS, "32,32,32,32" );
init( REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT, false );
// Server request latency measurement
@ -966,6 +970,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BG_CONSISTENCY_CHECK_ENABLED, true ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_ENABLED = false;
init( BG_CONSISTENCY_CHECK_TARGET_SPEED_KB, 1000 ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_TARGET_SPEED_KB *= (deterministicRandom()->randomInt(2, 50) / 10);
init( BG_KEY_TUPLE_TRUNCATE_OFFSET, 0 );
init( BG_ENABLE_READ_DRIVEN_COMPACTION, true ); if (randomize && BUGGIFY) BG_ENABLE_READ_DRIVEN_COMPACTION = false;
init( BG_RDC_BYTES_FACTOR, 2 ); if (randomize && BUGGIFY) BG_RDC_BYTES_FACTOR = deterministicRandom()->randomInt(1, 10);
init( BG_RDC_READ_FACTOR, 3 ); if (randomize && BUGGIFY) BG_RDC_READ_FACTOR = deterministicRandom()->randomInt(1, 10);
init( BG_ENABLE_MERGING, true ); if (randomize && BUGGIFY) BG_ENABLE_MERGING = false;
init( BG_MERGE_CANDIDATE_THRESHOLD_SECONDS, isSimulated ? 20.0 : 30 * 60 ); if (randomize && BUGGIFY) BG_MERGE_CANDIDATE_THRESHOLD_SECONDS = 5.0;
@ -974,6 +981,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1;
init( BLOB_WORKER_RESNAPSHOT_PARALLELISM, 40 ); if( randomize && BUGGIFY ) BLOB_WORKER_RESNAPSHOT_PARALLELISM = deterministicRandom()->randomInt(1, 10);
init( BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM, 2000 ); if( randomize && BUGGIFY ) BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM = deterministicRandom()->randomInt(10, 100);
init( BLOB_WORKER_RDC_PARALLELISM, 2 ); if( randomize && BUGGIFY ) BLOB_WORKER_RDC_PARALLELISM = deterministicRandom()->randomInt(1, 6);
init( BLOB_WORKER_TIMEOUT, 10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0;
init( BLOB_WORKER_REQUEST_TIMEOUT, 5.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_REQUEST_TIMEOUT = 1.0;
init( BLOB_WORKERLIST_FETCH_INTERVAL, 1.0 );
@ -996,8 +1005,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Blob Metadata
init( BLOB_METADATA_CACHE_TTL, isSimulated ? 120 : 24 * 60 * 60 );
if ( randomize && BUGGIFY) { BLOB_METADATA_CACHE_TTL = deterministicRandom()->randomInt(50, 100); }
init( BLOB_METADATA_REFRESH_INTERVAL, isSimulated ? 60 : 60 * 60 );
if ( randomize && BUGGIFY) { BLOB_METADATA_REFRESH_INTERVAL = deterministicRandom()->randomInt(5, 120); }
// HTTP KMS Connector
init( REST_KMS_CONNECTOR_KMS_DISCOVERY_URL_MODE, "file");
@ -1018,6 +1025,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// NOTE: 'token-name" can NOT contain '#' character
init( REST_KMS_CONNECTOR_VALIDATION_TOKEN_DETAILS, "");
// Drop in-memory state associated with an idempotency id after this many seconds. Once dropped, this id cannot be
// expired proactively, but will eventually get cleaned up by the idempotency id cleaner.
init( IDEMPOTENCY_ID_IN_MEMORY_LIFETIME, 10);
// clang-format on
if (clientKnobs) {

View File

@ -284,8 +284,6 @@ const KeyRangeRef readConflictRangeKeysRange =
const KeyRangeRef writeConflictRangeKeysRange = KeyRangeRef("\xff\xff/transaction/write_conflict_range/"_sr,
"\xff\xff/transaction/write_conflict_range/\xff\xff"_sr);
const KeyRef clusterIdKey = "\xff/clusterId"_sr;
const KeyRangeRef auditRange = KeyRangeRef("\xff/audit/"_sr, "\xff/audit0"_sr);
const KeyRef auditPrefix = auditRange.begin;
@ -1074,6 +1072,11 @@ const KeyRangeRef timeKeeperPrefixRange("\xff\x02/timeKeeper/map/"_sr, "\xff\x02
const KeyRef timeKeeperVersionKey = "\xff\x02/timeKeeper/version"_sr;
const KeyRef timeKeeperDisableKey = "\xff\x02/timeKeeper/disable"_sr;
// Durable cluster ID key. Added "Key" to the end to differentiate from the key
// "\xff/clusterId" which was stored in the txnStateStore in FDB 7.1, whereas
// this key is stored in the database in 7.2+.
const KeyRef clusterIdKey = "\xff/clusterIdKey"_sr;
// Backup Log Mutation constant variables
const KeyRef backupEnabledKey = "\xff/backupEnabled"_sr;
const KeyRangeRef backupLogKeys("\xff\x02/blog/"_sr, "\xff\x02/blog0"_sr);
@ -1810,4 +1813,4 @@ TEST_CASE("noSim/SystemData/compat/KeyServers") {
printf("ssi serdes test complete\n");
return Void();
}
}

View File

@ -579,8 +579,8 @@ public:
int maxConcurrentTasks) {
state Reference<AsyncVar<bool>> paused = makeReference<AsyncVar<bool>>(true);
state Future<Void> watchPausedFuture = watchPaused(cx, taskBucket, paused);
taskBucket->metricLogger = traceCounters(
"TaskBucketMetrics", taskBucket->dbgid, CLIENT_KNOBS->TASKBUCKET_LOGGING_DELAY, &taskBucket->cc);
taskBucket->metricLogger = taskBucket->cc.traceCounters(
"TaskBucketMetrics", taskBucket->dbgid, CLIENT_KNOBS->TASKBUCKET_LOGGING_DELAY);
loop {
while (paused->get()) {
wait(paused->onChange() || watchPausedFuture);

View File

@ -196,6 +196,7 @@ public:
Key addPrefix = Key(),
Key removePrefix = Key(),
LockDB = LockDB::True,
UnlockDB = UnlockDB::True,
OnlyApplyMutationLogs = OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly = InconsistentSnapshotOnly::False,
Version beginVersion = ::invalidVersion,

View File

@ -91,4 +91,8 @@ struct BlobMetadataDetailsRef {
}
};
Standalone<BlobMetadataDetailsRef> createRandomTestBlobMetadata(const std::string& baseUrl,
BlobMetadataDomainId domainId,
BlobMetadataDomainName domainName);
#endif

View File

@ -45,6 +45,7 @@ struct BlobWorkerStats {
Counter compressionBytesFinal;
Counter fullRejections;
Counter forceFlushCleanups;
Counter readDrivenCompactions;
int numRangesAssigned;
int mutationBytesBuffered;
@ -83,10 +84,11 @@ struct BlobWorkerStats {
readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc),
flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc),
compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc),
forceFlushCleanups("ForceFlushCleanups", cc), numRangesAssigned(0), mutationBytesBuffered(0),
activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), cfVersionLag(0),
notAtLatestChangeFeeds(0), lastResidentMemory(0), estimatedMaxResidentMemory(0),
initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), deltaWritesLock(deltaWritesLock) {
forceFlushCleanups("ForceFlushCleanups", cc), readDrivenCompactions("ReadDrivenCompactions", cc),
numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0),
minimumCFVersion(0), cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0),
estimatedMaxResidentMemory(0), initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock),
deltaWritesLock(deltaWritesLock) {
specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; });
specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; });
specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; });
@ -103,8 +105,8 @@ struct BlobWorkerStats {
specialCounter(cc, "DeltaFileWritesActive", [this]() { return this->deltaWritesLock->activePermits(); });
specialCounter(cc, "DeltaFileWritesWaiting", [this]() { return this->deltaWritesLock->waiters(); });
logger = traceCounters("BlobWorkerMetrics", id, interval, &cc, "BlobWorkerMetrics");
logger = cc.traceCounters("BlobWorkerMetrics", id, interval, "BlobWorkerMetrics");
}
};
#endif
#endif

View File

@ -22,7 +22,7 @@
#define FDBCLIENT_BUILD_IDEMPOTENCY_ID_MUTATIONS_H
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#pragma once

View File

@ -264,6 +264,8 @@ public:
int64_t READ_COST_BYTE_FACTOR; // Used to round up the cost of read operations
// Cost multiplier for writes (because write operations are more expensive than reads):
double GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO;
double PROXY_MAX_TAG_THROTTLE_DURATION; // Maximum duration that a transaction can be tag throttled by proxy before
// being rejected
// busyness reporting
double BUSYNESS_SPIKE_START_THRESHOLD;
@ -272,6 +274,7 @@ public:
// Blob Granules
int BG_MAX_GRANULE_PARALLELISM;
int BG_TOO_MANY_GRANULES;
int64_t BLOB_METADATA_REFRESH_INTERVAL;
// The coordinator key/value in storage server might be inconsistent to the value stored in the cluster file.
// This might happen when a recovery is happening together with a cluster controller coordinator key change.

View File

@ -30,7 +30,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/GlobalConfig.h"
#include "fdbclient/GrvProxyInterface.h"
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbclient/VersionVector.h"
@ -61,6 +61,7 @@ struct CommitProxyInterface {
RequestStream<struct ProxySnapRequest> proxySnapReq;
RequestStream<struct ExclusionSafetyCheckRequest> exclusionSafetyCheckReq;
RequestStream<struct GetDDMetricsRequest> getDDMetrics;
PublicRequestStream<struct ExpireIdempotencyIdRequest> expireIdempotencyId;
UID id() const { return commit.getEndpoint().token; }
std::string toString() const { return id().shortString(); }
@ -87,6 +88,8 @@ struct CommitProxyInterface {
exclusionSafetyCheckReq =
RequestStream<struct ExclusionSafetyCheckRequest>(commit.getEndpoint().getAdjustedEndpoint(8));
getDDMetrics = RequestStream<struct GetDDMetricsRequest>(commit.getEndpoint().getAdjustedEndpoint(9));
expireIdempotencyId =
PublicRequestStream<struct ExpireIdempotencyIdRequest>(commit.getEndpoint().getAdjustedEndpoint(10));
}
}
@ -103,6 +106,7 @@ struct CommitProxyInterface {
streams.push_back(proxySnapReq.getReceiver());
streams.push_back(exclusionSafetyCheckReq.getReceiver());
streams.push_back(getDDMetrics.getReceiver());
streams.push_back(expireIdempotencyId.getReceiver());
FlowTransport::transport().addEndpoints(streams);
}
};
@ -151,6 +155,24 @@ struct ClientDBInfo {
}
};
// Request sent to expire (clean up) the idempotency id recorded for a committed batch.
// Identified by the commit version plus the high byte of the batch index within that version.
struct ExpireIdempotencyIdRequest {
constexpr static FileIdentifier file_identifier = 1900933;
// Commit version whose idempotency id entry should be expired; invalidVersion until set.
Version commitVersion = invalidVersion;
// High-order byte of the batch index at commitVersion (ids are grouped by this byte).
uint8_t batchIndexHighByte = 0;
// Tenant scoping for the request; used by verify() for authorization.
TenantInfo tenant;
ExpireIdempotencyIdRequest() {}
ExpireIdempotencyIdRequest(Version commitVersion, uint8_t batchIndexHighByte, TenantInfo tenant)
: commitVersion(commitVersion), batchIndexHighByte(batchIndexHighByte), tenant(tenant) {}
// Authorization check invoked by the request stream: only authorized tenants may expire ids.
bool verify() const { return tenant.isAuthorized(); }
template <class Ar>
void serialize(Ar& ar) {
// NOTE: field order is part of the wire format — do not reorder.
serializer(ar, commitVersion, batchIndexHighByte, tenant);
}
};
struct CommitID {
constexpr static FileIdentifier file_identifier = 14254927;
Version version; // returns invalidVersion if transaction conflicts

View File

@ -382,7 +382,8 @@ public:
Version end = std::numeric_limits<Version>::max(),
KeyRange range = allKeys,
int replyBufferSize = -1,
bool canReadPopped = true);
bool canReadPopped = true,
ReadOptions readOptions = { ReadType::NORMAL, CacheResult::False });
Future<OverlappingChangeFeedsInfo> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
Future<Void> popChangeFeedMutations(Key rangeID, Version version);

View File

@ -590,6 +590,8 @@ inline KeyRange prefixRange(KeyRef prefix) {
// The returned reference is valid as long as keys is valid.
KeyRef keyBetween(const KeyRangeRef& keys);
KeyRangeRef toPrefixRelativeRange(KeyRangeRef range, KeyRef prefix);
struct KeySelectorRef {
private:
KeyRef key; // Find the last item less than key
@ -1633,13 +1635,7 @@ struct StorageWiggleValue {
}
};
enum class ReadType {
EAGER,
FETCH,
LOW,
NORMAL,
HIGH,
};
enum class ReadType { EAGER = 0, FETCH = 1, LOW = 2, NORMAL = 3, HIGH = 4, MIN = EAGER, MAX = HIGH };
FDB_DECLARE_BOOLEAN_PARAM(CacheResult);
@ -1655,14 +1651,14 @@ struct ReadOptions {
Optional<UID> debugID;
Optional<Version> consistencyCheckStartVersion;
ReadOptions() : type(ReadType::NORMAL), cacheResult(CacheResult::True){};
ReadOptions(Optional<UID> debugID,
ReadOptions(Optional<UID> debugID = Optional<UID>(),
ReadType type = ReadType::NORMAL,
CacheResult cache = CacheResult::False,
CacheResult cache = CacheResult::True,
Optional<Version> version = Optional<Version>())
: type(type), cacheResult(cache), debugID(debugID), consistencyCheckStartVersion(version){};
ReadOptions(ReadType type, CacheResult cache = CacheResult::True) : ReadOptions({}, type, cache) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, type, cacheResult, debugID, consistencyCheckStartVersion);

View File

@ -1,5 +1,5 @@
/*
* IdempotencyId.h
* IdempotencyId.actor.h
*
* This source file is part of the FoundationDB open source project
*
@ -18,8 +18,13 @@
* limitations under the License.
*/
#ifndef FDBCLIENT_IDEMPOTENCYID_H
#define FDBCLIENT_IDEMPOTENCYID_H
// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source
// version.
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_IDEMPOTENCY_ID_ACTOR_G_H)
#define FDBCLIENT_IDEMPOTENCY_ID_ACTOR_G_H
#include "fdbclient/IdempotencyId.actor.g.h"
#elif !defined(FDBCLIENT_IDEMPOTENCY_ID_ACTOR_H)
#define FDBCLIENT_IDEMPOTENCY_ID_ACTOR_H
#pragma once
@ -28,12 +33,24 @@
#include "flow/Arena.h"
#include "flow/IRandom.h"
#include "flow/serialize.h"
#include "flow/actorcompiler.h" // this has to be the last include
// Location of a committed transaction: the commit version and the index of its
// batch within that version. Returned when an idempotency id lookup succeeds.
struct CommitResult {
Version commitVersion;
uint16_t batchIndex;
};
// The type of the value stored at the key |idempotencyIdsExpiredVersion|:
// the highest commit version whose idempotency ids have been expired (cleaned up).
struct IdempotencyIdsExpiredVersion {
static constexpr auto file_identifier = 3746945;
// Versions <= this have had their idempotency id entries removed. Defaults to 0 (nothing expired).
Version expired = 0;
template <class Archive>
void serialize(Archive& ar) {
serializer(ar, expired);
}
};
// See design/idempotency_ids.md for more information. Designed so that the common case of a random 16 byte id does not
// usually require indirection. Either invalid or an id with length >= 16 and < 256.
struct IdempotencyIdRef {
@ -163,4 +180,10 @@ private:
// Check if id is present in kv, and if so return the commit version and batchIndex
Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const IdempotencyIdRef& id);
#endif
// Make a range containing only the idempotency key associated with version and highOrderBatchIndex
KeyRangeRef makeIdempotencySingleKeyRange(Arena& arena, Version version, uint8_t highOrderBatchIndex);
void decodeIdempotencyKey(KeyRef key, Version& commitVersion, uint8_t& highOrderBatchIndex);
#include "flow/unactorcompiler.h"
#endif

View File

@ -0,0 +1,48 @@
/*
* KeyLocationService.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FOUNDATIONDB_KEYLOCATIONSERVICE_H
#define FOUNDATIONDB_KEYLOCATIONSERVICE_H
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/DatabaseContext.h"
// Abstract interface for resolving which storage shard(s) serve a key or key range.
// NOTE(review): both members are private by default (no access specifier before the
// pure-virtual declarations) — confirm whether `public:` was intended for implementers/callers.
class IKeyLocationService {
// If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key).
// Otherwise returns the shard containing key. It's possible the returned location is a failed interface.
virtual Future<KeyRangeLocationInfo> getKeyLocation(TenantInfo tenant,
Key key,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Reverse isBackward,
Version version) = 0;
// Returns locations for up to `limit` shards overlapping `keys`, walking the range in
// the direction given by `reverse`. Locations may refer to failed interfaces.
virtual Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations(TenantInfo tenant,
KeyRange keys,
int limit,
Reverse reverse,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Version version) = 0;
};
#endif // FOUNDATIONDB_KEYLOCATIONSERVICE_H

View File

@ -1027,6 +1027,7 @@ public:
ThreadFuture<Void> protocolVersionMonitor;
Future<Void> sharedStateUpdater;
bool isConfigDB;
// Versions older than 6.1 do not benefit from having their database connections closed. Additionally,
// there are various issues that result in negative behavior in some cases if the connections are closed.

View File

@ -271,6 +271,8 @@ struct TransactionState : ReferenceCounted<TransactionState> {
// prefix/<key2> : '0' - any keys equal or larger than this key are (definitely) not conflicting keys
std::shared_ptr<CoalescedKeyRangeMap<Value>> conflictingKeys;
bool automaticIdempotency = false;
// Only available so that Transaction can have a default constructor, for use in state variables
TransactionState(TaskPriority taskID, SpanContext spanContext)
: taskID(taskID), spanContext(spanContext), tenantSet(false) {}
@ -487,6 +489,7 @@ public:
Database getDatabase() const { return trState->cx; }
static Reference<TransactionLogInfo> createTrLogInfoProbabilistically(const Database& cx);
Transaction& getTransaction() { return *this; }
void setTransactionID(UID id);
void setToken(uint64_t token);
@ -603,6 +606,26 @@ int64_t getMaxWriteKeySize(KeyRef const& key, bool hasRawAccess);
// Returns the maximum legal size of a key that can be cleared. Keys larger than this will be assumed not to exist.
int64_t getMaxClearKeySize(KeyRef const& key);
struct KeyRangeLocationInfo;
// Return the aggregated StorageMetrics of range keys to the caller. The locations tell which interface should
// serve the request. The final result is within (min-permittedError/2, max + permittedError/2) if valid.
ACTOR Future<Optional<StorageMetrics>> waitStorageMetricsWithLocation(TenantInfo tenantInfo,
KeyRange keys,
std::vector<KeyRangeLocationInfo> locations,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError);
// Return the suggested split points from storage server.The locations tell which interface should
// serve the request. `limit` is the current estimated storage metrics of `keys`.The returned points, if present,
// guarantee the metrics of split result is within limit.
ACTOR Future<Optional<Standalone<VectorRef<KeyRef>>>> splitStorageMetricsWithLocations(
std::vector<KeyRangeLocationInfo> locations,
KeyRange keys,
StorageMetrics limit,
StorageMetrics estimated,
Optional<int> minSplitBytes);
namespace NativeAPI {
ACTOR Future<std::vector<std::pair<StorageServerInterface, ProcessClass>>> getServerListAndProcessClasses(
Transaction* tr);

View File

@ -349,6 +349,7 @@ public:
bool ROCKSDB_DISABLE_WAL_EXPERIMENTAL;
bool ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE;
int64_t ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT;
bool ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS;
int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE;
int64_t ROCKSDB_BLOCK_SIZE;
bool ENABLE_SHARDED_ROCKSDB;
@ -715,7 +716,6 @@ public:
int FETCH_KEYS_LOWER_PRIORITY;
int SERVE_FETCH_CHECKPOINT_PARALLELISM;
int SERVE_AUDIT_STORAGE_PARALLELISM;
int CHANGE_FEED_DISK_READS_PARALLELISM;
int BUGGIFY_BLOCK_BYTES;
int64_t STORAGE_RECOVERY_VERSION_LAG_LIMIT;
double STORAGE_DURABILITY_LAG_REJECT_THRESHOLD;
@ -754,6 +754,9 @@ public:
int QUICK_GET_KEY_VALUES_LIMIT;
int QUICK_GET_KEY_VALUES_LIMIT_BYTES;
int STORAGE_FEED_QUERY_HARD_LIMIT;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READ_RANKS;
std::string STORAGESERVER_READ_PRIORITIES;
// Wait Failure
int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;
@ -883,7 +886,6 @@ public:
int REDWOOD_DEFAULT_EXTENT_SIZE; // Extent size for new Redwood files
int REDWOOD_DEFAULT_EXTENT_READ_SIZE; // Extent read size for Redwood files
int REDWOOD_EXTENT_CONCURRENT_READS; // Max number of simultaneous extent disk reads in progress.
int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress.
bool REDWOOD_KVSTORE_RANGE_PREFETCH; // Whether to use range read prefetching
double REDWOOD_PAGE_REBUILD_MAX_SLACK; // When rebuilding pages, max slack to allow in page
int REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES; // Number of pages to try to pop from the lazy delete queue and process at
@ -903,6 +905,8 @@ public:
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches
bool REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT; // Whether to split pages by tenant if encryption is enabled
std::string REDWOOD_PRIORITY_LAUNCHS;
// Server request latency measurement
int LATENCY_SAMPLE_SIZE;
double LATENCY_METRICS_LOGGING_INTERVAL;
@ -947,10 +951,14 @@ public:
int BG_MERGE_CANDIDATE_THRESHOLD_SECONDS;
int BG_MERGE_CANDIDATE_DELAY_SECONDS;
int BG_KEY_TUPLE_TRUNCATE_OFFSET;
bool BG_ENABLE_READ_DRIVEN_COMPACTION;
int BG_RDC_BYTES_FACTOR;
int BG_RDC_READ_FACTOR;
int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM;
int BLOB_WORKER_RESNAPSHOT_PARALLELISM;
int BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM;
int BLOB_WORKER_RDC_PARALLELISM;
double BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure
double BLOB_WORKER_REQUEST_TIMEOUT; // Blob Worker's server-side request timeout
@ -972,7 +980,6 @@ public:
// Blob metadata
int64_t BLOB_METADATA_CACHE_TTL;
int64_t BLOB_METADATA_REFRESH_INTERVAL;
// HTTP KMS Connector
std::string REST_KMS_CONNECTOR_KMS_DISCOVERY_URL_MODE;
@ -986,6 +993,9 @@ public:
std::string REST_KMS_CONNECTOR_GET_ENCRYPTION_KEYS_ENDPOINT;
std::string REST_KMS_CONNECTOR_GET_BLOB_METADATA_ENDPOINT;
// Idempotency ids
double IDEMPOTENCY_ID_IN_MEMORY_LIFETIME;
ServerKnobs(Randomize, ClientKnobs*, IsSimulated);
void initialize(Randomize, ClientKnobs*, IsSimulated);
};

View File

@ -45,7 +45,7 @@ struct CheckpointMetaData {
constexpr static FileIdentifier file_identifier = 13804342;
Version version;
KeyRange range;
std::vector<KeyRange> ranges;
int16_t format; // CheckpointFormat.
UID ssID; // Storage server ID on which this checkpoint is created.
UID checkpointID; // A unique id for this checkpoint.
@ -58,11 +58,15 @@ struct CheckpointMetaData {
CheckpointMetaData() = default;
CheckpointMetaData(KeyRange const& range, CheckpointFormat format, UID const& ssID, UID const& checkpointID)
: version(invalidVersion), range(range), format(format), ssID(ssID), checkpointID(checkpointID), state(Pending),
referenceCount(0), gcTime(0) {}
: version(invalidVersion), format(format), ssID(ssID), checkpointID(checkpointID), state(Pending),
referenceCount(0), gcTime(0) {
this->ranges.push_back(range);
}
CheckpointMetaData(Version version, KeyRange const& range, CheckpointFormat format, UID checkpointID)
: version(version), range(range), format(format), ssID(UID()), checkpointID(checkpointID), state(Pending),
referenceCount(0), gcTime(0) {}
: version(version), format(format), ssID(UID()), checkpointID(checkpointID), state(Pending), referenceCount(0),
gcTime(0) {
this->ranges.push_back(range);
}
CheckpointState getState() const { return static_cast<CheckpointState>(state); }
@ -73,7 +77,7 @@ struct CheckpointMetaData {
void setFormat(CheckpointFormat format) { this->format = static_cast<int16_t>(format); }
std::string toString() const {
std::string res = "Checkpoint MetaData:\nRange: " + range.toString() + "\nVersion: " + std::to_string(version) +
std::string res = "Checkpoint MetaData:\nRange: " + describe(ranges) + "\nVersion: " + std::to_string(version) +
"\nFormat: " + std::to_string(format) + "\nServer: " + ssID.toString() +
"\nID: " + checkpointID.toString() + "\nState: " + std::to_string(static_cast<int>(state)) +
"\n";
@ -82,7 +86,7 @@ struct CheckpointMetaData {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, range, format, state, checkpointID, ssID, gcTime, serializedCheckpoint);
serializer(ar, version, ranges, format, state, checkpointID, ssID, gcTime, serializedCheckpoint);
}
};
@ -99,23 +103,28 @@ struct DataMoveMetaData {
constexpr static FileIdentifier file_identifier = 13804362;
UID id; // A unique id for this data move.
Version version;
KeyRange range;
std::vector<KeyRange> ranges;
int priority;
std::set<UID> src;
std::set<UID> dest;
std::set<UID> checkpoints;
int16_t phase; // DataMoveMetaData::Phase.
int8_t mode;
DataMoveMetaData() = default;
DataMoveMetaData(UID id, Version version, KeyRange range)
: id(id), version(version), range(std::move(range)), priority(0) {}
DataMoveMetaData(UID id, KeyRange range) : id(id), version(invalidVersion), range(std::move(range)), priority(0) {}
DataMoveMetaData(UID id, Version version, KeyRange range) : id(id), version(version), priority(0), mode(0) {
this->ranges.push_back(range);
}
DataMoveMetaData(UID id, KeyRange range) : id(id), version(invalidVersion), priority(0), mode(0) {
this->ranges.push_back(range);
}
Phase getPhase() const { return static_cast<Phase>(phase); }
void setPhase(Phase phase) { this->phase = static_cast<int16_t>(phase); }
std::string toString() const {
std::string res = "DataMoveMetaData: [ID]: " + id.shortString() + " [Range]: " + range.toString() +
std::string res = "DataMoveMetaData: [ID]: " + id.shortString() + " [Range]: " + describe(ranges) +
" [Phase]: " + std::to_string(static_cast<int>(phase)) +
" [Source Servers]: " + describe(src) + " [Destination Servers]: " + describe(dest);
return res;
@ -123,7 +132,7 @@ struct DataMoveMetaData {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, id, version, range, phase, src, dest);
serializer(ar, id, version, ranges, priority, src, dest, checkpoints, phase, mode);
}
};

View File

@ -890,16 +890,16 @@ struct ChangeFeedStreamRequest {
KeyRange range;
int replyBufferSize = -1;
bool canReadPopped = true;
UID debugUID; // This is only used for debugging and tracing, but being able to link a client + server side stream
// is so useful for testing, and this is such small overhead compared to streaming large amounts of
// change feed data, it is left in the interface
UID id; // This must be globally unique among ChangeFeedStreamRequest instances
Optional<ReadOptions> options;
ReplyPromiseStream<ChangeFeedStreamReply> reply;
ChangeFeedStreamRequest() {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, rangeID, begin, end, range, reply, spanContext, replyBufferSize, canReadPopped, debugUID, arena);
serializer(
ar, rangeID, begin, end, range, reply, spanContext, replyBufferSize, canReadPopped, id, options, arena);
}
};

View File

@ -92,8 +92,6 @@ void decodeKeyServersValue(RangeResult result,
UID& destID,
bool missingIsError = true);
extern const KeyRef clusterIdKey;
extern const KeyRangeRef auditRange;
extern const KeyRef auditPrefix;
const Key auditRangeKey(const AuditType type, const UID& auditId, const KeyRef& key);
@ -505,6 +503,9 @@ extern const KeyRangeRef timeKeeperPrefixRange;
extern const KeyRef timeKeeperVersionKey;
extern const KeyRef timeKeeperDisableKey;
// Durable cluster ID key
extern const KeyRef clusterIdKey;
// Layer status metadata prefix
extern const KeyRangeRef layerStatusMetaPrefixRange;

View File

@ -68,6 +68,10 @@ using TenantEntryCachePayloadFunc = std::function<TenantEntryCachePayload<T>(con
// 1. Lookup by 'TenantId'
// 2. Lookup by 'TenantPrefix'
// 3. Lookup by 'TenantName'
// TODO: Currently this cache performs poorly if there are tenant access happening to unknown tenants which happens most
// frequently in optional tenant mode but can also happen in required mode if there are alot of tenants created. Further
// as a consequence of the design we cannot be sure that the state of a given tenant is accurate even if its present in
// the cache.
template <class T>
class TenantEntryCache : public ReferenceCounted<TenantEntryCache<T>>, NonCopyable {

View File

@ -273,17 +273,4 @@ struct ITracer {
virtual void trace(Span const& span) = 0;
};
void openTracer(TracerType type);
// A Deque that carries a tracing Span alongside its elements, so queued work
// stays associated with the trace context that produced it. Move-only: copy
// construction/assignment are deleted to keep span ownership unique.
template <class T>
struct SpannedDeque : Deque<T> {
	Span span;
	explicit SpannedDeque(Location loc) : span(loc) {}
	SpannedDeque(SpannedDeque&& other) : Deque<T>(std::move(other)), span(std::move(other.span)) {}
	SpannedDeque(SpannedDeque const&) = delete;
	SpannedDeque& operator=(SpannedDeque const&) = delete;
	SpannedDeque& operator=(SpannedDeque&& other) {
		*static_cast<Deque<T>*>(this) = std::move(other);
		span = std::move(other.span);
		// BUG FIX: `return *this;` was missing — flowing off the end of a
		// value-returning function is undefined behavior in C++.
		return *this;
	}
};
void openTracer(TracerType type);

View File

@ -279,7 +279,7 @@ description is not currently required but encouraged.
description="Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit." />
<Option name="idempotency_id" code="504"
paramType="String" paramDescription="Unique ID"
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. This feature is in development and not ready for general use."
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. This feature is in development and not ready for general use. Unless the automatic_idempotency option is set after this option, the client will not automatically attempt to remove this id from the cluster after a successful commit."
hidden="true" />
<Option name="automatic_idempotency" code="505"
description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future. This feature is in development and not ready for general use."

View File

@ -5,9 +5,8 @@ get_target_property(fdbclient_target_includes fdbclient INCLUDE_DIRECTORIES)
target_link_libraries(fdbmonitor PUBLIC SimpleOpt)
target_include_directories(fdbmonitor PUBLIC "${fdbclient_target_includes}")
strip_debug_symbols(fdbmonitor)
assert_no_version_h(fdbmonitor)
if(UNIX AND NOT APPLE)
target_link_libraries(fdbmonitor PRIVATE rt)
target_link_libraries(fdbmonitor PRIVATE rt)
endif()
# FIXME: This include directory is an ugly hack. We probably want to fix this.
# as soon as we get rid of the old build system
@ -17,17 +16,17 @@ target_link_libraries(fdbmonitor PUBLIC Threads::Threads)
# appears to change its behavior (it no longer seems to restart killed
# processes). fdbmonitor is single-threaded anyway.
get_target_property(fdbmonitor_options fdbmonitor COMPILE_OPTIONS)
if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread")
set_property(TARGET fdbmonitor PROPERTY COMPILE_OPTIONS ${fdbmonitor_options})
endif ()
endif()
get_target_property(fdbmonitor_options fdbmonitor LINK_OPTIONS)
if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread")
set_property(TARGET fdbmonitor PROPERTY LINK_OPTIONS ${fdbmonitor_options})
endif ()
endif()
if(GENERATE_DEBUG_PACKAGES)
fdb_install(TARGETS fdbmonitor DESTINATION fdbmonitor COMPONENT server)
@ -51,7 +50,7 @@ add_custom_target(clean_sandbox
add_custom_target(start_sandbox
COMMAND ${CMAKE_BINARY_DIR}/bin/fdbmonitor --conffile ${CMAKE_BINARY_DIR}/sandbox/foundationdb.conf
--lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)
--lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)
add_dependencies(start_sandbox fdbmonitor fdbserver)
@ -61,6 +60,6 @@ if(NOT EXISTS ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh)
endif()
add_custom_target(generate_profile
COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})
COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})
add_dependencies(generate_profile fdbmonitor fdbserver mako fdbcli)

View File

@ -24,8 +24,8 @@
Counter::Counter(std::string const& name, CounterCollection& collection)
: name(name), interval_start(0), last_event(0), interval_sq_time(0), roughness_interval_start(0), interval_delta(0),
interval_start_value(0) {
metric.init(collection.name + "." + (char)toupper(name.at(0)) + name.substr(1), collection.id);
collection.counters.push_back(this);
metric.init(collection.getName() + "." + (char)toupper(name.at(0)) + name.substr(1), collection.getId());
collection.addCounter(this);
}
void Counter::operator+=(Value delta) {
@ -88,36 +88,48 @@ void CounterCollection::logToTraceEvent(TraceEvent& te) const {
}
}
ACTOR Future<Void> traceCounters(std::string traceEventName,
UID traceEventID,
double interval,
CounterCollection* counters,
std::string trackLatestName,
std::function<void(TraceEvent&)> decorator) {
wait(delay(0)); // Give an opportunity for all members used in special counters to be initialized
class CounterCollectionImpl {
public:
ACTOR static Future<Void> traceCounters(CounterCollection* counters,
std::string traceEventName,
UID traceEventID,
double interval,
std::string trackLatestName,
std::function<void(TraceEvent&)> decorator) {
wait(delay(0)); // Give an opportunity for all members used in special counters to be initialized
for (ICounter* c : counters->counters)
c->resetInterval();
state Reference<EventCacheHolder> traceEventHolder;
if (!trackLatestName.empty()) {
traceEventHolder = makeReference<EventCacheHolder>(trackLatestName);
}
state double last_interval = now();
loop {
TraceEvent te(traceEventName.c_str(), traceEventID);
te.detail("Elapsed", now() - last_interval);
counters->logToTraceEvent(te);
decorator(te);
for (ICounter* c : counters->counters)
c->resetInterval();
state Reference<EventCacheHolder> traceEventHolder;
if (!trackLatestName.empty()) {
te.trackLatest(traceEventHolder->trackingKey);
traceEventHolder = makeReference<EventCacheHolder>(trackLatestName);
}
last_interval = now();
wait(delay(interval, TaskPriority::FlushTrace));
state double last_interval = now();
loop {
TraceEvent te(traceEventName.c_str(), traceEventID);
te.detail("Elapsed", now() - last_interval);
counters->logToTraceEvent(te);
decorator(te);
if (!trackLatestName.empty()) {
te.trackLatest(traceEventHolder->trackingKey);
}
last_interval = now();
wait(delay(interval, TaskPriority::FlushTrace));
}
}
};
// Public entry point for periodic counter tracing: forwards to the ACTOR
// implementation in CounterCollectionImpl, which emits a TraceEvent named
// `traceEventName` every `interval` seconds and resets each counter's interval
// state. `decorator` can add extra details to each event; `trackLatestName`,
// if non-empty, also publishes the event via trackLatest.
Future<Void> CounterCollection::traceCounters(std::string const& traceEventName,
UID traceEventID,
double interval,
std::string const& trackLatestName,
std::function<void(TraceEvent&)> const& decorator) {
return CounterCollectionImpl::traceCounters(
this, traceEventName, traceEventID, interval, trackLatestName, decorator);
}

View File

@ -757,12 +757,18 @@ Optional<BasicLoadBalancedReply> getBasicLoadBalancedReply(const BasicLoadBalanc
Optional<BasicLoadBalancedReply> getBasicLoadBalancedReply(const void*);
// A simpler version of LoadBalance that does not send second requests where the list of servers are always fresh
//
// If |alternativeChosen| is not null, then atMostOnce must be True, and if the returned future completes successfully
// then *alternativeChosen will be the alternative to which the message was sent. *alternativeChosen must outlive the
// returned future.
ACTOR template <class Interface, class Request, class Multi, bool P>
Future<REPLY_TYPE(Request)> basicLoadBalance(Reference<ModelInterface<Multi>> alternatives,
RequestStream<Request, P> Interface::*channel,
Request request = Request(),
TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint,
AtMostOnce atMostOnce = AtMostOnce::False) {
AtMostOnce atMostOnce = AtMostOnce::False,
int* alternativeChosen = nullptr) {
ASSERT(alternativeChosen == nullptr || atMostOnce == AtMostOnce::True);
setReplyPriority(request, taskID);
if (!alternatives)
return Never();
@ -791,6 +797,9 @@ Future<REPLY_TYPE(Request)> basicLoadBalance(Reference<ModelInterface<Multi>> al
useAlt = (nextAlt + alternatives->size() - 1) % alternatives->size();
stream = &alternatives->get(useAlt, channel);
if (alternativeChosen != nullptr) {
*alternativeChosen = useAlt;
}
if (!IFailureMonitor::failureMonitor().getState(stream->getEndpoint()).failed)
break;
nextAlt = (nextAlt + 1) % alternatives->size();

View File

@ -67,17 +67,37 @@ struct Traceable<ICounter*> : std::true_type {
}
};
struct CounterCollection {
CounterCollection(std::string name, std::string id = std::string()) : name(name), id(id) {}
std::vector<struct ICounter*> counters, counters_to_remove;
~CounterCollection() {
for (auto c : counters_to_remove)
c->remove();
}
class CounterCollection {
friend class CounterCollectionImpl;
std::string name;
std::string id;
std::vector<struct ICounter*> counters, countersToRemove;
public:
CounterCollection(std::string const& name, std::string const& id = std::string()) : name(name), id(id) {}
~CounterCollection() {
for (auto c : countersToRemove)
c->remove();
}
void addCounter(ICounter* counter) { counters.push_back(counter); }
// Call remove method on this counter in ~CounterCollection
void markForRemoval(ICounter* counter) { countersToRemove.push_back(counter); }
std::string const& getName() const { return name; }
std::string const& getId() const { return id; }
void logToTraceEvent(TraceEvent& te) const;
Future<Void> traceCounters(
std::string const& traceEventName,
UID traceEventID,
double interval,
std::string const& trackLatestName = std::string(),
std::function<void(TraceEvent&)> const& decorator = [](auto& te) {});
};
struct Counter final : ICounter, NonCopyable {
@ -131,8 +151,8 @@ struct Traceable<Counter> : std::true_type {
template <class F>
struct SpecialCounter final : ICounter, FastAllocated<SpecialCounter<F>>, NonCopyable {
SpecialCounter(CounterCollection& collection, std::string const& name, F&& f) : name(name), f(f) {
collection.counters.push_back(this);
collection.counters_to_remove.push_back(this);
collection.addCounter(this);
collection.markForRemoval(this);
}
void remove() override { delete this; }
@ -162,14 +182,6 @@ static void specialCounter(CounterCollection& collection, std::string const& nam
new SpecialCounter<F>(collection, name, std::move(f));
}
Future<Void> traceCounters(
std::string const& traceEventName,
UID const& traceEventID,
double const& interval,
CounterCollection* const& counters,
std::string const& trackLatestName = std::string(),
std::function<void(TraceEvent&)> const& decorator = [](TraceEvent& te) {});
class LatencyBands {
public:
LatencyBands(std::string name, UID id, double loggingInterval)
@ -180,7 +192,7 @@ public:
if (bands.size() == 0) {
ASSERT(!cc && !filteredCount);
cc = std::make_unique<CounterCollection>(name, id.toString());
logger = traceCounters(name, id, loggingInterval, cc.get(), id.toString() + "/" + name);
logger = cc->traceCounters(name, id, loggingInterval, id.toString() + "/" + name);
filteredCount = std::make_unique<Counter>("Filtered", *cc);
insertBand(std::numeric_limits<double>::infinity());
}

View File

@ -54,6 +54,7 @@ public:
FailDisk,
RebootAndDelete,
RebootProcessAndDelete,
RebootProcessAndSwitch,
Reboot,
RebootProcess,
None
@ -104,6 +105,7 @@ public:
bool excluded;
bool cleared;
bool rebooting;
bool drProcess;
std::vector<flowGlobalType> globals;
INetworkConnections* network;
@ -128,8 +130,8 @@ public:
const char* coordinationFolder)
: name(name), coordinationFolder(coordinationFolder), dataFolder(dataFolder), machine(nullptr),
addresses(addresses), address(addresses.address), locality(locality), startingClass(startingClass),
failed(false), excluded(false), cleared(false), rebooting(false), network(net), fault_injection_r(0),
fault_injection_p1(0), fault_injection_p2(0), failedDisk(false) {
failed(false), excluded(false), cleared(false), rebooting(false), drProcess(false), network(net),
fault_injection_r(0), fault_injection_p1(0), fault_injection_p2(0), failedDisk(false) {
uid = deterministicRandom()->randomUniqueID();
}
@ -283,7 +285,8 @@ public:
ProcessClass startingClass,
const char* dataFolder,
const char* coordinationFolder,
ProtocolVersion protocol) = 0;
ProtocolVersion protocol,
bool drProcess) = 0;
virtual void killProcess(ProcessInfo* machine, KillType) = 0;
virtual void rebootProcess(Optional<Standalone<StringRef>> zoneId, bool allProcesses) = 0;
virtual void rebootProcess(ProcessInfo* process, KillType kt) = 0;
@ -304,6 +307,7 @@ public:
KillType kt,
bool forceKill = false,
KillType* ktFinal = nullptr) = 0;
virtual bool killAll(KillType kt, bool forceKill = false, KillType* ktFinal = nullptr) = 0;
// virtual KillType getMachineKillState( UID zoneID ) = 0;
virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses,
std::vector<ProcessInfo*> const& deadProcesses,
@ -390,6 +394,13 @@ public:
return clearedAddresses.find(address) != clearedAddresses.end();
}
void switchCluster(NetworkAddress const& address) { switchedCluster[address] = !switchedCluster[address]; }
bool hasSwitchedCluster(NetworkAddress const& address) const {
return switchedCluster.find(address) != switchedCluster.end() ? switchedCluster.at(address) : false;
}
void toggleGlobalSwitchCluster() { globalSwitchedCluster = !globalSwitchedCluster; }
bool globalHasSwitchedCluster() const { return globalSwitchedCluster; }
void excludeAddress(NetworkAddress const& address) {
excludedAddresses[address]++;
TraceEvent("ExcludeAddress").detail("Address", address).detail("Value", excludedAddresses[address]);
@ -540,6 +551,8 @@ private:
std::set<Optional<Standalone<StringRef>>> swapsDisabled;
std::map<NetworkAddress, int> excludedAddresses;
std::map<NetworkAddress, int> clearedAddresses;
std::map<NetworkAddress, bool> switchedCluster;
bool globalSwitchedCluster = false;
std::map<NetworkAddress, std::map<std::string, int>> roleAddresses;
std::map<std::string, double> disabledMap;
bool allSwapsDisabled;

View File

@ -1261,7 +1261,8 @@ public:
ProcessClass startingClass,
const char* dataFolder,
const char* coordinationFolder,
ProtocolVersion protocol) override {
ProtocolVersion protocol,
bool drProcess) override {
ASSERT(locality.machineId().present());
MachineInfo& machine = machines[locality.machineId().get()];
if (!machine.machineId.present())
@ -1311,6 +1312,7 @@ public:
m->excluded = g_simulator->isExcluded(NetworkAddress(ip, port, true, false));
m->cleared = g_simulator->isCleared(addresses.address);
m->protocolVersion = protocol;
m->drProcess = drProcess;
m->setGlobal(enTDMetrics, (flowGlobalType)&m->tdmetrics);
if (FLOW_KNOBS->ENABLE_CHAOS_FEATURES) {
@ -1324,7 +1326,8 @@ public:
.detail("Address", m->address)
.detail("MachineId", m->locality.machineId())
.detail("Excluded", m->excluded)
.detail("Cleared", m->cleared);
.detail("Cleared", m->cleared)
.detail("DrProcess", m->drProcess);
if (std::string(name) == "remote flow process") {
protectedAddresses.insert(m->address);
@ -1794,6 +1797,15 @@ public:
}
return result;
}
bool killAll(KillType kt, bool forceKill, KillType* ktFinal) override {
bool result = false;
for (auto& machine : machines) {
if (killMachine(machine.second.machineId, kt, forceKill, ktFinal)) {
result = true;
}
}
return result;
}
bool killMachine(Optional<Standalone<StringRef>> machineId,
KillType kt,
bool forceKill,
@ -1816,6 +1828,7 @@ public:
}
int processesOnMachine = 0;
bool isMainCluster = true; // false for machines running DR processes
KillType originalKt = kt;
// Reboot if any of the processes are protected and count the number of processes not rebooting
@ -1824,6 +1837,9 @@ public:
kt = Reboot;
if (!process->rebooting)
processesOnMachine++;
if (process->drProcess) {
isMainCluster = false;
}
}
// Do nothing, if no processes to kill
@ -1950,8 +1966,13 @@ public:
probe::context::sim2,
probe::assert::simOnly);
// Check if any processes on machine are rebooting
if (processesOnMachine != processesPerMachine && kt >= RebootAndDelete) {
if (isMainCluster && originalKt == RebootProcessAndSwitch) {
// When killing processes with the RebootProcessAndSwitch kill
// type, processes in the original cluster should be rebooted in
// order to kill any zombie processes.
kt = KillType::Reboot;
} else if (processesOnMachine != processesPerMachine && kt != RebootProcessAndSwitch) {
// Check if any processes on machine are rebooting
CODE_PROBE(true,
"Attempted reboot, but the target did not have all of its processes running",
probe::context::sim2,
@ -1968,24 +1989,6 @@ public:
return false;
}
// Check if any processes on machine are rebooting
if (processesOnMachine != processesPerMachine) {
CODE_PROBE(true,
"Attempted reboot and kill, but the target did not have all of its processes running",
probe::context::sim2,
probe::assert::simOnly);
TraceEvent(SevWarn, "AbortedKill")
.detail("KillType", kt)
.detail("MachineId", machineId)
.detail("Reason", "Machine processes does not match number of processes per machine")
.detail("Processes", processesOnMachine)
.detail("ProcessesPerMachine", processesPerMachine)
.backtrace();
if (ktFinal)
*ktFinal = None;
return false;
}
TraceEvent("KillMachine")
.detail("MachineId", machineId)
.detail("Kt", kt)
@ -2008,7 +2011,7 @@ public:
if (process->startingClass != ProcessClass::TesterClass)
killProcess_internal(process, kt);
}
} else if (kt == Reboot || kt == RebootAndDelete) {
} else if (kt == Reboot || kt == RebootAndDelete || kt == RebootProcessAndSwitch) {
for (auto& process : machines[machineId].processes) {
TraceEvent("KillMachineProcess")
.detail("KillType", kt)
@ -2564,7 +2567,7 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
try {
ASSERT(kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete ||
kt == ISimulator::RebootProcessAndDelete);
kt == ISimulator::RebootProcessAndDelete || kt == ISimulator::RebootProcessAndSwitch);
CODE_PROBE(kt == ISimulator::RebootProcess,
"Simulated process rebooted",
@ -2580,6 +2583,10 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
"Simulated process rebooted with data and coordination state deletion",
probe::assert::simOnly,
probe::context::sim2);
CODE_PROBE(kt == ISimulator::RebootProcessAndSwitch,
"Simulated process rebooted with different cluster file",
probe::assert::simOnly,
probe::context::sim2);
if (p->rebooting || !p->isReliable()) {
TraceEvent(SevDebug, "DoRebootFailed")
@ -2608,6 +2615,8 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
if ((kt == ISimulator::RebootAndDelete) || (kt == ISimulator::RebootProcessAndDelete)) {
p->cleared = true;
g_simulator->clearAddress(p->address);
} else if (kt == ISimulator::RebootProcessAndSwitch) {
g_simulator->switchCluster(p->address);
}
p->shutdownSignal.send(kt);
} catch (Error& e) {

View File

@ -613,7 +613,7 @@ private:
m.param1.startsWith(applyMutationsAddPrefixRange.begin) ||
m.param1.startsWith(applyMutationsRemovePrefixRange.begin) || m.param1.startsWith(tagLocalityListPrefix) ||
m.param1.startsWith(serverTagHistoryPrefix) ||
m.param1.startsWith(testOnlyTxnStateStorePrefixRange.begin) || m.param1 == clusterIdKey) {
m.param1.startsWith(testOnlyTxnStateStorePrefixRange.begin)) {
txnStateStore->set(KeyValueRef(m.param1, m.param2));
}

View File

@ -290,8 +290,8 @@ struct BackupData {
specialCounter(cc, "MsgQ", [this]() { return this->messages.size(); });
specialCounter(cc, "BufferedBytes", [this]() { return this->lock->activePermits(); });
specialCounter(cc, "AvailableBytes", [this]() { return this->lock->available(); });
logger = traceCounters(
"BackupWorkerMetrics", myId, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "BackupWorkerMetrics");
logger =
cc.traceCounters("BackupWorkerMetrics", myId, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "BackupWorkerMetrics");
}
bool pullFinished() const { return endVersion.present() && pulledVersion.get() > endVersion.get(); }

View File

@ -0,0 +1,202 @@
/*
* BlobConnectionProviderTest.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/BlobConnectionProvider.h"
#include "flow/UnitTest.h"
#include "fdbserver/Knobs.h"
#include "flow/actorcompiler.h" // has to be last include
void forceLinkBlobConnectionProviderTests() {}
// Randomized parameters for the blob connection provider unit test below.
// All randomness comes from deterministicRandom() so runs are reproducible.
struct ConnectionProviderTestSettings {
	uint32_t numProviders;
	uint32_t filesPerProvider;
	uint32_t maxFileMemory; // total in-memory budget across all providers' files
	uint32_t maxFileSize;
	uint32_t threads; // number of concurrent worker actors
	bool uniformProviderChoice; // uniform vs skewed provider selection in workers
	double readWriteSplit; // probability a worker iteration writes (vs reads)
	double runtime; // workload phase duration in seconds

	// totals accumulated by the worker actors
	int writeOps;
	int readOps;

	ConnectionProviderTestSettings() {
		numProviders = deterministicRandom()->randomSkewedUInt32(1, 1000);
		// cap per-provider file count so numProviders * filesPerProvider stays bounded
		filesPerProvider =
		    1 + std::min((uint32_t)100, deterministicRandom()->randomSkewedUInt32(10, 10000) / numProviders);

		maxFileMemory = 1024 * 1024 * 1024;
		// size files so the full data set fits in the memory budget, capped at 16MB
		maxFileSize = maxFileMemory / (numProviders * filesPerProvider);
		maxFileSize = deterministicRandom()->randomSkewedUInt32(8, std::min((uint32_t)(16 * 1024 * 1024), maxFileSize));

		threads = deterministicRandom()->randomInt(16, 128);

		uniformProviderChoice = deterministicRandom()->coinflip();
		readWriteSplit = deterministicRandom()->randomInt(1, 10) / 10.0;

		runtime = 60.0;

		writeOps = 0;
		readOps = 0;
	}
};
// Per-provider test state: the provider under test, the (full path, contents)
// pairs written so far, and the object names already claimed (collision guard).
struct ProviderTestData {
	Reference<BlobConnectionProvider> provider;
	// objects successfully written and available for verification reads
	std::vector<std::pair<std::string, Value>> data;
	std::unordered_set<std::string> usedNames;

	ProviderTestData() {}
	explicit ProviderTestData(Reference<BlobConnectionProvider> provider) : provider(provider) {}
};
// Writes one randomly-sized, randomly-named object through `provider` and, on
// success, appends it to provider->data so later reads can verify it.
ACTOR Future<Void> createObject(ConnectionProviderTestSettings* settings, ProviderTestData* provider) {
	// pick object name before wait so no collisions between concurrent writes
	std::string objName;
	loop {
		objName = deterministicRandom()->randomAlphaNumeric(12);
		if (provider->usedNames.insert(objName).second) {
			break;
		}
	}

	int randomDataSize = deterministicRandom()->randomInt(1, settings->maxFileSize);
	state Value data = makeString(randomDataSize);
	deterministicRandom()->randomBytes(mutateString(data), randomDataSize);

	state Reference<BackupContainerFileSystem> bstore;
	state std::string fullPath;
	std::tie(bstore, fullPath) = provider->provider->createForWrite(objName);

	state Reference<IBackupFile> file = wait(bstore->writeFile(fullPath));
	wait(file->append(data.begin(), data.size()));
	wait(file->finish());

	// after write, put in the readable list
	provider->data.push_back({ fullPath, data });

	return Void();
}
// Reads the object at `objFullPath` in full and asserts its size and contents
// exactly match `expectedData`.
ACTOR Future<Void> readAndVerifyObject(ProviderTestData* provider, std::string objFullPath, Value expectedData) {
	Reference<BackupContainerFileSystem> bstore = provider->provider->getForRead(objFullPath);
	state Reference<IAsyncFile> reader = wait(bstore->readFile(objFullPath));

	state Value actualData = makeString(expectedData.size());
	// single read from offset 0 for the whole expected length
	int readSize = wait(reader->read(mutateString(actualData), expectedData.size(), 0));
	ASSERT_EQ(expectedData.size(), readSize);
	ASSERT(expectedData == actualData);

	return Void();
}
// Removes a previously written object from its backing container.
Future<Void> deleteObject(ProviderTestData* provider, std::string objFullPath) {
	auto container = provider->provider->getForRead(objFullPath);
	return container->deleteFile(objFullPath);
}
// One simulated worker: until `runtime` elapses, repeatedly pick a provider
// and either write a new object or read back (and verify) an existing one.
// Any error is printed and rethrown to fail the test.
ACTOR Future<Void> workerThread(ConnectionProviderTestSettings* settings, std::vector<ProviderTestData>* providers) {
	state double endTime = now() + settings->runtime;
	try {
		while (now() < endTime) {
			// randomly pick provider
			int providerIdx;
			if (settings->uniformProviderChoice) {
				providerIdx = deterministicRandom()->randomInt(0, providers->size());
			} else {
				providerIdx = deterministicRandom()->randomSkewedUInt32(0, providers->size());
			}
			ProviderTestData* provider = &(*providers)[providerIdx];

			// randomly pick create or read
			bool doWrite = deterministicRandom()->random01() < settings->readWriteSplit;
			if (provider->usedNames.size() < settings->filesPerProvider && (provider->data.empty() || doWrite)) {
				// create an object
				wait(createObject(settings, provider));
				settings->writeOps++;
			} else if (!provider->data.empty()) {
				// read a random object
				auto& readInfo = provider->data[deterministicRandom()->randomInt(0, provider->data.size())];
				wait(readAndVerifyObject(provider, readInfo.first, readInfo.second));
				settings->readOps++;
			} else {
				// other threads are creating files up to filesPerProvider limit, but none finished yet. Just wait
				wait(delay(0.1));
			}
		}
		return Void();
	} catch (Error& e) {
		// surface the failure loudly before propagating it to the test harness
		fmt::print("WorkerThread Unexpected Error {0}\n", e.name());
		throw e;
	}
}
// Final pass over one provider: verify every object written during the
// workload phase, then delete it from the store.
ACTOR Future<Void> checkAndCleanUp(ProviderTestData* provider) {
	state int i;
	// every claimed name should have produced exactly one readable object
	ASSERT(provider->usedNames.size() == provider->data.size());

	for (i = 0; i < provider->data.size(); i++) {
		auto& readInfo = provider->data[i];
		wait(readAndVerifyObject(provider, readInfo.first, readInfo.second));
		wait(deleteObject(provider, provider->data[i].first));
	}

	return Void();
}
// maybe this should be a workload instead?
// Unit test: builds many randomized BlobConnectionProviders, hammers them with
// concurrent read/write worker actors for a fixed time, then verifies and
// deletes every object written.
TEST_CASE("/fdbserver/blob/connectionprovider") {
	state ConnectionProviderTestSettings settings;

	state std::vector<ProviderTestData> providers;
	providers.reserve(settings.numProviders);
	for (int i = 0; i < settings.numProviders; i++) {
		std::string nameStr = std::to_string(i);
		BlobMetadataDomainName name(nameStr);
		// each provider is backed by its own randomly generated blob metadata
		auto metadata = createRandomTestBlobMetadata(SERVER_KNOBS->BG_URL, i, name);
		providers.emplace_back(BlobConnectionProvider::newBlobConnectionProvider(metadata));
	}
	fmt::print("BlobConnectionProviderTest\n");

	// workload phase: settings.threads concurrent workers for settings.runtime seconds
	state std::vector<Future<Void>> futures;
	futures.reserve(settings.threads);
	for (int i = 0; i < settings.threads; i++) {
		futures.push_back(workerThread(&settings, &providers));
	}

	wait(waitForAll(futures));

	fmt::print("BlobConnectionProviderTest workload phase complete with {0} files and {1} reads\n",
	           settings.writeOps,
	           settings.readOps);

	// verification phase: re-read and then delete every written object
	futures.clear();
	futures.reserve(providers.size());
	for (int i = 0; i < providers.size(); i++) {
		futures.push_back(checkAndCleanUp(&providers[i]));
	}

	wait(waitForAll(futures));

	fmt::print("BlobConnectionProviderTest check and cleanup phase complete\n");
	return Void();
}

View File

@ -296,7 +296,7 @@ struct BlobManagerStats {
specialCounter(cc, "HardBoundaries", [mergeHardBoundaries]() { return mergeHardBoundaries->size(); });
specialCounter(cc, "SoftBoundaries", [mergeBoundaries]() { return mergeBoundaries->size(); });
specialCounter(cc, "BlockedAssignments", [this]() { return this->blockedAssignments; });
logger = traceCounters("BlobManagerMetrics", id, interval, &cc, "BlobManagerMetrics");
logger = cc.traceCounters("BlobManagerMetrics", id, interval, "BlobManagerMetrics");
}
};

View File

@ -84,6 +84,15 @@ struct GranuleStartState {
Optional<GranuleHistory> history;
};
// TODO: add more (blob file request cost, in-memory mutations vs blob delta file, etc...)
struct GranuleReadStats {
int64_t deltaBytesRead;
void reset() { deltaBytesRead = 0; }
GranuleReadStats() { reset(); }
};
struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {
KeyRange keyRange;
@ -120,11 +129,74 @@ struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {
AssignBlobRangeRequest originalReq;
GranuleReadStats readStats;
bool rdcCandidate;
Promise<Void> runRDC;
void resume() {
if (resumeSnapshot.canBeSet()) {
resumeSnapshot.send(Void());
}
}
void resetReadStats() {
rdcCandidate = false;
readStats.reset();
runRDC.reset();
}
// determine eligibility (>1) and priority for re-snapshotting this granule
double weightRDC() {
	// ratio of read amp to write amp that would be incurred by re-snapshotting now
	int64_t lastSnapshotSize = (files.snapshotFiles.empty()) ? 0 : files.snapshotFiles.back().length;
	// floor tiny/absent snapshots at half the target size so freshly created
	// granules don't look artificially cheap to re-snapshot
	int64_t minSnapshotSize = SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES / 2;
	lastSnapshotSize = std::max(minSnapshotSize, lastSnapshotSize);

	int64_t writeAmp = lastSnapshotSize + bufferedDeltaBytes + bytesInNewDeltaFiles;
	// read amp is deltaBytesRead. Read amp must be READ_FACTOR times larger than write amp
	return (1.0 * readStats.deltaBytesRead) / (writeAmp * SERVER_KNOBS->BG_RDC_READ_FACTOR);
}
// Cheap pre-filter for read-driven compaction: delta bytes read since the
// last snapshot must exceed READ_FACTOR times the delta bytes written.
bool isEligibleRDC() {
	// granule should be reasonably read-hot to be eligible
	int64_t bytesWritten = bufferedDeltaBytes + bytesInNewDeltaFiles;
	return bytesWritten * SERVER_KNOBS->BG_RDC_READ_FACTOR < readStats.deltaBytesRead;
}
// Accumulates the delta bytes this read had to merge and decides whether the
// granule newly becomes a read-driven compaction (RDC) candidate. Returns
// true at most once per snapshot cycle — when the granule first crosses the
// eligibility and weight thresholds — so the caller can trigger the RDC scan.
bool updateReadStats(Version readVersion, const BlobGranuleChunkRef& chunk) {
	// Only update stats for re-compacting for at-latest reads that have to do snapshot + delta merge
	if (!SERVER_KNOBS->BG_ENABLE_READ_DRIVEN_COMPACTION || !chunk.snapshotFile.present() ||
	    pendingSnapshotVersion != durableSnapshotVersion.get() || readVersion <= pendingSnapshotVersion) {
		return false;
	}
	// reads that merged no deltas contribute nothing
	if (chunk.newDeltas.empty() && chunk.deltaFiles.empty()) {
		return false;
	}

	readStats.deltaBytesRead += chunk.newDeltas.expectedSize();
	for (auto& it : chunk.deltaFiles) {
		readStats.deltaBytesRead += it.length;
	}

	if (rdcCandidate) {
		// already flagged this cycle; don't re-trigger
		return false;
	}

	if (isEligibleRDC() && weightRDC() > 1.0) {
		rdcCandidate = true;
		CODE_PROBE(true, "Granule read triggering read-driven compaction");
		if (BW_DEBUG) {
			fmt::print("Triggering read-driven compaction of [{0} - {1})\n",
			           keyRange.begin.printable(),
			           keyRange.end.printable());
		}
		return true;
	}
	return false;
}
inline bool doReadDrivenCompaction() { return runRDC.isSet(); }
};
struct GranuleRangeMetadata {
@ -200,6 +272,7 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
NotifiedVersion grvVersion;
Promise<Void> fatalError;
Promise<Void> simInjectFailure;
Promise<Void> doReadDrivenCompaction;
Reference<FlowLock> initialSnapshotLock;
Reference<FlowLock> resnapshotLock;
@ -293,6 +366,13 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
return stats.estimatedMaxResidentMemory >= memoryFullThreshold;
}
void triggerReadDrivenCompaction() {
Promise<Void> doRDC = doReadDrivenCompaction;
if (doRDC.canBeSet()) {
doRDC.send(Void());
}
}
bool maybeInjectTargetedRestart() {
// inject a BW restart at most once per test
if (g_network->isSimulated() && !g_simulator->speedUpSimulation &&
@ -1107,7 +1187,6 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
}
retries++;
CODE_PROBE(true, "Granule initial snapshot failed");
// FIXME: why can't we supress error event?
TraceEvent(retries < 10 ? SevDebug : SevWarn, "BlobGranuleInitialSnapshotRetry", bwData->id)
.error(err)
.detail("Granule", metadata->keyRange)
@ -2043,6 +2122,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
metadata->pendingDeltaVersion = startVersion;
metadata->bufferedDeltaVersion = startVersion;
metadata->knownCommittedVersion = startVersion;
metadata->resetReadStats();
Reference<ChangeFeedData> cfData = makeReference<ChangeFeedData>(bwData->db.getPtr());
@ -2185,6 +2265,10 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
nextForceFlush = metadata->forceFlushVersion.whenAtLeast(lastForceFlushVersion + 1);
}
when(wait(metadata->runRDC.getFuture())) {
// return control flow back to the triggering actor before continuing
wait(delay(0));
}
}
} catch (Error& e) {
// only error we should expect here is when we finish consuming old change feed
@ -2311,6 +2395,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
startState.granuleID,
inFlightFiles.empty() ? Future<Void>(Void())
: success(inFlightFiles.back().future));
metadata->resetReadStats();
}
// reset force flush state, requests should retry and add it back once feed is ready
forceFlushVersions.clear();
@ -2419,20 +2504,20 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// The force flush contract is a version cannot be put in forceFlushVersion unless the change feed
// is already whenAtLeast that version
bool forceFlush = !forceFlushVersions.empty() && forceFlushVersions.back() > metadata->pendingDeltaVersion;
bool doReadDrivenFlush = !metadata->currentDeltas.empty() && metadata->doReadDrivenCompaction();
CODE_PROBE(forceFlush, "Force flushing granule");
if (metadata->bufferedDeltaBytes >= SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES || forceFlush) {
if (metadata->bufferedDeltaBytes >= SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES || forceFlush ||
doReadDrivenFlush) {
TraceEvent(SevDebug, "BlobGranuleDeltaFile", bwData->id)
.detail("Granule", metadata->keyRange)
.detail("Version", lastDeltaVersion);
// sanity check for version order
if (forceFlush) {
if (forceFlush || doReadDrivenFlush) {
if (lastDeltaVersion == invalidVersion) {
lastDeltaVersion = metadata->currentDeltas.empty() ? metadata->pendingDeltaVersion
: metadata->currentDeltas.back().version;
lastDeltaVersion = metadata->bufferedDeltaVersion;
}
if (lastDeltaVersion < forceFlushVersions.back()) {
if (!forceFlushVersions.empty() && lastDeltaVersion < forceFlushVersions.back()) {
if (BW_DEBUG) {
fmt::print("Granule [{0} - {1}) force flushing delta version {2} -> {3}\n",
metadata->keyRange.begin.printable(),
@ -2444,13 +2529,6 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
}
if (!metadata->currentDeltas.empty()) {
if (lastDeltaVersion < metadata->currentDeltas.back().version) {
fmt::print("Granule [{0} - {1}) LDV {2} < DeltaBack {3}\n",
metadata->keyRange.begin.printable(),
metadata->keyRange.end.printable(),
lastDeltaVersion,
metadata->currentDeltas.back().version);
}
ASSERT(lastDeltaVersion >= metadata->currentDeltas.back().version);
ASSERT(metadata->pendingDeltaVersion < metadata->currentDeltas.front().version);
} else {
@ -2507,6 +2585,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// add new pending delta file
ASSERT(metadata->pendingDeltaVersion < lastDeltaVersion);
metadata->pendingDeltaVersion = lastDeltaVersion;
ASSERT(metadata->bufferedDeltaVersion <= lastDeltaVersion);
metadata->bufferedDeltaVersion = lastDeltaVersion; // In case flush was forced at non-mutation version
metadata->bytesInNewDeltaFiles += metadata->bufferedDeltaBytes;
@ -2528,6 +2607,9 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// Wait on delta file starting here. If we have too many pending delta file writes, we need to not
// continue to consume from the change feed, as that will pile on even more delta files to write
wait(startDeltaFileWrite);
} else if (metadata->doReadDrivenCompaction()) {
ASSERT(metadata->currentDeltas.empty());
snapshotEligible = true;
}
// FIXME: if we're still reading from old change feed, we should probably compact if we're
@ -2535,7 +2617,8 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// yet
// If we have enough delta files, try to re-snapshot
if (snapshotEligible && metadata->bytesInNewDeltaFiles >= SERVER_KNOBS->BG_DELTA_BYTES_BEFORE_COMPACT) {
if (snapshotEligible && (metadata->doReadDrivenCompaction() ||
metadata->bytesInNewDeltaFiles >= SERVER_KNOBS->BG_DELTA_BYTES_BEFORE_COMPACT)) {
if (BW_DEBUG && !inFlightFiles.empty()) {
fmt::print("Granule [{0} - {1}) ready to re-snapshot at {2} after {3} > {4} bytes, "
"waiting for "
@ -2583,6 +2666,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// reset metadata
metadata->bytesInNewDeltaFiles = 0;
metadata->resetReadStats();
// If we have more than one snapshot file and that file is unblocked (committedVersion >=
// snapshotVersion), wait for it to finish
@ -3740,6 +3824,11 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
}
}
}
// don't update read stats on a summarize read
if (metadata->updateReadStats(req.readVersion, chunk)) {
bwData->triggerReadDrivenCompaction();
}
}
rep.chunks.push_back(rep.arena, chunk);
@ -4554,6 +4643,74 @@ ACTOR Future<Void> runGRVChecks(Reference<BlobWorkerData> bwData) {
}
}
// Candidate granule for read-driven compaction, ranked by its RDC weight.
struct RDCEntry {
	double weight;
	Reference<GranuleMetadata> granule;
	RDCEntry(double weight, Reference<GranuleMetadata> granule) : weight(weight), granule(granule) {}
};

// for a top-k algorithm, we actually want a min-heap, so reverse the sort order
struct OrderForTopK {
	// Strict weak ordering that ranks higher-weight entries as "less", so
	// std::priority_queue keeps the smallest weight at top() (a min-heap).
	// Previously this returned the double difference `b.weight - a.weight`
	// converted to bool, which is true whenever the weights differ at all —
	// not a valid strict weak ordering for std::priority_queue's Compare.
	bool operator()(RDCEntry const& a, RDCEntry const& b) const { return a.weight > b.weight; }
};

typedef std::priority_queue<RDCEntry, std::vector<RDCEntry>, OrderForTopK> TopKPQ;
// Background actor: whenever a read flags a granule as a read-driven
// compaction (RDC) candidate, scan all granules, keep the top
// BLOB_WORKER_RDC_PARALLELISM candidates by weight in a min-heap, and kick
// off their re-snapshots; keep looping until no candidates remain.
ACTOR Future<Void> runReadDrivenCompaction(Reference<BlobWorkerData> bwData) {
	state bool processedAll = true;
	loop {
		if (processedAll) {
			// idle until some read marks a granule as a candidate
			wait(bwData->doReadDrivenCompaction.getFuture());
			bwData->doReadDrivenCompaction.reset();
			wait(delay(0));
		}

		TopKPQ topK;

		// FIXME: possible to scan candidates instead of all granules?
		int candidates = 0;
		auto allRanges = bwData->granuleMetadata.intersectingRanges(normalKeys);
		for (auto& it : allRanges) {
			if (it.value().activeMetadata.isValid() && it.value().activeMetadata->cancelled.canBeSet()) {
				auto metadata = it.value().activeMetadata;
				// only consider granules that are flagged, still eligible, not already
				// running an RDC, and durable through their previous snapshot
				if (metadata->rdcCandidate && metadata->isEligibleRDC() && metadata->runRDC.canBeSet() &&
				    metadata->pendingSnapshotVersion == metadata->durableSnapshotVersion.get()) {
					candidates++;
					double weight = metadata->weightRDC();
					if (weight > 1.0 &&
					    (topK.size() < SERVER_KNOBS->BLOB_WORKER_RDC_PARALLELISM || weight > topK.top().weight)) {
						if (topK.size() == SERVER_KNOBS->BLOB_WORKER_RDC_PARALLELISM) {
							// min-heap: evict the smallest to keep only the top k weights
							topK.pop();
						}
						topK.push(RDCEntry(weight, metadata));
					}
				}
			}
		}

		CODE_PROBE(candidates > topK.size(), "Too many read-driven compaction candidates for one cycle");

		std::vector<Future<Void>> futures;
		futures.reserve(topK.size());
		while (!topK.empty()) {
			++bwData->stats.readDrivenCompactions;
			Promise<Void> runRDC = topK.top().granule->runRDC;
			ASSERT(runRDC.canBeSet());
			// compaction is done when the granule's durable snapshot version
			// advances, or the granule is cancelled
			Future<Void> waitForSnapshotComplete = topK.top().granule->durableSnapshotVersion.whenAtLeast(
			                                          topK.top().granule->durableSnapshotVersion.get() + 1) ||
			                                      topK.top().granule->cancelled.getFuture();
			futures.push_back(waitForSnapshotComplete);
			topK.pop();
			runRDC.send(Void());
		}
		processedAll = futures.empty();
		if (!futures.empty()) {
			// wait at least one second to throttle this actor a bit
			wait(waitForAll(futures) && delay(1.0));
		}
	}
}
// FIXME: better way to do this?
// monitor system keyspace for new tenants
ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData) {
@ -4891,6 +5048,7 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
self->addActor.send(waitFailureServer(bwInterf.waitFailure.getFuture()));
self->addActor.send(runGRVChecks(self));
self->addActor.send(monitorTenants(self));
self->addActor.send(runReadDrivenCompaction(self));
state Future<Void> selfRemoved = monitorRemoval(self);
if (g_network->isSimulated() && BUGGIFY_WITH_PROB(0.25)) {
self->addActor.send(simForceFileWriteContention(self));
@ -5024,13 +5182,22 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
ASSERT(false);
throw internal_error();
}
when(wait(selfRemoved || self->simInjectFailure.getFuture())) {
when(wait(selfRemoved)) {
if (BW_DEBUG) {
printf("Blob worker detected removal. Exiting...\n");
}
TraceEvent("BlobWorkerRemoved", self->id);
break;
}
when(wait(self->simInjectFailure.getFuture())) {
// wait to let triggering actor finish to prevent weird shutdown races
wait(delay(0));
if (BW_DEBUG) {
printf("Blob worker simulation injected failure. Exiting...\n");
}
TraceEvent("BlobWorkerSimRemoved", self->id);
break;
}
when(wait(self->fatalError.getFuture())) {
TraceEvent(SevError, "BlobWorkerActorCollectionFatalErrorNotError", self->id);
ASSERT(false);

View File

@ -1060,8 +1060,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
.detail("GrvProxies", req.grvProxies.size())
.detail("RecoveryCount", req.recoveryCount)
.detail("Stalled", req.recoveryStalled)
.detail("OldestBackupEpoch", req.logSystemConfig.oldestBackupEpoch)
.detail("ClusterId", req.clusterId);
.detail("OldestBackupEpoch", req.logSystemConfig.oldestBackupEpoch);
// make sure the request comes from an active database
auto db = &self->db;
@ -1120,8 +1119,9 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
// Construct the client information
if (db->clientInfo->get().commitProxies != req.commitProxies ||
db->clientInfo->get().grvProxies != req.grvProxies ||
db->clientInfo->get().tenantMode != db->config.tenantMode || db->clientInfo->get().clusterId != req.clusterId ||
db->clientInfo->get().tenantMode != db->config.tenantMode ||
db->clientInfo->get().isEncryptionEnabled != SERVER_KNOBS->ENABLE_ENCRYPTION ||
db->clientInfo->get().clusterId != db->serverInfo->get().client.clusterId ||
db->clientInfo->get().clusterType != db->clusterType ||
db->clientInfo->get().metaclusterName != db->metaclusterName ||
db->clientInfo->get().encryptKeyProxy != db->serverInfo->get().encryptKeyProxy) {
@ -1133,9 +1133,9 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
.detail("ReqCPs", req.commitProxies)
.detail("TenantMode", db->clientInfo->get().tenantMode.toString())
.detail("ReqTenantMode", db->config.tenantMode.toString())
.detail("ClusterId", db->clientInfo->get().clusterId)
.detail("ReqClusterId", req.clusterId)
.detail("EncryptionEnabled", SERVER_KNOBS->ENABLE_ENCRYPTION)
.detail("ClusterId", db->serverInfo->get().client.clusterId)
.detail("ClientClusterId", db->clientInfo->get().clusterId)
.detail("ClusterType", db->clientInfo->get().clusterType)
.detail("ReqClusterType", db->clusterType)
.detail("MetaclusterName", db->clientInfo->get().metaclusterName)
@ -1149,7 +1149,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
clientInfo.commitProxies = req.commitProxies;
clientInfo.grvProxies = req.grvProxies;
clientInfo.tenantMode = TenantAPI::tenantModeForClusterType(db->clusterType, db->config.tenantMode);
clientInfo.clusterId = req.clusterId;
clientInfo.clusterId = db->serverInfo->get().client.clusterId;
clientInfo.clusterType = db->clusterType;
clientInfo.metaclusterName = db->metaclusterName;
db->clientInfo->set(clientInfo);
@ -1228,6 +1228,17 @@ ACTOR Future<Void> registerWorker(RegisterWorkerRequest req,
std::vector<NetworkAddress> coordinatorAddresses = wait(cs.tryResolveHostnames());
const WorkerInterface& w = req.wi;
if (req.clusterId.present() && self->clusterId->get().present() && req.clusterId != self->clusterId->get() &&
req.processClass != ProcessClass::TesterClass) {
TraceEvent(g_network->isSimulated() ? SevWarnAlways : SevError, "WorkerBelongsToExistingCluster", self->id)
.detail("WorkerClusterId", req.clusterId)
.detail("ClusterControllerClusterId", self->clusterId->get())
.detail("WorkerId", w.id())
.detail("ProcessId", w.locality.processId());
req.reply.sendError(invalid_cluster_id());
return Void();
}
ProcessClass newProcessClass = req.processClass;
auto info = self->id_worker.find(w.locality.processId());
ClusterControllerPriorityInfo newPriorityInfo = req.priorityInfo;
@ -2964,13 +2975,64 @@ ACTOR Future<Void> metaclusterMetricsUpdater(ClusterControllerData* self) {
}
}
// Update the DBInfo state with this process's cluster ID. If this process does
// not have a cluster ID and one does not exist in the database, generate one.
ACTOR Future<Void> updateClusterId(ClusterControllerData* self) {
	state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(self->cx);
	loop {
		try {
			// Cluster ID (if any) recovered from this process's durable state.
			state Optional<UID> durableClusterId = self->clusterId->get();
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr->setOption(FDBTransactionOptions::LOCK_AWARE);
			Optional<Value> clusterIdVal = wait(tr->get(clusterIdKey));
			if (clusterIdVal.present()) {
				UID clusterId = BinaryReader::fromStringRef<UID>(clusterIdVal.get(), IncludeVersion());
				if (durableClusterId.present()) {
					// If this process has an on disk file for the cluster ID,
					// verify it matches the value in the database.
					ASSERT(clusterId == durableClusterId.get());
				} else {
					// Otherwise, write the cluster ID in the database to the
					// DbInfo object so all clients will learn of the cluster
					// ID.
					durableClusterId = clusterId;
				}
			} else if (!durableClusterId.present()) {
				// No cluster ID exists in the database or on the machine. Generate and set one.
				ASSERT(!durableClusterId.present());
				durableClusterId = deterministicRandom()->randomUniqueID();
				tr->set(clusterIdKey, BinaryWriter::toValue(durableClusterId.get(), IncludeVersion()));
				wait(tr->commit());
			}
			// Publish the cluster ID through ServerDBInfo and ClientDBInfo.
			// NOTE(review): the fresh random `id` fields presumably ensure the
			// AsyncVars register a change and notify listeners — confirm.
			auto serverInfo = self->db.serverInfo->get();
			if (!serverInfo.client.clusterId.isValid()) {
				ASSERT(durableClusterId.present());
				serverInfo.id = deterministicRandom()->randomUniqueID();
				serverInfo.client.clusterId = durableClusterId.get();
				self->db.serverInfo->set(serverInfo);

				ClientDBInfo clientInfo = self->db.clientInfo->get();
				clientInfo.id = deterministicRandom()->randomUniqueID();
				clientInfo.clusterId = durableClusterId.get();
				self->db.clientInfo->set(clientInfo);
			}
			return Void();
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
}
ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
Future<Void> leaderFail,
ServerCoordinators coordinators,
LocalityData locality,
ConfigDBType configDBType,
Future<Void> recoveredDiskFiles) {
state ClusterControllerData self(interf, locality, coordinators);
Future<Void> recoveredDiskFiles,
Reference<AsyncVar<Optional<UID>>> clusterId) {
state ClusterControllerData self(interf, locality, coordinators, clusterId);
state Future<Void> coordinationPingDelay = delay(SERVER_KNOBS->WORKER_COORDINATION_PING_DELAY);
state uint64_t step = 0;
state Future<ErrorOr<Void>> error = errorOr(actorCollection(self.addActor.getFuture()));
@ -3007,11 +3069,11 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
self.addActor.send(monitorConsistencyScan(&self));
self.addActor.send(metaclusterMetricsUpdater(&self));
self.addActor.send(dbInfoUpdater(&self));
self.addActor.send(traceCounters("ClusterControllerMetrics",
self.id,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&self.clusterControllerMetrics,
self.id.toString() + "/ClusterControllerMetrics"));
self.addActor.send(updateClusterId(&self));
self.addActor.send(self.clusterControllerMetrics.traceCounters("ClusterControllerMetrics",
self.id,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
self.id.toString() + "/ClusterControllerMetrics"));
self.addActor.send(traceRole(Role::CLUSTER_CONTROLLER, interf.id()));
// printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str());
@ -3124,7 +3186,8 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
LocalityData locality,
ConfigDBType configDBType,
Future<Void> recoveredDiskFiles) {
Future<Void> recoveredDiskFiles,
Reference<AsyncVar<Optional<UID>>> clusterId) {
loop {
state ClusterControllerFullInterface cci;
state bool inRole = false;
@ -3151,7 +3214,8 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
startRole(Role::CLUSTER_CONTROLLER, cci.id(), UID());
inRole = true;
wait(clusterControllerCore(cci, leaderFail, coordinators, locality, configDBType, recoveredDiskFiles));
wait(clusterControllerCore(
cci, leaderFail, coordinators, locality, configDBType, recoveredDiskFiles, clusterId));
}
} catch (Error& e) {
if (inRole)
@ -3175,7 +3239,8 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
Future<Void> recoveredDiskFiles,
LocalityData locality,
ConfigDBType configDBType) {
ConfigDBType configDBType,
Reference<AsyncVar<Optional<UID>>> clusterId) {
// Defer this wait optimization of cluster configuration has 'Encryption data at-rest' enabled.
// Encryption depends on available of EncryptKeyProxy (EKP) FDB role to enable fetch/refresh of encryption keys
@ -3195,8 +3260,14 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
loop {
try {
ServerCoordinators coordinators(connRecord, configDBType);
wait(clusterController(
coordinators, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType, recoveredDiskFiles));
wait(clusterController(coordinators,
currentCC,
hasConnected,
asyncPriorityInfo,
locality,
configDBType,
recoveredDiskFiles,
clusterId));
hasConnected = true;
} catch (Error& e) {
if (e.code() != error_code_coordinators_changed)
@ -3214,7 +3285,8 @@ TEST_CASE("/fdbserver/clustercontroller/updateWorkerHealth") {
state ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
state NetworkAddress workerAddress(IPAddress(0x01010101), 1);
state NetworkAddress badPeer1(IPAddress(0x02020202), 1);
state NetworkAddress badPeer2(IPAddress(0x03030303), 1);
@ -3309,7 +3381,8 @@ TEST_CASE("/fdbserver/clustercontroller/updateRecoveredWorkers") {
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
NetworkAddress worker1(IPAddress(0x01010101), 1);
NetworkAddress worker2(IPAddress(0x11111111), 1);
NetworkAddress badPeer1(IPAddress(0x02020202), 1);
@ -3358,7 +3431,8 @@ TEST_CASE("/fdbserver/clustercontroller/getDegradationInfo") {
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
NetworkAddress worker(IPAddress(0x01010101), 1);
NetworkAddress badPeer1(IPAddress(0x02020202), 1);
NetworkAddress badPeer2(IPAddress(0x03030303), 1);
@ -3511,7 +3585,8 @@ TEST_CASE("/fdbserver/clustercontroller/recentRecoveryCountDueToHealth") {
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
ASSERT_EQ(data.recentRecoveryCountDueToHealth(), 0);
@ -3532,7 +3607,8 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerRecoveryDueToDegradedServer
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
NetworkAddress master(IPAddress(0x01010101), 1);
NetworkAddress tlog(IPAddress(0x02020202), 1);
NetworkAddress satelliteTlog(IPAddress(0x03030303), 1);
@ -3668,7 +3744,8 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerFailoverDueToDegradedServer
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
NetworkAddress master(IPAddress(0x01010101), 1);
NetworkAddress tlog(IPAddress(0x02020202), 1);
NetworkAddress satelliteTlog(IPAddress(0x03030303), 1);

View File

@ -297,7 +297,6 @@ ACTOR Future<Void> newTLogServers(Reference<ClusterRecoveryData> self,
self->logSystem = Reference<ILogSystem>(); // Cancels the actors in the previous log system.
Reference<ILogSystem> newLogSystem = wait(oldLogSystem->newEpoch(recr,
fRemoteWorkers,
self->clusterId,
self->configuration,
self->cstate.myDBState.recoveryCount + 1,
self->recoveryTransactionVersion,
@ -311,7 +310,6 @@ ACTOR Future<Void> newTLogServers(Reference<ClusterRecoveryData> self,
self->logSystem = Reference<ILogSystem>(); // Cancels the actors in the previous log system.
Reference<ILogSystem> newLogSystem = wait(oldLogSystem->newEpoch(recr,
Never(),
self->clusterId,
self->configuration,
self->cstate.myDBState.recoveryCount + 1,
self->recoveryTransactionVersion,
@ -347,7 +345,6 @@ ACTOR Future<Void> newSeedServers(Reference<ClusterRecoveryData> self,
isr.storeType = self->configuration.storageServerStoreType;
isr.reqId = deterministicRandom()->randomUniqueID();
isr.interfaceId = deterministicRandom()->randomUniqueID();
isr.clusterId = self->clusterId;
isr.initialClusterVersion = self->recoveryTransactionVersion;
ErrorOr<InitializeStorageReply> newServer = wait(recruits.storageServers[idx].storage.tryGetReply(isr));
@ -477,7 +474,6 @@ ACTOR Future<Void> trackTlogRecovery(Reference<ClusterRecoveryData> self,
self->dbgid)
.detail("StatusCode", RecoveryStatus::fully_recovered)
.detail("Status", RecoveryStatus::names[RecoveryStatus::fully_recovered])
.detail("ClusterId", self->clusterId)
.trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_GENERATION_EVENT_NAME).c_str(),
@ -786,7 +782,6 @@ Future<Void> sendMasterRegistration(ClusterRecoveryData* self,
masterReq.priorCommittedLogServers = priorCommittedLogServers;
masterReq.recoveryState = self->recoveryState;
masterReq.recoveryStalled = self->recruitmentStalled->get();
masterReq.clusterId = self->clusterId;
return brokenPromiseToNever(self->clusterController.registerMaster.getReply(masterReq));
}
@ -1350,8 +1345,7 @@ ACTOR Future<Void> recoverFrom(Reference<ClusterRecoveryData> self,
Reference<ILogSystem> oldLogSystem,
std::vector<StorageServerInterface>* seedServers,
std::vector<Standalone<CommitTransactionRef>>* initialConfChanges,
Future<Version> poppedTxsVersion,
bool* clusterIdExists) {
Future<Version> poppedTxsVersion) {
TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(), self->dbgid)
.detail("StatusCode", RecoveryStatus::reading_transaction_system_state)
.detail("Status", RecoveryStatus::names[RecoveryStatus::reading_transaction_system_state])
@ -1375,16 +1369,6 @@ ACTOR Future<Void> recoverFrom(Reference<ClusterRecoveryData> self,
debug_checkMaxRestoredVersion(UID(), self->lastEpochEnd, "DBRecovery");
// Generate a cluster ID to uniquely identify the cluster if it doesn't
// already exist in the txnStateStore.
Optional<Value> clusterId = self->txnStateStore->readValue(clusterIdKey).get();
*clusterIdExists = clusterId.present();
if (!clusterId.present()) {
self->clusterId = deterministicRandom()->randomUniqueID();
} else {
self->clusterId = BinaryReader::fromStringRef<UID>(clusterId.get(), Unversioned());
}
// Ordinarily we pass through this loop once and recover. We go around the loop if recovery stalls for more than a
// second, a provisional master is initialized, and an "emergency transaction" is submitted that might change the
// configuration so that we can finish recovery.
@ -1540,7 +1524,6 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
state Future<Void> logChanges;
state Future<Void> minRecoveryDuration;
state Future<Version> poppedTxsVersion;
state bool clusterIdExists = false;
loop {
Reference<ILogSystem> oldLogSystem = oldLogSystems->get();
@ -1556,13 +1539,9 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
self->registrationTrigger.trigger();
choose {
when(wait(oldLogSystem ? recoverFrom(self,
oldLogSystem,
&seedServers,
&initialConfChanges,
poppedTxsVersion,
std::addressof(clusterIdExists))
: Never())) {
when(wait(oldLogSystem
? recoverFrom(self, oldLogSystem, &seedServers, &initialConfChanges, poppedTxsVersion)
: Never())) {
reg.cancel();
break;
}
@ -1591,7 +1570,6 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
.detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction])
.detail("PrimaryLocality", self->primaryLocality)
.detail("DcId", self->masterInterface.locality.dcId())
.detail("ClusterId", self->clusterId)
.trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
// Recovery transaction
@ -1680,11 +1658,6 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
}
}
// Write cluster ID into txnStateStore if it is missing.
if (!clusterIdExists) {
tr.set(recoveryCommitRequest.arena, clusterIdKey, BinaryWriter::toValue(self->clusterId, Unversioned()));
}
applyMetadataMutations(SpanContext(),
self->dbgid,
recoveryCommitRequest.arena,

View File

@ -28,7 +28,7 @@
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/NativeAPI.actor.h"
@ -1616,6 +1616,14 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
self->toCommit.writeTypedMessage(idempotencyIdSet);
});
for (const auto& m : pProxyCommitData->idempotencyClears) {
auto& tags = pProxyCommitData->tagsForKey(m.param1);
self->toCommit.addTags(tags);
// TODO(nwijetunga): Encrypt these mutations
self->toCommit.writeTypedMessage(m);
}
pProxyCommitData->idempotencyClears = Standalone<VectorRef<MutationRef>>();
self->toCommit.saveTags(self->writtenTags);
pProxyCommitData->stats.mutations += self->mutationCount;
@ -1864,10 +1872,14 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
// Reset all to zero, used to track the correct index of each commitTransacitonRef on each resolver
std::fill(self->nextTr.begin(), self->nextTr.end(), 0);
std::unordered_map<uint8_t, int16_t> idCountsForKey;
for (int t = 0; t < self->trs.size(); t++) {
auto& tr = self->trs[t];
if (self->committed[t] == ConflictBatch::TransactionCommitted && (!self->locked || tr.isLockAware())) {
ASSERT_WE_THINK(self->commitVersion != invalidVersion);
if (self->trs[t].idempotencyId.valid()) {
idCountsForKey[uint8_t(t >> 8)] += 1;
}
tr.reply.send(CommitID(self->commitVersion, t, self->metadataVersionAfter));
} else if (self->committed[t] == ConflictBatch::TransactionTooOld) {
tr.reply.sendError(transaction_too_old());
@ -1914,6 +1926,11 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
}
}
for (auto [highOrderBatchIndex, count] : idCountsForKey) {
pProxyCommitData->expectedIdempotencyIdCountForKey.send(
ExpectedIdempotencyIdCountForKey{ self->commitVersion, count, highOrderBatchIndex });
}
++pProxyCommitData->stats.commitBatchOut;
pProxyCommitData->stats.txnCommitOut += self->trs.size();
pProxyCommitData->stats.txnConflicts += self->trs.size() - self->commitCount;
@ -2469,6 +2486,96 @@ ACTOR Future<Void> reportTxnTagCommitCost(UID myID,
}
}
namespace {

// Bookkeeping for one (commitVersion, batchIndexHighByte) group of idempotency
// ids: counts expire requests received versus the number expected before the
// group's ids may be cleared (see idempotencyIdsExpireServer below).
struct ExpireServerEntry {
	// Time (from now()) this entry was first initialized; used to purge stale
	// entries that never complete. Initialized to 0 — previously this was the
	// only member without an in-class initializer, so map-created entries
	// carried an indeterminate value until it was assigned.
	int64_t timeReceived = 0;
	// Number of expire requests expected for this group; 0 until reported.
	int expectedCount = 0;
	// Number of expire requests received so far.
	int receivedCount = 0;
	// Whether timeReceived has been stamped for this entry.
	bool initialized = false;
};

// Identifies a batch of idempotency ids by commit version plus the high-order
// byte of the batch index.
struct IdempotencyKey {
	Version version;
	uint8_t highOrderBatchIndex;
	bool operator==(const IdempotencyKey& other) const {
		return version == other.version && highOrderBatchIndex == other.highOrderBatchIndex;
	}
};

} // namespace
namespace std {
// Hash specialization so IdempotencyKey can be used as an unordered_map key;
// folds the member hashes together with boost::hash_combine.
template <>
struct hash<IdempotencyKey> {
	std::size_t operator()(const IdempotencyKey& key) const {
		std::size_t seed = 0;
		boost::hash_combine(seed, std::hash<Version>{}(key.version));
		boost::hash_combine(seed, std::hash<uint8_t>{}(key.highOrderBatchIndex));
		return seed;
	}
};
} // namespace std
// Coordinates cleanup of idempotency ids. Two streams feed one table keyed by
// (commitVersion, batchIndexHighByte):
//   - expireIdempotencyId: one expire request for the group has been processed;
//   - expectedIdempotencyIdCountForKey: the commit path reports how many expire
//     requests to expect for the group.
// When receivedCount reaches expectedCount, a ClearRange mutation covering the
// group's idempotency key range is appended to *idempotencyClears. Entries
// older than IDEMPOTENCY_ID_IN_MEMORY_LIFETIME are periodically purged.
ACTOR static Future<Void> idempotencyIdsExpireServer(
    Database db,
    PublicRequestStream<ExpireIdempotencyIdRequest> expireIdempotencyId,
    PromiseStream<ExpectedIdempotencyIdCountForKey> expectedIdempotencyIdCountForKey,
    Standalone<VectorRef<MutationRef>>* idempotencyClears) {
	state std::unordered_map<IdempotencyKey, ExpireServerEntry> idStatus;
	state std::unordered_map<IdempotencyKey, ExpireServerEntry>::iterator iter;
	state int64_t purgeBefore;
	state IdempotencyKey key;
	// Points at the entry touched by the branch taken below; valid only for the
	// request branches (the purge branch `continue`s past its use).
	state ExpireServerEntry* status = nullptr;
	state Future<Void> purgeOld = Void();
	loop {
		choose {
			when(ExpireIdempotencyIdRequest req = waitNext(expireIdempotencyId.getFuture())) {
				key = IdempotencyKey{ req.commitVersion, req.batchIndexHighByte };
				status = &idStatus[key];
				status->receivedCount += 1;
				CODE_PROBE(status->expectedCount == 0, "ExpireIdempotencyIdRequest received before count is known");
				if (status->expectedCount > 0) {
					ASSERT_LE(status->receivedCount, status->expectedCount);
				}
			}
			when(ExpectedIdempotencyIdCountForKey req = waitNext(expectedIdempotencyIdCountForKey.getFuture())) {
				key = IdempotencyKey{ req.commitVersion, req.batchIndexHighByte };
				status = &idStatus[key];
				// The expected count for a group is reported exactly once.
				ASSERT_EQ(status->expectedCount, 0);
				status->expectedCount = req.idempotencyIdCount;
			}
			when(wait(purgeOld)) {
				purgeOld = delay(SERVER_KNOBS->IDEMPOTENCY_ID_IN_MEMORY_LIFETIME);
				purgeBefore = now() - SERVER_KNOBS->IDEMPOTENCY_ID_IN_MEMORY_LIFETIME;
				for (iter = idStatus.begin(); iter != idStatus.end();) {
					// We have exclusive access to idStatus in this when block, so iter will still be valid after the
					// wait
					wait(yield());
					if (iter->second.timeReceived < purgeBefore) {
						iter = idStatus.erase(iter);
					} else {
						++iter;
					}
				}
				// Skip the completion check below: `status` may reference an
				// entry this purge just erased.
				continue;
			}
		}
		if (status->initialized) {
			if (status->receivedCount == status->expectedCount) {
				// All expected expire requests arrived: queue a clear of the
				// group's idempotency-id key range and forget the entry.
				auto keyRange =
				    makeIdempotencySingleKeyRange(idempotencyClears->arena(), key.version, key.highOrderBatchIndex);
				idempotencyClears->push_back(idempotencyClears->arena(),
				                             MutationRef(MutationRef::ClearRange, keyRange.begin, keyRange.end));
				idStatus.erase(key);
			}
		} else {
			// First time this group is seen: stamp it for the purge scan.
			status->timeReceived = now();
			status->initialized = true;
		}
	}
}
namespace {
struct TransactionStateResolveContext {
@ -2733,6 +2840,10 @@ ACTOR Future<Void> commitProxyServerCore(CommitProxyInterface proxy,
addActor.send(rejoinServer(proxy, &commitData));
addActor.send(ddMetricsRequestServer(proxy, db));
addActor.send(reportTxnTagCommitCost(proxy.id(), db, &commitData.ssTrTagCommitCost));
addActor.send(idempotencyIdsExpireServer(openDBOnServer(db),
proxy.expireIdempotencyId,
commitData.expectedIdempotencyIdCountForKey,
&commitData.idempotencyClears));
// wait for txnStateStore recovery
wait(success(commitData.txnStateStore->readValue(StringRef())));

View File

@ -183,8 +183,8 @@ class ConfigBroadcasterImpl {
id(deterministicRandom()->randomUniqueID()), cc("ConfigBroadcaster"), compactRequest("CompactRequest", cc),
successfulChangeRequest("SuccessfulChangeRequest", cc), failedChangeRequest("FailedChangeRequest", cc),
snapshotRequest("SnapshotRequest", cc) {
logger = traceCounters(
"ConfigBroadcasterMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigBroadcasterMetrics");
logger = cc.traceCounters(
"ConfigBroadcasterMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigBroadcasterMetrics");
}
void addChanges(Standalone<VectorRef<VersionedConfigMutationRef>> const& changes,

View File

@ -812,7 +812,7 @@ public:
successfulCommits("SuccessfulCommits", cc), failedCommits("FailedCommits", cc),
setMutations("SetMutations", cc), clearMutations("ClearMutations", cc),
getValueRequests("GetValueRequests", cc), getGenerationRequests("GetGenerationRequests", cc) {
logger = traceCounters("ConfigNodeMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigNode");
logger = cc.traceCounters("ConfigNodeMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigNode");
TraceEvent(SevInfo, "StartingConfigNode", id).detail("KVStoreAlreadyExists", kvStore.exists());
}

View File

@ -29,7 +29,7 @@
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "fdbserver/DataDistribution.actor.h"
#include "fdbserver/RatekeeperInterface.h"
#include "fdbserver/ServerDBInfo.h"
@ -393,6 +393,7 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
state double rateLimiterStartTime = now();
state int64_t bytesReadInthisRound = 0;
state bool resume = !(restart || shuffleShards);
state bool testResult = true;
state double dbSize = 100e12;
if (g_network->isSimulated()) {
@ -710,7 +711,7 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
(!storageServerInterfaces[j].isTss() &&
!storageServerInterfaces[firstValidServer].isTss())) {
testFailure("Data inconsistent", performQuiescentChecks, true);
return false;
testResult = false;
}
}
}
@ -949,7 +950,7 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
}
*bytesReadInPrevRound = bytesReadInthisRound;
return true;
return testResult;
}
ACTOR Future<Void> runDataValidationCheck(ConsistencyScanData* self) {

View File

@ -212,7 +212,7 @@ ShardSizeBounds calculateShardSizeBounds(const KeyRange& keys,
const Reference<AsyncVar<Optional<ShardMetrics>>>& shardMetrics,
const BandwidthStatus& bandwidthStatus,
PromiseStream<KeyRange> readHotShard) {
ShardSizeBounds bounds;
ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack();
if (shardMetrics->get().present()) {
auto bytes = shardMetrics->get().get().metrics.bytes;
auto readBandwidthStatus = getReadBandwidthStatus(shardMetrics->get().get().metrics);
@ -259,21 +259,7 @@ ShardSizeBounds calculateShardSizeBounds(const KeyRange& keys,
} else {
ASSERT(false);
}
} else {
bounds.max.bytes = -1;
bounds.min.bytes = -1;
bounds.permittedError.bytes = -1;
bounds.max.bytesPerKSecond = bounds.max.infinity;
bounds.min.bytesPerKSecond = 0;
bounds.permittedError.bytesPerKSecond = bounds.permittedError.infinity;
bounds.max.bytesReadPerKSecond = bounds.max.infinity;
bounds.min.bytesReadPerKSecond = 0;
bounds.permittedError.bytesReadPerKSecond = bounds.permittedError.infinity;
}
bounds.max.iosPerKSecond = bounds.max.infinity;
bounds.min.iosPerKSecond = 0;
bounds.permittedError.iosPerKSecond = bounds.permittedError.infinity;
return bounds;
}

View File

@ -895,7 +895,7 @@ public:
if (maxPriority < SERVER_KNOBS->PRIORITY_TEAM_FAILED) {
std::pair<std::vector<ShardsAffectedByTeamFailure::Team>,
std::vector<ShardsAffectedByTeamFailure::Team>>
teams = self->shardsAffectedByTeamFailure->getTeamsFor(shards[i]);
teams = self->shardsAffectedByTeamFailure->getTeamsForFirstShard(shards[i]);
for (int j = 0; j < teams.first.size() + teams.second.size(); j++) {
// t is the team in primary DC or the remote DC
auto& t =
@ -2284,15 +2284,12 @@ public:
self->recruitingIds.insert(interfaceId);
self->recruitingLocalities.insert(candidateWorker.worker.stableAddress());
UID clusterId = wait(self->getClusterId());
state InitializeStorageRequest isr;
isr.storeType = recruitTss ? self->configuration.testingStorageServerStoreType
: self->configuration.storageServerStoreType;
isr.seedTag = invalidTag;
isr.reqId = deterministicRandom()->randomUniqueID();
isr.interfaceId = interfaceId;
isr.clusterId = clusterId;
// if tss, wait for pair ss to finish and add its id to isr. If pair fails, don't recruit tss
state bool doRecruit = true;
@ -3470,10 +3467,6 @@ Future<Void> DDTeamCollection::monitorHealthyTeams() {
return DDTeamCollectionImpl::monitorHealthyTeams(this);
}
Future<UID> DDTeamCollection::getClusterId() {
return db->getClusterId();
}
Future<UID> DDTeamCollection::getNextWigglingServerID() {
Optional<Value> localityKey;
Optional<Value> localityValue;

View File

@ -221,21 +221,6 @@ class DDTxnProcessorImpl {
}
}
// Reads the cluster ID from the system keyspace (clusterIdKey), retrying the
// transaction on retryable errors. The key is expected to exist; asserts if it
// does not.
ACTOR static Future<UID> getClusterId(Database cx) {
	state Transaction tr(cx);
	loop {
		try {
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			Optional<Value> clusterId = wait(tr.get(clusterIdKey));
			ASSERT(clusterId.present());
			return BinaryReader::fromStringRef<UID>(clusterId.get(), Unversioned());
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}
// Read keyservers, return unique set of teams
ACTOR static Future<Reference<InitialDataDistribution>> getInitialDataDistribution(
Database cx,
@ -319,6 +304,7 @@ class DDTxnProcessorImpl {
for (int i = 0; i < dms.size(); ++i) {
auto dataMove = std::make_shared<DataMove>(decodeDataMoveValue(dms[i].value), true);
const DataMoveMetaData& meta = dataMove->meta;
ASSERT(!meta.ranges.empty());
for (const UID& id : meta.src) {
auto& dc = server_dc[id];
if (std::find(remoteDcIds.begin(), remoteDcIds.end(), dc) != remoteDcIds.end()) {
@ -340,11 +326,11 @@ class DDTxnProcessorImpl {
std::sort(dataMove->primaryDest.begin(), dataMove->primaryDest.end());
std::sort(dataMove->remoteDest.begin(), dataMove->remoteDest.end());
auto ranges = result->dataMoveMap.intersectingRanges(meta.range);
auto ranges = result->dataMoveMap.intersectingRanges(meta.ranges.front());
for (auto& r : ranges) {
ASSERT(!r.value()->valid);
}
result->dataMoveMap.insert(meta.range, std::move(dataMove));
result->dataMoveMap.insert(meta.ranges.front(), std::move(dataMove));
++numDataMoves;
}
@ -675,10 +661,6 @@ Future<int> DDTxnProcessor::tryUpdateReplicasKeyForDc(const Optional<Key>& dcId,
return DDTxnProcessorImpl::tryUpdateReplicasKeyForDc(cx, dcId, storageTeamSize);
}
Future<UID> DDTxnProcessor::getClusterId() const {
return DDTxnProcessorImpl::getClusterId(cx);
}
Future<Void> DDTxnProcessor::waitDDTeamInfoPrintSignal() const {
return DDTxnProcessorImpl::waitDDTeamInfoPrintSignal(cx);
}
@ -763,7 +745,7 @@ std::vector<DDShardInfo> DDMockTxnProcessor::getDDShardInfos() const {
KeyRangeRef curRange = it->range();
DDShardInfo info(curRange.begin);
auto teams = mgs->shardMapping->getTeamsFor(curRange);
auto teams = mgs->shardMapping->getTeamsForFirstShard(curRange);
if (!teams.first.empty() && !teams.second.empty()) {
CODE_PROBE(true, "Mock InitialDataDistribution In-Flight shard");
info.hasDest = true;
@ -816,7 +798,7 @@ Future<Void> DDMockTxnProcessor::removeStorageServer(const UID& serverID,
const Optional<UID>& tssPairID,
const MoveKeysLock& lock,
const DDEnabledState* ddEnabledState) const {
ASSERT(mgs->allShardRemovedFromServer(serverID));
ASSERT(mgs->allShardsRemovedFromServer(serverID));
mgs->allServers.erase(serverID);
return Void();
}
@ -862,16 +844,14 @@ Future<HealthMetrics> DDMockTxnProcessor::getHealthMetrics(bool detailed) const
return Future<HealthMetrics>();
}
// FIXME: finish implementation
Future<Standalone<VectorRef<KeyRef>>> DDMockTxnProcessor::splitStorageMetrics(
const KeyRange& keys,
const StorageMetrics& limit,
const StorageMetrics& estimated,
const Optional<int>& minSplitBytes) const {
return Future<Standalone<VectorRef<KeyRef>>>();
return mgs->splitStorageMetrics(keys, limit, estimated, minSplitBytes);
}
// FIXME: finish implementation
Future<std::pair<Optional<StorageMetrics>, int>> DDMockTxnProcessor::waitStorageMetrics(
const KeyRange& keys,
const StorageMetrics& min,
@ -879,7 +859,7 @@ Future<std::pair<Optional<StorageMetrics>, int>> DDMockTxnProcessor::waitStorage
const StorageMetrics& permittedError,
int shardLimit,
int expectedShardCount) const {
return Future<std::pair<Optional<StorageMetrics>, int>>();
return mgs->waitStorageMetrics(keys, min, max, permittedError, shardLimit, expectedShardCount);
}
// FIXME: finish implementation
@ -910,7 +890,7 @@ void DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params,
ASSERT(params.finishMoveKeysParallelismLock->take().isReady());
// get source and dest teams
auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsFor(params.keys);
auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsForFirstShard(params.keys);
ASSERT_EQ(destTeams.size(), 0);
if (destTeams.front() != ShardsAffectedByTeamFailure::Team{ params.destinationTeam, true }) {

View File

@ -53,6 +53,20 @@
#include "fdbserver/DDSharedContext.h"
#include "flow/actorcompiler.h" // This must be the last #include.
// Sentinel bounds applied to a shard before any metrics have been tracked for
// it: byte bounds are -1 (size unknown), while the rate bounds are fully open
// (min 0, max/permittedError infinity) so no threshold fires until real
// measurements arrive.
ShardSizeBounds ShardSizeBounds::shardSizeBoundsBeforeTrack() {
	StorageMetrics maxMetrics;
	maxMetrics.bytes = -1;
	maxMetrics.bytesPerKSecond = StorageMetrics::infinity;
	maxMetrics.iosPerKSecond = StorageMetrics::infinity;
	maxMetrics.bytesReadPerKSecond = StorageMetrics::infinity;

	StorageMetrics minMetrics;
	minMetrics.bytes = -1;
	minMetrics.bytesPerKSecond = 0;
	minMetrics.iosPerKSecond = 0;
	minMetrics.bytesReadPerKSecond = 0;

	StorageMetrics errorMetrics;
	errorMetrics.bytes = -1;
	errorMetrics.bytesPerKSecond = StorageMetrics::infinity;
	errorMetrics.iosPerKSecond = StorageMetrics::infinity;
	errorMetrics.bytesReadPerKSecond = StorageMetrics::infinity;

	ShardSizeBounds bounds;
	bounds.max = maxMetrics;
	bounds.min = minMetrics;
	bounds.permittedError = errorMetrics;
	return bounds;
}
struct DDAudit {
DDAudit(UID id, KeyRange range, AuditType type)
: id(id), range(range), type(type), auditMap(AuditPhase::Invalid, allKeys.end), actors(true) {}
@ -76,7 +90,7 @@ void DataMove::validateShard(const DDShardInfo& shard, KeyRangeRef range, int pr
return;
}
ASSERT(this->meta.range.contains(range));
ASSERT(!this->meta.ranges.empty() && this->meta.ranges.front().contains(range));
if (!shard.hasDest) {
TraceEvent(SevError, "DataMoveValidationError")
@ -480,17 +494,21 @@ public:
for (; it != self->initData->dataMoveMap.ranges().end(); ++it) {
const DataMoveMetaData& meta = it.value()->meta;
if (meta.ranges.empty()) {
TraceEvent(SevWarnAlways, "EmptyDataMoveRange", self->ddId).detail("DataMoveMetaData", meta.toString());
continue;
}
if (it.value()->isCancelled() || (it.value()->valid && !SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA)) {
RelocateShard rs(meta.range, DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
RelocateShard rs(meta.ranges.front(), DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
rs.dataMoveId = meta.id;
rs.cancelled = true;
self->relocationProducer.send(rs);
TraceEvent("DDInitScheduledCancelDataMove", self->ddId).detail("DataMove", meta.toString());
} else if (it.value()->valid) {
TraceEvent(SevDebug, "DDInitFoundDataMove", self->ddId).detail("DataMove", meta.toString());
ASSERT(meta.range == it.range());
ASSERT(meta.ranges.front() == it.range());
// TODO: Persist priority in DataMoveMetaData.
RelocateShard rs(meta.range, DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
RelocateShard rs(meta.ranges.front(), DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
rs.dataMoveId = meta.id;
rs.dataMove = it.value();
std::vector<ShardsAffectedByTeamFailure::Team> teams;

View File

@ -1662,3 +1662,43 @@ IDiskQueue* openDiskQueue(std::string basename,
int64_t fileSizeWarningLimit) {
return new DiskQueue_PopUncommitted(basename, ext, dbgid, dqv, fileSizeWarningLimit);
}
// Performance exercise for IDiskQueue: recovers (draining any leftover data),
// then runs 4000 iterations that push a 10 MB payload each time, popping and
// occasionally reading the front entry on every other iteration, with commits
// pipelined one deep (wait on the previous commit while the next is issued).
TEST_CASE("performance/fdbserver/DiskQueue") {
	state IDiskQueue* queue =
	    openDiskQueue("test-", "fdq", deterministicRandom()->randomUniqueID(), DiskQueueVersion::V2);
	// 10 MB payload reused for every push.
	state std::string valueString = std::string(10e6, '.');
	state StringRef valueStr((uint8_t*)valueString.c_str(), 10e6);
	// Locations of pushed-but-not-yet-popped entries, oldest first.
	state std::deque<IDiskQueue::location> locations;
	state int loopCount = 0;
	state Future<Void> lastCommit = Void();
	bool fullyRecovered = wait(queue->initializeRecovery(0));
	if (!fullyRecovered) {
		// Drain any data left over from a previous run, 1 MB at a time.
		loop {
			Standalone<StringRef> h = wait(queue->readNext(1e6));
			if (h.size() < 1e6) {
				break;
			}
		}
	}
	while (loopCount < 4000) {
		if (loopCount % 100 == 0) {
			printf("loop count: %d\n", loopCount);
		}
		// Pop (and sometimes read back) the oldest entry on even iterations.
		if (++loopCount % 2 == 0) {
			state IDiskQueue::location frontLocation = locations.front();
			locations.pop_front();
			if (locations.size() > 10) {
				Standalone<StringRef> r = wait(queue->read(frontLocation, locations.front(), CheckHashes::True));
			}
			queue->pop(frontLocation);
		}
		wait(delay(0.001));
		locations.push_back(queue->push(valueStr));
		// Pipeline commits: issue this iteration's commit, wait on the prior one.
		Future<Void> prevCommit = lastCommit;
		lastCommit = queue->commit();
		wait(prevCommit);
	}
	queue->dispose();
	wait(queue->onClosed());
	return Void();
}

View File

@ -625,7 +625,7 @@ bool isBlobMetadataEligibleForRefresh(const BlobMetadataDetailsRef& blobMetadata
if (BUGGIFY_WITH_PROB(0.01)) {
return true;
}
int64_t nextRefreshCycleTS = currTS + SERVER_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
int64_t nextRefreshCycleTS = currTS + CLIENT_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
return nextRefreshCycleTS > blobMetadata.expireAt || nextRefreshCycleTS > blobMetadata.refreshAt;
}
@ -895,7 +895,7 @@ ACTOR Future<Void> encryptKeyProxyServer(EncryptKeyProxyInterface ekpInterface,
TaskPriority::Worker);
self->blobMetadataRefresher = recurring([&]() { refreshBlobMetadata(self, kmsConnectorInf); },
SERVER_KNOBS->BLOB_METADATA_REFRESH_INTERVAL,
CLIENT_KNOBS->BLOB_METADATA_REFRESH_INTERVAL,
TaskPriority::Worker);
try {

View File

@ -170,7 +170,8 @@ ACTOR Future<int> spawnSimulated(std::vector<std::string> paramList,
ProcessClass(ProcessClass::UnsetClass, ProcessClass::AutoSource),
self->dataFolder.c_str(),
self->coordinationFolder.c_str(), // do we need to customize this coordination folder path?
self->protocolVersion);
self->protocolVersion,
false);
wait(g_simulator->onProcess(child));
state Future<ISimulator::KillType> onShutdown = child->onShutdown();
state Future<ISimulator::KillType> parentShutdown = self->onShutdown();

View File

@ -202,7 +202,8 @@ class GlobalTagThrottlerImpl {
for (const auto& [id, _] : throughput) {
result += getCurrentCost(id, tag).orDefault(0);
}
TraceEvent("GlobalTagThrottler_GetCurrentCost").detail("Tag", printable(tag)).detail("Cost", result);
// FIXME: Disabled due to noisy trace events. Fix the noise and reenabled
//TraceEvent("GlobalTagThrottler_GetCurrentCost").detail("Tag", printable(tag)).detail("Cost", result);
return result;
}
@ -235,10 +236,13 @@ class GlobalTagThrottlerImpl {
return 1.0;
}
auto const transactionRate = stats.get().getTransactionRate();
// FIXME: Disabled due to noisy trace events. Fix the noise and reenabled
/*
TraceEvent("GlobalTagThrottler_GetAverageTransactionCost")
.detail("Tag", tag)
.detail("TransactionRate", transactionRate)
.detail("Cost", cost);
*/
if (transactionRate == 0.0) {
return 1.0;
} else {

View File

@ -154,7 +154,7 @@ struct GrvProxyStats {
return int64_t(100 * this->percentageOfBatchGRVQueueProcessed);
});
logger = traceCounters("GrvProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "GrvProxyMetrics");
logger = cc.traceCounters("GrvProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "GrvProxyMetrics");
for (int i = 0; i < FLOW_KNOBS->BASIC_LOAD_BALANCE_BUCKETS; i++) {
requestBuckets.push_back(0);
}
@ -459,9 +459,9 @@ void dropRequestFromQueue(Deque<GetReadVersionRequest>* queue, GrvProxyStats* st
// Put a GetReadVersion request into the queue corresponding to its priority.
ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo> const> db,
SpannedDeque<GetReadVersionRequest>* systemQueue,
SpannedDeque<GetReadVersionRequest>* defaultQueue,
SpannedDeque<GetReadVersionRequest>* batchQueue,
Deque<GetReadVersionRequest>* systemQueue,
Deque<GetReadVersionRequest>* defaultQueue,
Deque<GetReadVersionRequest>* batchQueue,
FutureStream<GetReadVersionRequest> readVersionRequests,
PromiseStream<Void> GRVTimer,
double* lastGRVTime,
@ -531,7 +531,6 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
stats->txnSystemPriorityStartIn += req.transactionCount;
++stats->systemGRVQueueSize;
systemQueue->push_back(req);
// systemQueue->span.addParent(req.spanContext);
} else if (req.priority >= TransactionPriority::DEFAULT) {
++stats->txnRequestIn;
stats->txnStartIn += req.transactionCount;
@ -542,7 +541,6 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
} else {
defaultQueue->push_back(req);
}
// defaultQueue->span.addParent(req.spanContext);
} else {
// Return error for batch_priority GRV requests
int64_t proxiesCount = std::max((int)db->get().client.grvProxies.size(), 1);
@ -559,7 +557,6 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
} else {
batchQueue->push_back(req);
}
// batchQueue->span.addParent(req.spanContext);
}
}
}
@ -607,7 +604,7 @@ ACTOR Future<Void> lastCommitUpdater(GrvProxyData* self, PromiseStream<Future<Vo
}
}
ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(SpanContext parentSpan,
ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(std::vector<SpanContext> spanContexts,
GrvProxyData* grvProxyData,
uint32_t flags,
Optional<UID> debugID,
@ -620,7 +617,10 @@ ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(SpanContext parentSpan
// before the request returns, so it is committed. (2) No proxy on our list reported committed a higher version
// before this request was received, because then its committedVersion would have been higher,
// and no other proxy could have already committed anything without first ending the epoch
state Span span("GP:getLiveCommittedVersion"_loc, parentSpan);
state Span span("GP:getLiveCommittedVersion"_loc);
for (const SpanContext& spanContext : spanContexts) {
span.addLink(spanContext);
}
++grvProxyData->stats.txnStartBatch;
state double grvStart = now();
@ -826,15 +826,14 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
state GrvTransactionRateInfo batchRateInfo(0);
state GrvProxyTransactionTagThrottler tagThrottler;
state SpannedDeque<GetReadVersionRequest> systemQueue("GP:transactionStarterSystemQueue"_loc);
state SpannedDeque<GetReadVersionRequest> defaultQueue("GP:transactionStarterDefaultQueue"_loc);
state SpannedDeque<GetReadVersionRequest> batchQueue("GP:transactionStarterBatchQueue"_loc);
state Deque<GetReadVersionRequest> systemQueue;
state Deque<GetReadVersionRequest> defaultQueue;
state Deque<GetReadVersionRequest> batchQueue;
state TransactionTagMap<uint64_t> transactionTagCounter;
state PrioritizedTransactionTagMap<ClientTagThrottleLimits> clientThrottledTags;
state PromiseStream<double> normalGRVLatency;
// state Span span;
state int64_t midShardSize = SERVER_KNOBS->MIN_SHARD_BYTES;
getCurrentLineage()->modify(&TransactionLineage::operation) =
@ -911,7 +910,7 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
uint32_t defaultQueueSize = defaultQueue.size();
uint32_t batchQueueSize = batchQueue.size();
while (requestsToStart < SERVER_KNOBS->START_TRANSACTION_MAX_REQUESTS_TO_START) {
SpannedDeque<GetReadVersionRequest>* transactionQueue;
Deque<GetReadVersionRequest>* transactionQueue;
if (!systemQueue.empty()) {
transactionQueue = &systemQueue;
} else if (!defaultQueue.empty()) {
@ -921,7 +920,6 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
} else {
break;
}
// transactionQueue->span.swap(span);
auto& req = transactionQueue->front();
int tc = req.transactionCount;
@ -1017,7 +1015,13 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
int batchGRVProcessed = 0;
for (int i = 0; i < start.size(); i++) {
if (start[i].size()) {
Future<GetReadVersionReply> readVersionReply = getLiveCommittedVersion(SpanContext(),
std::vector<SpanContext> spanContexts;
spanContexts.reserve(start[i].size());
for (const GetReadVersionRequest& request : start[i]) {
spanContexts.push_back(request.spanContext);
}
Future<GetReadVersionReply> readVersionReply = getLiveCommittedVersion(spanContexts,
grvProxyData,
i,
debugID,
@ -1041,7 +1045,6 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
batchGRVProcessed += batchPriTransactionsStarted[i];
}
}
// span = Span(span.location);
grvProxyData->stats.percentageOfDefaultGRVQueueProcessed =
defaultQueueSize ? (double)defaultGRVProcessed / defaultQueueSize : 1;

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "fdbclient/Knobs.h"
#include "fdbserver/GrvProxyTransactionTagThrottler.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // must be last include
@ -28,6 +29,10 @@ void GrvProxyTransactionTagThrottler::DelayedRequest::updateProxyTagThrottledDur
req.proxyTagThrottledDuration = now() - startTime;
}
bool GrvProxyTransactionTagThrottler::DelayedRequest::isMaxThrottled() const {
return now() - startTime > CLIENT_KNOBS->PROXY_MAX_TAG_THROTTLE_DURATION;
}
void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) {
if (rateInfo.present()) {
rateInfo.get().setRate(rate);
@ -36,6 +41,20 @@ void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) {
}
}
bool GrvProxyTransactionTagThrottler::TagQueue::isMaxThrottled() const {
return !requests.empty() && requests.front().isMaxThrottled();
}
void GrvProxyTransactionTagThrottler::TagQueue::rejectRequests() {
CODE_PROBE(true, "GrvProxyTransactionTagThrottler rejecting requests");
while (!requests.empty()) {
auto& delayedReq = requests.front();
delayedReq.updateProxyTagThrottledDuration();
delayedReq.req.reply.sendError(proxy_tag_throttled());
requests.pop_front();
}
}
void GrvProxyTransactionTagThrottler::updateRates(TransactionTagMap<double> const& newRates) {
for (const auto& [tag, rate] : newRates) {
auto it = queues.find(tag);
@ -73,6 +92,7 @@ void GrvProxyTransactionTagThrottler::addRequest(GetReadVersionRequest const& re
// SERVER_KNOBS->ENFORCE_TAG_THROTTLING_ON_PROXIES is enabled, there may be
// unexpected behaviour, because only one tag is used for throttling.
TraceEvent(SevWarnAlways, "GrvProxyTransactionTagThrottler_MultipleTags")
.suppressFor(1.0)
.detail("NumTags", req.tags.size())
.detail("UsingTag", printable(tag));
}
@ -80,8 +100,8 @@ void GrvProxyTransactionTagThrottler::addRequest(GetReadVersionRequest const& re
}
void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed,
SpannedDeque<GetReadVersionRequest>& outBatchPriority,
SpannedDeque<GetReadVersionRequest>& outDefaultPriority) {
Deque<GetReadVersionRequest>& outBatchPriority,
Deque<GetReadVersionRequest>& outDefaultPriority) {
// Pointer to a TagQueue with some extra metadata stored alongside
struct TagQueueHandle {
// Store pointers here to avoid frequent std::unordered_map lookups
@ -140,6 +160,11 @@ void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed,
// Cannot release any more transaction from this tag (don't push the tag queue handle back into
// pqOfQueues)
CODE_PROBE(true, "GrvProxyTransactionTagThrottler throttling transaction");
if (tagQueueHandle.queue->isMaxThrottled()) {
// Requests in this queue have been throttled too long and errors
// should be sent to clients.
tagQueueHandle.queue->rejectRequests();
}
break;
} else {
if (tagQueueHandle.nextSeqNo < nextQueueSeqNo) {
@ -255,8 +280,8 @@ ACTOR static Future<Void> mockFifoClient(GrvProxyTransactionTagThrottler* thrott
}
ACTOR static Future<Void> mockServer(GrvProxyTransactionTagThrottler* throttler) {
state SpannedDeque<GetReadVersionRequest> outBatchPriority("TestGrvProxyTransactionTagThrottler_Batch"_loc);
state SpannedDeque<GetReadVersionRequest> outDefaultPriority("TestGrvProxyTransactionTagThrottler_Default"_loc);
state Deque<GetReadVersionRequest> outBatchPriority;
state Deque<GetReadVersionRequest> outDefaultPriority;
loop {
state double elapsed = (0.009 + 0.002 * deterministicRandom()->random01());
wait(delay(elapsed));
@ -379,8 +404,8 @@ TEST_CASE("/GrvProxyTransactionTagThrottler/Cleanup2") {
throttler.updateRates(TransactionTagMap<double>{});
ASSERT_EQ(throttler.size(), 1);
{
SpannedDeque<GetReadVersionRequest> outBatchPriority("TestGrvProxyTransactionTagThrottler_Batch"_loc);
SpannedDeque<GetReadVersionRequest> outDefaultPriority("TestGrvProxyTransactionTagThrottler_Default"_loc);
Deque<GetReadVersionRequest> outBatchPriority;
Deque<GetReadVersionRequest> outDefaultPriority;
throttler.releaseTransactions(0.1, outBatchPriority, outDefaultPriority);
}
// Calling updates cleans up the queues in throttler

View File

@ -347,8 +347,8 @@ public:
Randomize::False,
g_network->isSimulated() ? IsSimulated::True : IsSimulated::False);
}
logger = traceCounters(
"LocalConfigurationMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "LocalConfigurationMetrics");
logger = cc.traceCounters(
"LocalConfigurationMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "LocalConfigurationMetrics");
}
Future<Void> addChanges(Standalone<VectorRef<VersionedConfigMutationRef>> changes,

View File

@ -190,15 +190,14 @@ struct LogRouterData {
});
specialCounter(cc, "Generation", [this]() { return this->generation; });
specialCounter(cc, "ActivePeekStreams", [this]() { return this->activePeekStreams; });
logger = traceCounters("LogRouterMetrics",
dbgid,
SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
&cc,
"LogRouterMetrics",
[this](TraceEvent& te) {
te.detail("PrimaryPeekLocation", this->primaryPeekLocation);
te.detail("RouterTag", this->routerTag.toString());
});
logger = cc.traceCounters("LogRouterMetrics",
dbgid,
SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
"LogRouterMetrics",
[this](TraceEvent& te) {
te.detail("PrimaryPeekLocation", this->primaryPeekLocation);
te.detail("RouterTag", this->routerTag.toString());
});
}
};

View File

@ -0,0 +1,623 @@
/*
* MockGlobalState.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/MockGlobalState.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/DataDistribution.actor.h"
#include "flow/actorcompiler.h"
// Implementation holder for MockGlobalState: the Flow ACTOR bodies backing its
// public Future-returning methods live here.
class MockGlobalStateImpl {
public:
	// Polls shard locations for `keys` and loops until
	// waitStorageMetricsWithLocation reports a value, retrying after
	// WRONG_SHARD_SERVER_DELAY otherwise.
	ACTOR static Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(MockGlobalState* mgs,
	                                                                                KeyRange keys,
	                                                                                StorageMetrics min,
	                                                                                StorageMetrics max,
	                                                                                StorageMetrics permittedError,
	                                                                                int shardLimit,
	                                                                                int expectedShardCount) {
		state TenantInfo tenantInfo;
		loop {
			auto locations = mgs->getKeyRangeLocations(tenantInfo,
			                                           keys,
			                                           shardLimit,
			                                           Reverse::False,
			                                           SpanContext(),
			                                           Optional<UID>(),
			                                           UseProvisionalProxies::False,
			                                           0)
			                     .get();
			TraceEvent(SevDebug, "MGSWaitStorageMetrics").detail("Phase", "GetLocation");
			// NOTE(xwang): in native API, there's code handling the non-equal situation, but I think in mock world
			// there shouldn't have any delay to update the locations.
			ASSERT_EQ(expectedShardCount, locations.size());
			Optional<StorageMetrics> res =
			    wait(::waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
			if (res.present()) {
				// -1 mirrors the native API's "no shard-limit violation" signal — TODO confirm.
				return std::make_pair(res, -1);
			}
			wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
		}
	}

	// SOMEDAY: reuse the NativeAPI implementation
	// Computes split points for `keys`, backing off while too many shards are
	// returned (same strategy as NativeAPI::splitStorageMetrics).
	ACTOR static Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(MockGlobalState* mgs,
	                                                                       KeyRange keys,
	                                                                       StorageMetrics limit,
	                                                                       StorageMetrics estimated,
	                                                                       Optional<int> minSplitBytes) {
		state TenantInfo tenantInfo;
		loop {
			state std::vector<KeyRangeLocationInfo> locations =
			    mgs->getKeyRangeLocations(tenantInfo,
			                              keys,
			                              CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT,
			                              Reverse::False,
			                              SpanContext(),
			                              Optional<UID>(),
			                              UseProvisionalProxies::False,
			                              0)
			        .get();
			// Same solution to NativeAPI::splitStorageMetrics, wait some merge finished
			if (locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) {
				wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
			}
			Optional<Standalone<VectorRef<KeyRef>>> results =
			    wait(splitStorageMetricsWithLocations(locations, keys, limit, estimated, minSplitBytes));
			if (results.present()) {
				return results.get();
			}
			wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
		}
	}
};
// Implementation holder for MockStorageServer's Flow ACTOR bodies.
class MockStorageServerImpl {
public:
	// Serves a WaitMetricsRequest. Replies with wrong_shard_server (plus load
	// penalty) if this server is not responsible for the requested keys;
	// otherwise forwards to the metrics subsystem with a jittered timeout.
	ACTOR static Future<Void> waitMetricsTenantAware(MockStorageServer* self, WaitMetricsRequest req) {
		if (req.tenantInfo.present() && req.tenantInfo.get().tenantId != TenantInfo::INVALID_TENANT) {
			// TODO(xwang) add support for tenant test, search for tenant entry
			Optional<TenantMapEntry> entry;
			Optional<Key> tenantPrefix = entry.map<Key>([](TenantMapEntry e) { return e.prefix; });
			if (tenantPrefix.present()) {
				// Unreachable until tenant lookup above is implemented (entry is always empty).
				UNREACHABLE();
				// req.keys = req.keys.withPrefix(tenantPrefix.get(), req.arena);
			}
		}
		if (!self->isReadable(req.keys)) {
			self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
		} else {
			wait(self->metrics.waitMetrics(req, delayJittered(SERVER_KNOBS->STORAGE_METRIC_TIMEOUT)));
		}
		return Void();
	}
};
bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) {
	// Every shard overlapping `range` must carry `status` for this to hold.
	auto overlapping = serverKeys.intersectingRanges(range);
	ASSERT(!overlapping.empty()); // at least the range is allKeys
	for (auto shard = overlapping.begin(); shard != overlapping.end(); ++shard) {
		if (shard->cvalue().status != status) {
			return false;
		}
	}
	return true;
}
// Sets the status of all shards overlapping `range` to `status`, implicitly
// splitting shards whose boundaries are not aligned with `range`. When
// `restrictSize` is set, the implicit splits are constrained so the pieces sum
// to the original shard size. Invalid status transitions are logged at SevError.
void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status, bool restrictSize) {
	auto ranges = serverKeys.intersectingRanges(range);
	ASSERT(!ranges.empty());
	// `range` sits strictly inside a single shard: split that shard into three.
	if (ranges.begin().range().contains(range)) {
		CODE_PROBE(true, "Implicitly split single shard to 3 pieces");
		threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize);
		return;
	}
	if (ranges.begin().begin() < range.begin) {
		CODE_PROBE(true, "Implicitly split begin range to 2 pieces");
		twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize);
	}
	// NOTE(review): `ranges.end()` is the past-the-end iterator of the
	// intersecting set; confirm this (rather than the last intersecting range)
	// is the intended range to test and split here.
	if (ranges.end().end() > range.end) {
		CODE_PROBE(true, "Implicitly split end range to 2 pieces");
		twoWayShardSplitting(ranges.end().range(), range.end, ranges.end().cvalue().shardSize, restrictSize);
	}
	ranges = serverKeys.containedRanges(range);
	// now the boundary must be aligned
	ASSERT(ranges.begin().begin() == range.begin);
	ASSERT(ranges.end().end() == range.end);
	// All covered pieces are assigned the combined size of the range.
	uint64_t newSize = 0;
	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
		newSize += it->cvalue().shardSize;
	}
	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
		auto oldStatus = it.value().status;
		if (isStatusTransitionValid(oldStatus, status)) {
			it.value() = ShardInfo{ status, newSize };
		} else if (oldStatus == MockShardStatus::COMPLETED && status == MockShardStatus::INFLIGHT) {
			CODE_PROBE(true, "Shard already on server");
		} else {
			TraceEvent(SevError, "MockShardStatusTransitionError")
			    .detail("From", oldStatus)
			    .detail("To", status)
			    .detail("ID", id)
			    .detail("KeyBegin", range.begin.toHexString())
			    .detail("KeyEnd", range.end.toHexString()); // bug fix: previously logged range.begin twice
		}
	}
	serverKeys.coalesce(range);
}
// split the out range [a, d) based on the inner range's boundary [b, c). The result would be [a,b), [b,c), [c,d). The
// size of the new shards are randomly split from old size of [a, d)
void MockStorageServer::threeWayShardSplitting(KeyRangeRef outerRange,
                                               KeyRangeRef innerRange,
                                               uint64_t outerRangeSize,
                                               bool restrictSize) {
	ASSERT(outerRange.contains(innerRange));
	Key left = outerRange.begin;
	// Randomly generate the 3 piece sizes; the caller guarantees the min/max parameters are always valid.
	// With restrictSize, the upper bounds are chosen so the three pieces sum exactly to outerRangeSize.
	int leftSize = deterministicRandom()->randomInt(
	    SERVER_KNOBS->MIN_SHARD_BYTES,
	    restrictSize ? outerRangeSize - 2 * SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
	int midSize = deterministicRandom()->randomInt(
	    SERVER_KNOBS->MIN_SHARD_BYTES,
	    restrictSize ? outerRangeSize - leftSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
	int rightSize =
	    restrictSize ? outerRangeSize - leftSize - midSize
	                 : deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);

	// The middle piece inherits the status of the original shard; the outer
	// pieces are resized in place.
	serverKeys.insert(innerRange, { serverKeys[left].status, (uint64_t)midSize });
	serverKeys[left].shardSize = leftSize;
	serverKeys[innerRange.end].shardSize = rightSize;
}
// split the range [a,c) with split point b. The result would be [a, b), [b, c). The
// size of the new shards are randomly split from old size of [a, c)
void MockStorageServer::twoWayShardSplitting(KeyRangeRef range,
                                             KeyRef splitPoint,
                                             uint64_t rangeSize,
                                             bool restrictSize) {
	Key left = range.begin;
	// Randomly generate the two piece sizes; the caller guarantees the min/max parameters are always valid.
	// With restrictSize, the pieces are forced to sum exactly to rangeSize.
	int leftSize = deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES,
	                                                restrictSize ? rangeSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1
	                                                             : SERVER_KNOBS->MAX_SHARD_BYTES);
	int rightSize =
	    restrictSize ? rangeSize - leftSize
	                 : deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
	// The right piece inherits the status of the original shard; the left piece
	// is resized in place.
	serverKeys.rawInsert(splitPoint, { serverKeys[left].status, (uint64_t)rightSize });
	serverKeys[left].shardSize = leftSize;
}
void MockStorageServer::removeShard(KeyRangeRef range) {
	// The removed range must exactly match a single existing shard boundary.
	auto contained = serverKeys.containedRanges(range);
	ASSERT(contained.begin().range() == range);
	serverKeys.rawErase(range);
}
uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const {
	// Accumulate the recorded size of every shard overlapping `range`.
	uint64_t total = 0;
	auto overlapping = serverKeys.intersectingRanges(range);
	for (auto shard = overlapping.begin(); shard != overlapping.end(); ++shard) {
		total += shard->cvalue().shardSize;
	}
	return total;
}
// Register a background actor whose lifetime is tied to this mock server.
void MockStorageServer::addActor(Future<Void> future) {
	actors.add(future);
}
// No-op stub: split-point requests are not modeled by the mock server.
void MockStorageServer::getSplitPoints(const SplitRangeRequest& req) {}
// Thin wrapper delegating to the ACTOR implementation in MockStorageServerImpl.
Future<Void> MockStorageServer::waitMetricsTenantAware(const WaitMetricsRequest& req) {
	return MockStorageServerImpl::waitMetricsTenantAware(this, req);
}
// No-op stub: storage-metrics requests are not modeled by the mock server.
void MockStorageServer::getStorageMetrics(const GetStorageMetricsRequest& req) {}
// Starts the mock server: assigns a random locality, brings up its endpoints,
// and returns the long-running actor that serves storage-metrics requests.
Future<Void> MockStorageServer::run() {
	// Random zone/machine ids make each mock server look like a distinct process.
	ssi.locality = LocalityData(Optional<Standalone<StringRef>>(),
	                            Standalone<StringRef>(deterministicRandom()->randomUniqueID().toString()),
	                            Standalone<StringRef>(deterministicRandom()->randomUniqueID().toString()),
	                            Optional<Standalone<StringRef>>());
	ssi.initEndpoints();
	ssi.startAcceptingRequests();
	TraceEvent("MockStorageServerStart").detail("Address", ssi.address());
	return serveStorageMetricsRequests(this, ssi);
}
void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) {
ASSERT(conf.storageTeamSize > 0);
configuration = conf;
std::vector<UID> serverIds;
for (int i = 1; i <= conf.storageTeamSize; ++i) {
UID id = indexToUID(i);
serverIds.push_back(id);
allServers[id] = MockStorageServer(id, defaultDiskSpace);
allServers[id].serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 });
}
shardMapping->assignRangeToTeams(allKeys, { Team(serverIds, true) });
}
// Register an additional mock storage server backed by `server`'s interface.
void MockGlobalState::addStorageServer(StorageServerInterface server, uint64_t diskSpace) {
	allServers[server.id()] = MockStorageServer(server, diskSpace);
}
bool MockGlobalState::serverIsSourceForShard(const UID& serverId, KeyRangeRef shard, bool inFlightShard) {
if (!allServers.count(serverId))
return false;
// check serverKeys
auto& mss = allServers.at(serverId);
if (!mss.allShardStatusEqual(shard, MockShardStatus::COMPLETED)) {
return false;
}
// check keyServers
auto teams = shardMapping->getTeamsForFirstShard(shard);
if (inFlightShard) {
return std::any_of(teams.second.begin(), teams.second.end(), [&serverId](const Team& team) {
return team.hasServer(serverId);
});
}
return std::any_of(
teams.first.begin(), teams.first.end(), [&serverId](const Team& team) { return team.hasServer(serverId); });
}
bool MockGlobalState::serverIsDestForShard(const UID& serverId, KeyRangeRef shard) {
if (!allServers.count(serverId))
return false;
// check serverKeys
auto& mss = allServers.at(serverId);
if (!mss.allShardStatusEqual(shard, MockShardStatus::INFLIGHT)) {
return false;
}
// check keyServers
auto teams = shardMapping->getTeamsForFirstShard(shard);
return !teams.second.empty() && std::any_of(teams.first.begin(), teams.first.end(), [&serverId](const Team& team) {
return team.hasServer(serverId);
});
}
// True iff `serverId` is a known server and no shards are mapped to it anymore.
bool MockGlobalState::allShardsRemovedFromServer(const UID& serverId) {
	return allServers.count(serverId) && shardMapping->getNumberOfShards(serverId) == 0;
}
// Thin wrapper forwarding to the ACTOR implementation in MockGlobalStateImpl.
Future<std::pair<Optional<StorageMetrics>, int>> MockGlobalState::waitStorageMetrics(
    const KeyRange& keys,
    const StorageMetrics& min,
    const StorageMetrics& max,
    const StorageMetrics& permittedError,
    int shardLimit,
    int expectedShardCount) {
	return MockGlobalStateImpl::waitStorageMetrics(
	    this, keys, min, max, permittedError, shardLimit, expectedShardCount);
}
// Wrap each storage server interface in a ReferencedInterface and bundle them
// into a single LocationInfo.
Reference<LocationInfo> buildLocationInfo(const std::vector<StorageServerInterface>& interfaces) {
	std::vector<Reference<ReferencedInterface<StorageServerInterface>>> refs;
	refs.reserve(interfaces.size());
	for (const StorageServerInterface& ssi : interfaces) {
		refs.push_back(makeReference<ReferencedInterface<StorageServerInterface>>(ssi));
	}
	return makeReference<LocationInfo>(refs);
}
// Resolves the location of the single shard containing `key`. The mock
// resolves locations synchronously, so the returned Future is already ready.
Future<KeyRangeLocationInfo> MockGlobalState::getKeyLocation(TenantInfo tenant,
                                                             Key key,
                                                             SpanContext spanContext,
                                                             Optional<UID> debugID,
                                                             UseProvisionalProxies useProvisionalProxies,
                                                             Reverse isBackward,
                                                             Version version) {
	if (isBackward) {
		// DD never ask for backward range.
		UNREACHABLE();
	}
	ASSERT(key < allKeys.end);
	GetKeyServerLocationsReply rep;
	KeyRange single = singleKeyRange(key);
	auto teamPair = shardMapping->getTeamsForFirstShard(single);
	// Prefer the second team list when non-empty, otherwise fall back to the first.
	auto& srcTeam = teamPair.second.empty() ? teamPair.first : teamPair.second;
	ASSERT_EQ(srcTeam.size(), 1);
	rep.results.emplace_back(single, extractStorageServerInterfaces(srcTeam.front().servers));
	return KeyRangeLocationInfo(
	    rep.tenantEntry,
	    KeyRange(toPrefixRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena),
	    buildLocationInfo(rep.results[0].second));
}
// Resolves locations for up to `limit` shards intersecting `keys`. The mock
// resolves locations synchronously, so the returned Future is already ready.
Future<std::vector<KeyRangeLocationInfo>> MockGlobalState::getKeyRangeLocations(
    TenantInfo tenant,
    KeyRange keys,
    int limit,
    Reverse reverse,
    SpanContext spanContext,
    Optional<UID> debugID,
    UseProvisionalProxies useProvisionalProxies,
    Version version) {
	if (reverse) {
		// DD never ask for backward range.
		ASSERT(false);
	}
	ASSERT(keys.begin < keys.end);
	GetKeyServerLocationsReply rep;
	auto ranges = shardMapping->intersectingRanges(keys);
	auto it = ranges.begin();
	for (int count = 0; it != ranges.end() && count < limit; ++it, ++count) {
		// Prefer the second team list when non-empty, otherwise fall back to the first.
		auto teamPair = shardMapping->getTeamsFor(it->begin());
		auto& srcTeam = teamPair.second.empty() ? teamPair.first : teamPair.second;
		ASSERT_EQ(srcTeam.size(), 1);
		rep.results.emplace_back(it->range(), extractStorageServerInterfaces(srcTeam.front().servers));
	}
	CODE_PROBE(it != ranges.end(), "getKeyRangeLocations is limited", probe::decoration::rare);
	// Clip every returned range to the queried span.
	std::vector<KeyRangeLocationInfo> results;
	for (int shard = 0; shard < rep.results.size(); shard++) {
		results.emplace_back(rep.tenantEntry,
		                     (toPrefixRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys),
		                     buildLocationInfo(rep.results[shard].second));
	}
	return results;
}
std::vector<StorageServerInterface> MockGlobalState::extractStorageServerInterfaces(const std::vector<UID>& ids) const {
std::vector<StorageServerInterface> interfaces;
for (auto& id : ids) {
interfaces.emplace_back(allServers.at(id).ssi);
}
return interfaces;
}
// Thin wrapper forwarding to the ACTOR implementation in MockGlobalStateImpl.
Future<Standalone<VectorRef<KeyRef>>> MockGlobalState::splitStorageMetrics(const KeyRange& keys,
                                                                           const StorageMetrics& limit,
                                                                           const StorageMetrics& estimated,
                                                                           const Optional<int>& minSplitBytes) {
	return MockGlobalStateImpl::splitStorageMetrics(this, keys, limit, estimated, minSplitBytes);
}
// Verifies that initializing an empty database (replication factor 3) makes
// every configured server a source for the whole keyspace with zero bytes stored.
TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") {
	BasicTestConfig testConfig;
	testConfig.simpleConfig = true;
	testConfig.minimumReplication = 3;
	testConfig.logAntiQuorum = 0;
	DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
	TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
	auto mgs = std::make_shared<MockGlobalState>();
	mgs->initializeAsEmptyDatabaseMGS(dbConfig);
	for (int i = 1; i <= dbConfig.storageTeamSize; ++i) {
		auto id = MockGlobalState::indexToUID(i);
		std::cout << "Check server " << i << "\n";
		ASSERT(mgs->serverIsSourceForShard(id, allKeys));
		ASSERT(mgs->allServers.at(id).sumRangeSize(allKeys) == 0);
	}
	return Void();
}
// Test helper exercising MockStorageServer splitting and MockGlobalState
// location queries from the unit tests below.
struct MockGlobalStateTester {

	// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, x2), [x2, r0.end)
	void testThreeWaySplitFirstRange(MockStorageServer& mss) {
		auto it = mss.serverKeys.ranges().begin();
		uint64_t oldSize =
		    deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
		MockShardStatus oldStatus = it.cvalue().status;
		it->value().shardSize = oldSize;
		KeyRangeRef outerRange = it->range();
		Key x1 = keyAfter(it->range().begin);
		Key x2 = keyAfter(x1);
		std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
		mss.threeWayShardSplitting(outerRange, KeyRangeRef(x1, x2), oldSize, false);
		// Verify the three resulting pieces; the middle one keeps the old status.
		auto ranges = mss.serverKeys.containedRanges(outerRange);
		ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
		ranges.pop_front();
		ASSERT(ranges.begin().range() == KeyRangeRef(x1, x2));
		ASSERT(ranges.begin().cvalue().status == oldStatus);
		ranges.pop_front();
		ASSERT(ranges.begin().range() == KeyRangeRef(x2, outerRange.end));
		ranges.pop_front();
		ASSERT(ranges.empty());
	}

	// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, r0.end)
	void testTwoWaySplitFirstRange(MockStorageServer& mss) {
		auto it = mss.serverKeys.nthRange(0);
		MockShardStatus oldStatus = it.cvalue().status;
		uint64_t oldSize =
		    deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
		it->value().shardSize = oldSize;
		KeyRangeRef outerRange = it->range();
		Key x1 = keyAfter(it->range().begin);
		std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
		mss.twoWayShardSplitting(it->range(), x1, oldSize, false);
		// Verify the two resulting pieces; the right one keeps the old status.
		auto ranges = mss.serverKeys.containedRanges(outerRange);
		ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
		ranges.pop_front();
		ASSERT(ranges.begin().range() == KeyRangeRef(x1, outerRange.end));
		ASSERT(ranges.begin().cvalue().status == oldStatus);
		ranges.pop_front();
		ASSERT(ranges.empty());
	}

	// Synchronously resolve the location of a single key.
	KeyRangeLocationInfo getKeyLocationInfo(KeyRef key, std::shared_ptr<MockGlobalState> mgs) {
		return mgs
		    ->getKeyLocation(
		        TenantInfo(), key, SpanContext(), Optional<UID>(), UseProvisionalProxies::False, Reverse::False, 0)
		    .get();
	}

	// Synchronously resolve the locations of up to `limit` shards in `keys`.
	std::vector<KeyRangeLocationInfo> getKeyRangeLocations(KeyRangeRef keys,
	                                                       int limit,
	                                                       std::shared_ptr<MockGlobalState> mgs) {
		return mgs
		    ->getKeyRangeLocations(TenantInfo(),
		                           keys,
		                           limit,
		                           Reverse::False,
		                           SpanContext(),
		                           Optional<UID>(),
		                           UseProvisionalProxies::False,
		                           0)
		    .get();
	}
};
// Exercises the two-way and three-way shard-splitting helpers on a mock server.
TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") {
	BasicTestConfig testConfig;
	testConfig.simpleConfig = true;
	testConfig.minimumReplication = 1;
	testConfig.logAntiQuorum = 0;
	DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
	TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
	auto mgs = std::make_shared<MockGlobalState>();
	mgs->initializeAsEmptyDatabaseMGS(dbConfig);
	MockGlobalStateTester tester;
	auto& mss = mgs->allServers.at(MockGlobalState::indexToUID(1));
	std::cout << "Test 3-way splitting...\n";
	tester.testThreeWaySplitFirstRange(mss);
	std::cout << "Test 2-way splitting...\n";
	mss.serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 }); // reset to empty
	tester.testTwoWaySplitFirstRange(mss);
	return Void();
}
namespace {
// Returns true iff `loc` holds exactly the interfaces of the servers in `ids`
// (same cardinality and every id present).
inline bool locationInfoEqualsToTeam(Reference<LocationInfo> loc, const std::vector<UID>& ids) {
	if (loc->locations()->size() != ids.size()) {
		return false;
	}
	for (const auto& id : ids) {
		if (!loc->locations()->hasInterface(id)) {
			return false;
		}
	}
	return true;
}
} // namespace
// Unit test: verify MockGlobalState key/range location lookups return the teams assigned
// via the shard mapping, including the `limit` truncation behavior of range lookups.
TEST_CASE("/MockGlobalState/MockStorageServer/GetKeyLocations") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 1;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
// add one empty server
mgs->addStorageServer(StorageServerInterface(mgs->indexToUID(mgs->allServers.size() + 1)));
// define 3 ranges:
// team 1 (UID 1,2,...,n-1):[begin, 1.0), [2.0, end)
// team 2 (UID 2,3,...n-1, n): [1.0, 2.0)
ShardsAffectedByTeamFailure::Team team1, team2;
for (int i = 0; i < mgs->allServers.size() - 1; ++i) {
UID id = mgs->indexToUID(i + 1);
team1.servers.emplace_back(id);
id = mgs->indexToUID(i + 2);
team2.servers.emplace_back(id);
}
Key one = doubleToTestKey(1.0), two = doubleToTestKey(2.0);
std::vector<KeyRangeRef> ranges{ KeyRangeRef(allKeys.begin, one),
KeyRangeRef(one, two),
KeyRangeRef(two, allKeys.end) };
mgs->shardMapping->assignRangeToTeams(ranges[0], { team1 });
mgs->shardMapping->assignRangeToTeams(ranges[1], { team2 });
mgs->shardMapping->assignRangeToTeams(ranges[2], { team1 });
// query key location
MockGlobalStateTester tester;
// -- team 1
Key testKey = doubleToTestKey(0.5);
auto locInfo = tester.getKeyLocationInfo(testKey, mgs);
ASSERT(locationInfoEqualsToTeam(locInfo.locations, team1.servers));
// -- team 2
testKey = doubleToTestKey(1.3);
locInfo = tester.getKeyLocationInfo(testKey, mgs);
ASSERT(locationInfoEqualsToTeam(locInfo.locations, team2.servers));
// query range location
testKey = doubleToTestKey(3.0);
// team 1,2,1
// With a large limit all three shards are returned; the final shard is clipped to testKey.
auto locInfos = tester.getKeyRangeLocations(KeyRangeRef(allKeys.begin, testKey), 100, mgs);
ASSERT(locInfos.size() == 3);
ASSERT(locInfos[0].range == ranges[0]);
ASSERT(locationInfoEqualsToTeam(locInfos[0].locations, team1.servers));
ASSERT(locInfos[1].range == ranges[1]);
ASSERT(locationInfoEqualsToTeam(locInfos[1].locations, team2.servers));
ASSERT(locInfos[2].range == KeyRangeRef(ranges[2].begin, testKey));
ASSERT(locationInfoEqualsToTeam(locInfos[2].locations, team1.servers));
// team 1,2
// With limit = 2 only the first two shards are returned.
locInfos = tester.getKeyRangeLocations(KeyRangeRef(allKeys.begin, testKey), 2, mgs);
ASSERT(locInfos.size() == 2);
ASSERT(locInfos[0].range == ranges[0]);
ASSERT(locationInfoEqualsToTeam(locInfos[0].locations, team1.servers));
ASSERT(locInfos[1].range == ranges[1]);
ASSERT(locationInfoEqualsToTeam(locInfos[1].locations, team2.servers));
return Void();
}
// Unit test: run the mock storage servers as actors and verify waitStorageMetrics reports
// the byte-sample data seeded into each server.
TEST_CASE("/MockGlobalState/MockStorageServer/WaitStorageMetricsRequest") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 1;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("WaitStorageMetricsRequestUnitTestConfig").detail("Config", dbConfig.toString());
state std::shared_ptr<MockGlobalState> mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
state ActorCollection actors;
ActorCollection* ptr = &actors; // get around ACTOR syntax restriction
// Start each mock server's actor, mark it healthy, and seed 500000 sampled bytes.
std::for_each(mgs->allServers.begin(), mgs->allServers.end(), [ptr](auto& server) {
ptr->add(server.second.run());
IFailureMonitor::failureMonitor().setStatus(server.second.ssi.address(), FailureStatus(false));
server.second.metrics.byteSample.sample.insert("something"_sr, 500000);
});
KeyRange testRange = allKeys;
ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack();
std::pair<Optional<StorageMetrics>, int> res =
wait(mgs->waitStorageMetrics(testRange, bounds.min, bounds.max, bounds.permittedError, 1, 1));
// std::cout << "get result " << res.second << "\n";
// std::cout << "get byte "<< res.first.get().bytes << "\n";
ASSERT_EQ(res.second, -1); // the valid result always return -1, strange contraction though.
ASSERT_EQ(res.first.get().bytes, 500000);
return Void();
}

View File

@ -1,281 +0,0 @@
/*
* MockGlobalState.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/MockGlobalState.h"
// Returns true iff every shard on this server that intersects `range` has status `status`.
bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) {
	auto overlapping = serverKeys.intersectingRanges(range);
	ASSERT(!overlapping.empty()); // at least the range is allKeys
	bool allEqual = true;
	for (auto shard = overlapping.begin(); shard != overlapping.end(); ++shard) {
		if (shard->cvalue().status != status) {
			allEqual = false;
			break;
		}
	}
	return allEqual;
}
// Set `status` on the portion of this server's shard map covered exactly by `range`.
// Shards straddling a boundary of `range` are first split implicitly (3-way when one shard
// fully contains `range`, otherwise 2-way at each misaligned edge) so the assignment is
// exact. `restrictSize` is forwarded to the splitting helpers to bound the randomly chosen
// sub-shard sizes. Invalid status transitions are reported via a SevError trace event.
void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status, bool restrictSize) {
	auto ranges = serverKeys.intersectingRanges(range);
	ASSERT(!ranges.empty());
	if (ranges.begin().range().contains(range)) {
		CODE_PROBE(true, "Implicitly split single shard to 3 pieces");
		threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize);
		return;
	}
	if (ranges.begin().begin() < range.begin) {
		CODE_PROBE(true, "Implicitly split begin range to 2 pieces");
		twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize);
	}
	if (ranges.end().end() > range.end) {
		CODE_PROBE(true, "Implicitly split end range to 2 pieces");
		twoWayShardSplitting(ranges.end().range(), range.end, ranges.end().cvalue().shardSize, restrictSize);
	}
	ranges = serverKeys.containedRanges(range);
	// now the boundary must be aligned
	ASSERT(ranges.begin().begin() == range.begin);
	ASSERT(ranges.end().end() == range.end);
	// All contained pieces are merged back into a single logical size before the status change.
	uint64_t newSize = 0;
	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
		newSize += it->cvalue().shardSize;
	}
	for (auto it = ranges.begin(); it != ranges.end(); ++it) {
		auto oldStatus = it.value().status;
		if (isStatusTransitionValid(oldStatus, status)) {
			it.value() = ShardInfo{ status, newSize };
		} else if (oldStatus == MockShardStatus::COMPLETED && status == MockShardStatus::INFLIGHT) {
			CODE_PROBE(true, "Shard already on server");
		} else {
			TraceEvent(SevError, "MockShardStatusTransitionError")
			    .detail("From", oldStatus)
			    .detail("To", status)
			    .detail("ID", id)
			    .detail("KeyBegin", range.begin.toHexString())
			    .detail("KeyEnd", range.end.toHexString()); // bug fix: previously logged range.begin twice
		}
	}
	serverKeys.coalesce(range);
}
// split the out range [a, d) based on the inner range's boundary [b, c). The result would be [a,b), [b,c), [c,d). The
// size of the new shards are randomly split from old size of [a, d)
// When restrictSize is true the three sizes sum exactly to outerRangeSize; otherwise each is
// an independent draw in [MIN_SHARD_BYTES, MAX_SHARD_BYTES).
void MockStorageServer::threeWayShardSplitting(KeyRangeRef outerRange,
KeyRangeRef innerRange,
uint64_t outerRangeSize,
bool restrictSize) {
ASSERT(outerRange.contains(innerRange));
Key left = outerRange.begin;
// random generate 3 shard sizes, the caller guarantee that the min, max parameters are always valid.
// Left piece: when restricted, leave at least MIN_SHARD_BYTES for each of the other two pieces.
int leftSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - 2 * SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
// Middle piece: when restricted, leave at least MIN_SHARD_BYTES for the right piece.
int midSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - leftSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
// Right piece: the remainder when restricted, otherwise an independent random size.
int rightSize =
restrictSize ? outerRangeSize - leftSize - midSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
// Inserting innerRange splits the outer shard; the new middle inherits the left shard's status.
serverKeys.insert(innerRange, { serverKeys[left].status, (uint64_t)midSize });
serverKeys[left].shardSize = leftSize;
serverKeys[innerRange.end].shardSize = rightSize;
}
// split the range [a,c) with split point b. The result would be [a, b), [b, c). The
// size of the new shards are randomly split from old size of [a, c)
// When restrictSize is true the two sizes sum exactly to rangeSize.
void MockStorageServer::twoWayShardSplitting(KeyRangeRef range,
KeyRef splitPoint,
uint64_t rangeSize,
bool restrictSize) {
Key left = range.begin;
// randomly generate 2 shard sizes, the caller guarantee that the min, max parameters are always valid.
int leftSize = deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? rangeSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1
: SERVER_KNOBS->MAX_SHARD_BYTES);
// Right piece: the remainder when restricted, otherwise an independent random size.
int rightSize =
restrictSize ? rangeSize - leftSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
// rawInsert at splitPoint cuts the shard in two; the right half inherits the left's status.
serverKeys.rawInsert(splitPoint, { serverKeys[left].status, (uint64_t)rightSize });
serverKeys[left].shardSize = leftSize;
}
// Erase `range` from this server's shard map; `range` must exactly match an existing
// shard boundary (asserted), so no implicit splitting happens here.
void MockStorageServer::removeShard(KeyRangeRef range) {
	auto contained = serverKeys.containedRanges(range);
	ASSERT(contained.begin().range() == range);
	serverKeys.rawErase(range);
}
// Total byte size of all shards on this server that intersect `range`.
uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const {
	uint64_t total = 0;
	auto overlapping = serverKeys.intersectingRanges(range);
	for (auto shard = overlapping.begin(); shard != overlapping.end(); ++shard) {
		total += shard->cvalue().shardSize;
	}
	return total;
}
void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) {
ASSERT(conf.storageTeamSize > 0);
configuration = conf;
std::vector<UID> serverIds;
for (int i = 1; i <= conf.storageTeamSize; ++i) {
UID id = indexToUID(i);
serverIds.push_back(id);
allServers[id] = MockStorageServer(id, defaultDiskSpace);
allServers[id].serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 });
}
shardMapping->assignRangeToTeams(allKeys, { Team(serverIds, true) });
}
// Register an additional mock storage server, keyed by its interface id.
void MockGlobalState::addStorageServer(StorageServerInterface server, uint64_t diskSpace) {
	const UID serverId = server.id();
	allServers[serverId] = MockStorageServer(server, diskSpace);
}
// Returns true iff `serverId` serves `shard` as a source: the server's local shard map must
// mark the whole range COMPLETED, and the shard mapping must list the server in one of the
// destination teams when `inFlightShard` is set, or one of the source teams otherwise.
bool MockGlobalState::serverIsSourceForShard(const UID& serverId, KeyRangeRef shard, bool inFlightShard) {
	auto serverIt = allServers.find(serverId);
	if (serverIt == allServers.end())
		return false;

	// check serverKeys
	if (!serverIt->second.allShardStatusEqual(shard, MockShardStatus::COMPLETED))
		return false;

	// check keyServers
	auto [srcTeams, destTeams] = shardMapping->getTeamsFor(shard);
	const auto& teams = inFlightShard ? destTeams : srcTeams;
	for (const auto& team : teams) {
		if (team.hasServer(serverId))
			return true;
	}
	return false;
}
// Returns true iff `serverId` is a destination for `shard`: the server's local shard map
// must mark the whole range INFLIGHT and a data move must be in progress (non-empty dest
// teams). NOTE(review): like the original, this then checks membership against the SOURCE
// teams (teams.first) — confirm that is the intended semantics.
bool MockGlobalState::serverIsDestForShard(const UID& serverId, KeyRangeRef shard) {
	auto serverIt = allServers.find(serverId);
	if (serverIt == allServers.end())
		return false;

	// check serverKeys
	if (!serverIt->second.allShardStatusEqual(shard, MockShardStatus::INFLIGHT))
		return false;

	// check keyServers
	auto [srcTeams, destTeams] = shardMapping->getTeamsFor(shard);
	if (destTeams.empty())
		return false;
	for (const auto& team : srcTeams) {
		if (team.hasServer(serverId))
			return true;
	}
	return false;
}
// Returns true iff `serverId` is a known server and the shard mapping assigns it no shards.
bool MockGlobalState::allShardRemovedFromServer(const UID& serverId) {
	if (!allServers.count(serverId))
		return false;
	return shardMapping->getNumberOfShards(serverId) == 0;
}
// Unit test: with triple replication, an empty MockGlobalState should have every one of the
// three servers as a source for allKeys, each holding zero bytes.
TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 3;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
// Servers are created with UID indices 1..storageTeamSize.
for (int i = 1; i <= dbConfig.storageTeamSize; ++i) {
auto id = MockGlobalState::indexToUID(i);
std::cout << "Check server " << i << "\n";
ASSERT(mgs->serverIsSourceForShard(id, allKeys));
ASSERT(mgs->allServers.at(id).sumRangeSize(allKeys) == 0);
}
return Void();
}
// Helper used by the splitting unit tests: drives the implicit shard-splitting functions on
// a MockStorageServer and verifies the resulting shard-map boundaries and statuses.
struct MockGlobalStateTester {
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, x2), [x2, r0.end)
void testThreeWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.ranges().begin();
// Seed the first shard with a random size large enough to split without restriction.
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
MockShardStatus oldStatus = it.cvalue().status;
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
// x1 and x2 are the smallest keys after the range begin, forming the inner range [x1, x2).
Key x1 = keyAfter(it->range().begin);
Key x2 = keyAfter(x1);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.threeWayShardSplitting(outerRange, KeyRangeRef(x1, x2), oldSize, false);
// The outer range must now consist of exactly three shards; the middle keeps the old status.
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, x2));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x2, outerRange.end));
ranges.pop_front();
ASSERT(ranges.empty());
}
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, r0.end)
void testTwoWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.nthRange(0);
MockShardStatus oldStatus = it.cvalue().status;
// Seed the first shard with a random size large enough to split without restriction.
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
Key x1 = keyAfter(it->range().begin);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.twoWayShardSplitting(it->range(), x1, oldSize, false);
// The outer range must now consist of exactly two shards; the right one keeps the old status.
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, outerRange.end));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.empty());
}
};
// Unit test: exercise MockStorageServer's implicit 3-way and 2-way shard splitting on a
// single-server, single-replica mock database.
TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 1;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
MockGlobalStateTester tester;
// With minimumReplication = 1 there is exactly one storage server, UID index 1.
auto& mss = mgs->allServers.at(MockGlobalState::indexToUID(1));
std::cout << "Test 3-way splitting...\n";
tester.testThreeWaySplitFirstRange(mss);
std::cout << "Test 2-way splitting...\n";
// The 3-way test fragmented the key space; restore a single allKeys shard before the 2-way test.
mss.serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 }); // reset to empty
tester.testTwoWaySplitFirstRange(mss);
return Void();
}

View File

@ -1287,7 +1287,7 @@ ACTOR static Future<Void> startMoveShards(Database occ,
TraceEvent(SevVerbose, "StartMoveShardsFoundDataMove", relocationIntervalId)
.detail("DataMoveID", dataMoveId)
.detail("DataMove", dataMove.toString());
ASSERT(dataMove.range.begin == keys.begin);
ASSERT(!dataMove.ranges.empty() && dataMove.ranges.front().begin == keys.begin);
if (dataMove.getPhase() == DataMoveMetaData::Deleting) {
TraceEvent(SevVerbose, "StartMoveShardsDataMove", relocationIntervalId)
.detail("DataMoveBeingDeleted", dataMoveId);
@ -1296,10 +1296,10 @@ ACTOR static Future<Void> startMoveShards(Database occ,
if (dataMove.getPhase() == DataMoveMetaData::Running) {
TraceEvent(SevVerbose, "StartMoveShardsDataMove", relocationIntervalId)
.detail("DataMoveAlreadyCommitted", dataMoveId);
ASSERT(keys == dataMove.range);
ASSERT(keys == dataMove.ranges.front());
return Void();
}
begin = dataMove.range.end;
begin = dataMove.ranges.front().end;
} else {
dataMove.id = dataMoveId;
TraceEvent(SevVerbose, "StartMoveKeysNewDataMove", relocationIntervalId)
@ -1441,7 +1441,8 @@ ACTOR static Future<Void> startMoveShards(Database occ,
&tr, serverKeysPrefixFor(servers[i]), currentKeys, allKeys, serverKeysValue(dataMoveId)));
}
dataMove.range = KeyRangeRef(keys.begin, currentKeys.end);
dataMove.ranges.clear();
dataMove.ranges.push_back(KeyRangeRef(keys.begin, currentKeys.end));
dataMove.dest.insert(servers.begin(), servers.end());
}
@ -1471,7 +1472,7 @@ ACTOR static Future<Void> startMoveShards(Database occ,
.detail("DataMoveKey", dataMoveKeyFor(dataMoveId))
.detail("CommitVersion", tr.getCommittedVersion())
.detail("DeltaRange", currentKeys.toString())
.detail("Range", dataMove.range.toString())
.detail("Range", describe(dataMove.ranges))
.detail("DataMove", dataMove.toString());
dataMove = DataMoveMetaData();
@ -1628,7 +1629,8 @@ ACTOR static Future<Void> finishMoveShards(Database occ,
throw data_move_cancelled();
}
ASSERT(dataMove.getPhase() == DataMoveMetaData::Running);
range = dataMove.range;
ASSERT(!dataMove.ranges.empty());
range = dataMove.ranges.front();
} else {
TraceEvent(SevWarn, "FinishMoveShardsDataMoveDeleted", relocationIntervalId)
.detail("DataMoveID", dataMoveId);
@ -1766,7 +1768,7 @@ ACTOR static Future<Void> finishMoveShards(Database occ,
wait(waitForAll(actors));
if (range.end == dataMove.range.end) {
if (range.end == dataMove.ranges.front().end) {
tr.clear(dataMoveKeyFor(dataMoveId));
complete = true;
TraceEvent(SevVerbose, "FinishMoveShardsDeleteMetaData", dataMoveId)
@ -1776,7 +1778,7 @@ ACTOR static Future<Void> finishMoveShards(Database occ,
.detail("DataMoveID", dataMoveId)
.detail("CurrentRange", range)
.detail("NewDataMoveMetaData", dataMove.toString());
dataMove.range = KeyRangeRef(range.end, dataMove.range.end);
dataMove.ranges.front() = KeyRangeRef(range.end, dataMove.ranges.front().end);
tr.set(dataMoveKeyFor(dataMoveId), dataMoveValue(dataMove));
}
@ -2229,9 +2231,10 @@ ACTOR Future<Void> removeKeysFromFailedServer(Database cx,
Optional<Value> val = wait(tr.get(dataMoveKeyFor(destId)));
if (val.present()) {
state DataMoveMetaData dataMove = decodeDataMoveValue(val.get());
ASSERT(!dataMove.ranges.empty());
TraceEvent(SevVerbose, "RemoveRangeFoundDataMove", serverID)
.detail("DataMoveMetaData", dataMove.toString());
if (range == dataMove.range) {
if (range == dataMove.ranges.front()) {
tr.clear(dataMoveKeyFor(destId));
} else {
dataMove.setPhase(DataMoveMetaData::Deleting);
@ -2350,10 +2353,11 @@ ACTOR Future<Void> cleanUpDataMove(Database occ,
Optional<Value> val = wait(tr.get(dataMoveKeyFor(dataMoveId)));
if (val.present()) {
dataMove = decodeDataMoveValue(val.get());
ASSERT(!dataMove.ranges.empty());
TraceEvent(SevVerbose, "CleanUpDataMoveMetaData", dataMoveId)
.detail("DataMoveID", dataMoveId)
.detail("DataMoveMetaData", dataMove.toString());
range = dataMove.range;
range = dataMove.ranges.front();
ASSERT(!range.empty());
} else {
TraceEvent(SevDebug, "CleanUpDataMoveNotExist", dataMoveId).detail("DataMoveID", dataMoveId);
@ -2419,14 +2423,14 @@ ACTOR Future<Void> cleanUpDataMove(Database occ,
currentShards[i + 1].value);
}
if (range.end == dataMove.range.end) {
if (range.end == dataMove.ranges.front().end) {
tr.clear(dataMoveKeyFor(dataMoveId));
complete = true;
TraceEvent(SevVerbose, "CleanUpDataMoveDeleteMetaData", dataMoveId)
.detail("DataMoveID", dataMove.toString());
} else {
dataMove.range = KeyRangeRef(range.end, dataMove.range.end);
dataMove.ranges.front() = KeyRangeRef(range.end, dataMove.ranges.front().end);
dataMove.setPhase(DataMoveMetaData::Deleting);
tr.set(dataMoveKeyFor(dataMoveId), dataMoveValue(dataMove));
TraceEvent(SevVerbose, "CleanUpDataMovePartial", dataMoveId)

View File

@ -447,10 +447,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
"Restored");
addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
version.initMetric("TLog.Version"_sr, cc.id);
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
version.initMetric("TLog.Version"_sr, cc.getId());
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "SharedBytesInput", [tLogData]() { return tLogData->bytesInput; });
@ -1399,26 +1399,26 @@ ACTOR Future<Void> tLogCore(TLogData* self, Reference<LogData> logData) {
logData->addActor.send(waitFailureServer(logData->tli.waitFailure.getFuture()));
logData->addActor.send(logData->removed);
// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
logData->addActor.send(traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
StorageBytes sbQueue =
self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, logData->tli, logData, warningCollectorInput));

View File

@ -533,10 +533,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
context);
addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
version.initMetric("TLog.Version"_sr, cc.id);
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
version.initMetric("TLog.Version"_sr, cc.getId());
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
@ -2212,26 +2212,26 @@ ACTOR Future<Void> tLogCore(TLogData* self,
logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture()));
logData->addActor.send(logData->removed);
// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
logData->addActor.send(traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
StorageBytes sbQueue =
self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));

View File

@ -616,10 +616,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
context);
addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
version.initMetric("TLog.Version"_sr, cc.id);
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
version.initMetric("TLog.Version"_sr, cc.getId());
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
@ -2671,26 +2671,26 @@ ACTOR Future<Void> tLogCore(TLogData* self,
logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture()));
logData->addActor.send(logData->removed);
// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
logData->addActor.send(traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
StorageBytes sbQueue =
self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));

View File

@ -35,7 +35,7 @@
#include "fdbserver/ResolverInterface.h"
#include "fdbserver/RestoreUtil.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/ActorCollection.h"
@ -188,7 +188,7 @@ struct Resolver : ReferenceCounted<Resolver> {
specialCounter(cc, "NeededVersion", [this]() { return this->neededVersion.get(); });
specialCounter(cc, "TotalStateBytes", [this]() { return this->totalStateBytes.get(); });
logger = traceCounters("ResolverMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ResolverMetrics");
logger = cc.traceCounters("ResolverMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ResolverMetrics");
}
~Resolver() { destroyConflictSet(conflictSet); }
};

View File

@ -30,7 +30,7 @@
#include "fdbserver/RestoreLoader.actor.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
#include "fdbserver/MutationTracking.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.

View File

@ -753,6 +753,8 @@ ACTOR Future<CheckpointMetaData> fetchRocksDBCheckpoint(Database cx,
.detail("InitialState", initialState.toString())
.detail("CheckpointDir", dir);
ASSERT(!initialState.ranges.empty());
state std::shared_ptr<CheckpointMetaData> metaData = std::make_shared<CheckpointMetaData>(initialState);
if (metaData->format == RocksDBColumnFamily) {
@ -771,7 +773,7 @@ ACTOR Future<CheckpointMetaData> fetchRocksDBCheckpoint(Database cx,
} else if (metaData->format == RocksDB) {
std::shared_ptr<rocksdb::SstFileWriter> writer =
std::make_shared<rocksdb::SstFileWriter>(rocksdb::EnvOptions(), rocksdb::Options());
wait(fetchCheckpointRange(cx, metaData, metaData->range, dir, writer, cFun));
wait(fetchCheckpointRange(cx, metaData, metaData->ranges.front(), dir, writer, cFun));
}
return *metaData;

View File

@ -40,10 +40,16 @@ int ShardsAffectedByTeamFailure::getNumberOfShards(UID ssID) const {
}
std::pair<std::vector<ShardsAffectedByTeamFailure::Team>, std::vector<ShardsAffectedByTeamFailure::Team>>
ShardsAffectedByTeamFailure::getTeamsFor(KeyRangeRef keys) {
ShardsAffectedByTeamFailure::getTeamsForFirstShard(KeyRangeRef keys) {
return shard_teams[keys.begin];
}
std::pair<std::vector<ShardsAffectedByTeamFailure::Team>, std::vector<ShardsAffectedByTeamFailure::Team>>
ShardsAffectedByTeamFailure::getTeamsFor(KeyRef key) {
return shard_teams[key];
}
void ShardsAffectedByTeamFailure::erase(Team team, KeyRange const& range) {
DisabledTraceEvent(SevDebug, "ShardsAffectedByTeamFailureErase")
.detail("Range", range)
@ -236,3 +242,7 @@ void ShardsAffectedByTeamFailure::removeFailedServerForRange(KeyRangeRef keys, c
}
check();
}
auto ShardsAffectedByTeamFailure::intersectingRanges(KeyRangeRef keyRange) const -> decltype(shard_teams)::ConstRanges {
return shard_teams.intersectingRanges(keyRange);
}

View File

@ -192,61 +192,6 @@ ACTOR Future<Void> ekLookupByDomainIds(Reference<SimKmsConnectorContext> ctx,
success ? req.reply.send(rep) : req.reply.sendError(encrypt_key_not_found());
return Void();
}
// TODO: switch this to use bg_url instead of hardcoding file://fdbblob, so it works as FDBPerfKmsConnector
// FIXME: make this (more) deterministic outside of simulation for FDBPerfKmsConnector
static Standalone<BlobMetadataDetailsRef> createBlobMetadata(BlobMetadataDomainId domainId,
BlobMetadataDomainName domainName) {
Standalone<BlobMetadataDetailsRef> metadata;
metadata.domainId = domainId;
metadata.arena().dependsOn(domainName.arena());
metadata.domainName = domainName;
// 0 == no partition, 1 == suffix partitioned, 2 == storage location partitioned
int type = deterministicRandom()->randomInt(0, 3);
int partitionCount = (type == 0) ? 0 : deterministicRandom()->randomInt(2, 12);
fmt::print("SimBlobMetadata ({})\n", domainId);
TraceEvent ev(SevDebug, "SimBlobMetadata");
ev.detail("DomainId", domainId).detail("TypeNum", type).detail("PartitionCount", partitionCount);
if (type == 0) {
// single storage location
metadata.base = StringRef(metadata.arena(), "file://fdbblob/" + std::to_string(domainId) + "/");
fmt::print(" {}\n", metadata.base.get().printable());
ev.detail("Base", metadata.base);
}
if (type == 1) {
// simulate hash prefixing in s3
metadata.base = StringRef(metadata.arena(), "file://fdbblob/"_sr);
ev.detail("Base", metadata.base);
fmt::print(" {} ({})\n", metadata.base.get().printable(), partitionCount);
for (int i = 0; i < partitionCount; i++) {
metadata.partitions.push_back_deep(metadata.arena(),
deterministicRandom()->randomUniqueID().shortString() + "-" +
std::to_string(domainId) + "/");
fmt::print(" {}\n", metadata.partitions.back().printable());
ev.detail("P" + std::to_string(i), metadata.partitions.back());
}
}
if (type == 2) {
// simulate separate storage location per partition
for (int i = 0; i < partitionCount; i++) {
metadata.partitions.push_back_deep(
metadata.arena(), "file://fdbblob" + std::to_string(domainId) + "_" + std::to_string(i) + "/");
fmt::print(" {}\n", metadata.partitions.back().printable());
ev.detail("P" + std::to_string(i), metadata.partitions.back());
}
}
// set random refresh + expire time
if (deterministicRandom()->coinflip()) {
metadata.refreshAt = now() + deterministicRandom()->random01() * SERVER_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
metadata.expireAt =
metadata.refreshAt + deterministicRandom()->random01() * SERVER_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
} else {
metadata.refreshAt = std::numeric_limits<double>::max();
metadata.expireAt = metadata.refreshAt;
}
return metadata;
}
ACTOR Future<Void> blobMetadataLookup(KmsConnectorInterface interf, KmsConnBlobMetadataReq req) {
state KmsConnBlobMetadataRep rep;
@ -261,7 +206,9 @@ ACTOR Future<Void> blobMetadataLookup(KmsConnectorInterface interf, KmsConnBlobM
if (it == simBlobMetadataStore.end()) {
// construct new blob metadata
it = simBlobMetadataStore
.insert({ domainInfo.domainId, createBlobMetadata(domainInfo.domainId, domainInfo.domainName) })
.insert({ domainInfo.domainId,
createRandomTestBlobMetadata(
SERVER_KNOBS->BG_URL, domainInfo.domainId, domainInfo.domainName) })
.first;
} else if (now() >= it->second.expireAt) {
// update random refresh and expire time

View File

@ -166,8 +166,8 @@ public:
successfulChangeRequest("SuccessfulChangeRequest", cc), failedChangeRequest("FailedChangeRequest", cc),
snapshotRequest("SnapshotRequest", cc) {
cfi = getConfigFollowerInterface(configSource);
logger = traceCounters(
"ConfigConsumerMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigConsumerMetrics");
logger = cc.traceCounters(
"ConfigConsumerMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigConsumerMetrics");
}
Future<Void> consume(ConfigBroadcaster& broadcaster) {

View File

@ -620,11 +620,13 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
std::string* coordFolder,
std::string baseFolder,
ClusterConnectionString connStr,
ClusterConnectionString otherConnStr,
bool useSeedFile,
AgentMode runBackupAgents,
std::string whitelistBinPaths,
ProtocolVersion protocolVersion,
ConfigDBType configDBType) {
ConfigDBType configDBType,
bool isDr) {
state ISimulator::ProcessInfo* simProcess = g_simulator->getCurrentProcess();
state UID randomId = nondeterministicRandom()->randomUniqueID();
state int cycles = 0;
@ -644,7 +646,8 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
.detail("Address", NetworkAddress(ip, port, true, false))
.detail("ZoneId", localities.zoneId())
.detail("WaitTime", waitTime)
.detail("Port", port);
.detail("Port", port)
.detail("IsDr", isDr);
wait(delay(waitTime));
@ -657,7 +660,8 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
processClass,
dataFolder->c_str(),
coordFolder->c_str(),
protocolVersion);
protocolVersion,
isDr);
wait(g_simulator->onProcess(
process,
TaskPriority::DefaultYield)); // Now switch execution to the process on which we will run
@ -724,6 +728,16 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
}
futures.push_back(success(onShutdown));
if (!g_simulator->globalHasSwitchedCluster() && g_simulator->hasSwitchedCluster(process->address)) {
// When switching machines between clusters, a simultaneous
// reboot followed by a reboot and switch can cause the
// reboot and switch to be ignored. Handle this case by
// sending the reboot and switch kill type when the process
// comes back online.
TraceEvent("RebootProcessAndSwitchLateReboot").detail("Address", process->address);
g_simulator->switchCluster(process->address);
process->shutdownSignal.send(ISimulator::KillType::RebootProcessAndSwitch);
}
wait(waitForAny(futures));
} catch (Error& e) {
// If in simulation, if we make it here with an error other than io_timeout but enASIOTimedOut is set
@ -830,6 +844,24 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
connRecord =
makeReference<ClusterConnectionFile>(joinPath(*dataFolder, "fdb.cluster"), connStr.toString());
}
} else if (onShutdown.get() == ISimulator::RebootProcessAndSwitch) {
TraceEvent("SimulatedFDBDRebootAndSwitch")
.detail("Cycles", cycles)
.detail("RandomId", randomId)
.detail("Address", process->address)
.detail("ZoneId", localities.zoneId())
.detail("KillType", shutdownResult)
.detail("ConnectionString", connStr.toString())
.detail("OtherConnectionString", otherConnStr.toString())
.detail("SwitchingTo", g_simulator->hasSwitchedCluster(process->address))
.detail("MachineId", process->machine->machineId);
// Handle the case where otherConnStr is '@'.
if (otherConnStr.toString().size() > 1) {
std::string newConnStr =
g_simulator->hasSwitchedCluster(process->address) ? otherConnStr.toString() : connStr.toString();
connRecord = makeReference<ClusterConnectionFile>(joinPath(*dataFolder, "fdb.cluster"), newConnStr);
}
} else {
TraceEvent("SimulatedFDBDJustRepeat")
.detail("Cycles", cycles)
@ -846,6 +878,7 @@ std::map<Optional<Standalone<StringRef>>, std::vector<std::vector<std::string>>>
// process count is no longer needed because it is now the length of the vector of ip's, because it was one ip per
// process
ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
ClusterConnectionString otherConnStr,
std::vector<IPAddress> ips,
bool sslEnabled,
LocalityData localities,
@ -857,7 +890,8 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
bool sslOnly,
std::string whitelistBinPaths,
ProtocolVersion protocolVersion,
ConfigDBType configDBType) {
ConfigDBType configDBType,
bool isDr) {
state int bootCount = 0;
state std::vector<std::string> myFolders;
state std::vector<std::string> coordFolders;
@ -924,11 +958,13 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
&coordFolders[i],
baseFolder,
connStr,
otherConnStr,
useSeedFile,
agentMode,
whitelistBinPaths,
protocolVersion,
configDBType));
configDBType,
isDr));
g_simulator->setDiffProtocol = true;
} else {
processes.push_back(simulatedFDBDRebooter(clusterFile,
@ -942,11 +978,13 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
&coordFolders[i],
baseFolder,
connStr,
otherConnStr,
useSeedFile,
agentMode,
whitelistBinPaths,
g_network->protocolVersion(),
configDBType));
configDBType,
isDr));
}
TraceEvent("SimulatedMachineProcess", randomId)
.detail("Address", NetworkAddress(ips[i], listenPort, true, false))
@ -1311,6 +1349,7 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
// SOMEDAY: parse backup agent from test file
systemActors->push_back(reportErrors(
simulatedMachine(conn,
ClusterConnectionString(),
ipAddrs,
usingSSL,
localities,
@ -1322,7 +1361,8 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
usingSSL && (listenersPerProcess == 1 || processClass == ProcessClass::TesterClass),
whitelistBinPaths,
protocolVersion,
configDBType),
configDBType,
false),
processClass == ProcessClass::TesterClass ? "SimulatedTesterMachine" : "SimulatedMachine"));
}
@ -2346,20 +2386,24 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
// check the sslEnablementMap using only one ip
LocalityData localities(Optional<Standalone<StringRef>>(), zoneId, machineId, dcUID);
localities.set("data_hall"_sr, dcUID);
systemActors->push_back(reportErrors(simulatedMachine(conn,
ips,
sslEnabled,
localities,
processClass,
baseFolder,
false,
machine == useSeedForMachine,
requiresExtraDBMachines ? AgentOnly : AgentAddition,
sslOnly,
whitelistBinPaths,
protocolVersion,
configDBType),
"SimulatedMachine"));
systemActors->push_back(reportErrors(
simulatedMachine(conn,
requiresExtraDBMachines ? ClusterConnectionString(g_simulator->extraDatabases.at(0))
: ClusterConnectionString(),
ips,
sslEnabled,
localities,
processClass,
baseFolder,
false,
machine == useSeedForMachine,
requiresExtraDBMachines ? AgentOnly : AgentAddition,
sslOnly,
whitelistBinPaths,
protocolVersion,
configDBType,
false),
"SimulatedMachine"));
if (requiresExtraDBMachines) {
int cluster = 4;
@ -2376,6 +2420,7 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
LocalityData localities(Optional<Standalone<StringRef>>(), newZoneId, newMachineId, dcUID);
localities.set("data_hall"_sr, dcUID);
systemActors->push_back(reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase),
conn,
extraIps,
sslEnabled,
localities,
@ -2387,7 +2432,8 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
sslOnly,
whitelistBinPaths,
protocolVersion,
configDBType),
configDBType,
true),
"SimulatedMachine"));
++cluster;
}
@ -2422,6 +2468,7 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
Optional<Standalone<StringRef>>(), newZoneId, newZoneId, Optional<Standalone<StringRef>>());
systemActors->push_back(
reportErrors(simulatedMachine(conn,
ClusterConnectionString(),
ips,
sslEnabled,
localities,
@ -2433,7 +2480,8 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
sslOnly,
whitelistBinPaths,
protocolVersion,
configDBType),
configDBType,
false),
"SimulatedTesterMachine"));
}
@ -2557,7 +2605,8 @@ ACTOR void setupAndRun(std::string dataFolder,
ProcessClass(ProcessClass::TesterClass, ProcessClass::CommandLineSource),
"",
"",
currentProtocolVersion());
currentProtocolVersion(),
false);
testSystem->excludeFromRestarts = true;
wait(g_simulator->onProcess(testSystem, TaskPriority::DefaultYield));
Sim2FileSystem::newFileSystem();

View File

@ -248,9 +248,9 @@ public:
lastTLogVersion(0), lastVersionWithData(0), peekVersion(0), compactionInProgress(Void()),
fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), debug_inApplyUpdate(false),
debug_lastValidateTime(0), versionLag(0), behind(false), counters(this) {
version.initMetric("StorageCacheData.Version"_sr, counters.cc.id);
desiredOldestVersion.initMetric("StorageCacheData.DesriedOldestVersion"_sr, counters.cc.id);
oldestVersion.initMetric("StorageCacheData.OldestVersion"_sr, counters.cc.id);
version.initMetric("StorageCacheData.Version"_sr, counters.cc.getId());
desiredOldestVersion.initMetric("StorageCacheData.DesriedOldestVersion"_sr, counters.cc.getId());
oldestVersion.initMetric("StorageCacheData.OldestVersion"_sr, counters.cc.getId());
newestAvailableVersion.insert(allKeys, invalidVersion);
newestDirtyVersion.insert(allKeys, invalidVersion);
@ -1188,7 +1188,7 @@ ACTOR Future<RangeResult> tryFetchRange(Database cx,
state RangeResult output;
state KeySelectorRef begin = firstGreaterOrEqual(keys.begin);
state KeySelectorRef end = firstGreaterOrEqual(keys.end);
state ReadOptions options = ReadOptions(Optional<UID>(), ReadType::FETCH);
state ReadOptions options = ReadOptions(ReadType::FETCH, CacheResult::False);
if (*isTooOld)
throw transaction_too_old();
@ -2224,11 +2224,10 @@ ACTOR Future<Void> storageCacheServer(StorageServerInterface ssi,
self.ck = cacheKeysPrefixFor(id).withPrefix(systemKeys.begin); // FFFF/02cacheKeys/[this server]/
actors.add(waitFailureServer(ssi.waitFailure.getFuture()));
actors.add(traceCounters("CacheMetrics",
self.thisServerID,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&self.counters.cc,
self.thisServerID.toString() + "/CacheMetrics"));
actors.add(self.counters.cc.traceCounters("CacheMetrics",
self.thisServerID,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
self.thisServerID.toString() + "/CacheMetrics"));
// fetch already cached ranges from the database and apply them before proceeding
wait(storageCacheStartUpWarmup(&self));

View File

@ -19,7 +19,7 @@
*/
#include "flow/UnitTest.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
int64_t StorageMetricSample::getEstimate(KeyRangeRef keys) const {

View File

@ -26,7 +26,6 @@
#include "fdbclient/RunTransaction.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/SpanContextMessage.h"
#include "fdbserver/TLogInterface.h"
@ -217,8 +216,6 @@ static const KeyRange persistTagMessagesKeys = prefixRange("TagMsg/"_sr);
static const KeyRange persistTagMessageRefsKeys = prefixRange("TagMsgRef/"_sr);
static const KeyRange persistTagPoppedKeys = prefixRange("TagPop/"_sr);
static const KeyRef persistClusterIdKey = "clusterId"_sr;
static Key persistTagMessagesKey(UID id, Tag tag, Version version) {
BinaryWriter wr(Unversioned());
wr.serializeBytes(persistTagMessagesKeys.begin);
@ -306,13 +303,6 @@ struct TLogData : NonCopyable {
Deque<UID> spillOrder;
std::map<UID, Reference<struct LogData>> id_data;
// The durable cluster ID identifies which cluster the tlogs persistent
// data is written from. This value is restored from disk when the tlog
// restarts.
UID durableClusterId;
// The cluster-controller cluster ID stores the cluster ID read from the txnStateStore.
// It is cached in this variable.
UID ccClusterId;
UID dbgid;
UID workerID;
@ -652,10 +642,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
context);
addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
version.initMetric("TLog.Version"_sr, cc.id);
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
version.initMetric("TLog.Version"_sr, cc.getId());
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
@ -2401,24 +2391,6 @@ ACTOR Future<Void> initPersistentState(TLogData* self, Reference<LogData> logDat
return Void();
}
ACTOR Future<UID> getClusterId(TLogData* self) {
state ReadYourWritesTransaction tr(self->cx);
loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
Optional<Value> clusterId = wait(tr.get(clusterIdKey));
if (clusterId.present()) {
return BinaryReader::fromStringRef<UID>(clusterId.get(), Unversioned());
} else {
return UID();
}
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// send stopped promise instead of LogData* to avoid reference cycles
ACTOR Future<Void> rejoinClusterController(TLogData* self,
TLogInterface tli,
@ -2441,26 +2413,14 @@ ACTOR Future<Void> rejoinClusterController(TLogData* self,
}
isDisplaced = isDisplaced && !inf.logSystemConfig.hasTLog(tli.id());
if (isDisplaced) {
state TraceEvent ev("TLogDisplaced", tli.id());
ev.detail("Reason", "DBInfoDoesNotContain")
TraceEvent("TLogDisplaced", tli.id())
.detail("Reason", "DBInfoDoesNotContain")
.detail("RecoveryCount", recoveryCount)
.detail("InfRecoveryCount", inf.recoveryCount)
.detail("RecoveryState", (int)inf.recoveryState)
.detail("LogSysConf", describe(inf.logSystemConfig.tLogs))
.detail("PriorLogs", describe(inf.priorCommittedLogServers))
.detail("OldLogGens", inf.logSystemConfig.oldTLogs.size());
// Read and cache cluster ID before displacing this tlog. We want
// to avoid removing the tlogs data if it has joined a new cluster
// with a different cluster ID.
// TODO: #5375
/*
state UID clusterId = wait(getClusterId(self));
ASSERT(clusterId.isValid());
self->ccClusterId = clusterId;
ev.detail("ClusterId", clusterId).detail("SelfClusterId", self->durableClusterId);
*/
if (BUGGIFY)
wait(delay(SERVER_KNOBS->BUGGIFY_WORKER_REMOVED_MAX_LAG * deterministicRandom()->random01()));
throw worker_removed();
@ -2619,27 +2579,6 @@ ACTOR Future<Void> tLogEnablePopReq(TLogEnablePopRequest enablePopReq, TLogData*
return Void();
}
ACTOR Future<Void> updateDurableClusterID(TLogData* self) {
loop {
// Persist cluster ID once cluster has recovered.
if (self->dbInfo->get().recoveryState == RecoveryState::FULLY_RECOVERED) {
ASSERT(!self->durableClusterId.isValid());
state UID ccClusterId = self->dbInfo->get().client.clusterId;
self->durableClusterId = ccClusterId;
ASSERT(ccClusterId.isValid());
wait(self->persistentDataCommitLock.take());
state FlowLock::Releaser commitLockReleaser(self->persistentDataCommitLock);
self->persistentData->set(
KeyValueRef(persistClusterIdKey, BinaryWriter::toValue(ccClusterId, Unversioned())));
wait(self->persistentData->commit());
return Void();
}
wait(self->dbInfo->onChange());
}
}
ACTOR Future<Void> serveTLogInterface(TLogData* self,
TLogInterface tli,
Reference<LogData> logData,
@ -2930,26 +2869,26 @@ ACTOR Future<Void> tLogCore(TLogData* self,
logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture()));
logData->addActor.send(logData->removed);
// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
logData->addActor.send(traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
StorageBytes sbQueue =
self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));
@ -3027,7 +2966,6 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
state IKeyValueStore* storage = self->persistentData;
state Future<Optional<Value>> fFormat = storage->readValue(persistFormat.key);
state Future<Optional<Value>> fRecoveryLocation = storage->readValue(persistRecoveryLocationKey);
state Future<Optional<Value>> fClusterId = storage->readValue(persistClusterIdKey);
state Future<RangeResult> fVers = storage->readRange(persistCurrentVersionKeys);
state Future<RangeResult> fKnownCommitted = storage->readRange(persistKnownCommittedVersionKeys);
state Future<RangeResult> fLocality = storage->readRange(persistLocalityKeys);
@ -3039,7 +2977,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
// FIXME: metadata in queue?
wait(waitForAll(std::vector{ fFormat, fRecoveryLocation, fClusterId }));
wait(waitForAll(std::vector{ fFormat, fRecoveryLocation }));
wait(waitForAll(std::vector{ fVers,
fKnownCommitted,
fLocality,
@ -3049,10 +2987,6 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
fProtocolVersions,
fTLogSpillTypes }));
if (fClusterId.get().present()) {
self->durableClusterId = BinaryReader::fromStringRef<UID>(fClusterId.get().get(), Unversioned());
}
if (fFormat.get().present() && !persistFormatReadableRange.contains(fFormat.get().get())) {
// FIXME: remove when we no longer need to test upgrades from 4.X releases
if (g_network->isSimulated()) {
@ -3315,7 +3249,7 @@ bool tlogTerminated(TLogData* self, IKeyValueStore* persistentData, TLogQueue* p
}
if (e.code() == error_code_worker_removed || e.code() == error_code_recruitment_failed ||
e.code() == error_code_file_not_found || e.code() == error_code_invalid_cluster_id) {
e.code() == error_code_file_not_found) {
TraceEvent("TLogTerminated", self->dbgid).errorUnsuppressed(e);
return true;
} else
@ -3591,86 +3525,50 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
TraceEvent("SharedTlog", tlogId);
try {
try {
wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
wait(ioTimeoutError(persistentData->init(), SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
if (restoreFromDisk) {
wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests));
} else {
wait(ioTimeoutError(checkEmptyQueue(&self) && initPersistentStorage(&self),
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
}
if (restoreFromDisk) {
wait(restorePersistentState(&self, locality, oldLog, recovered, tlogRequests));
} else {
wait(ioTimeoutError(checkEmptyQueue(&self) && initPersistentStorage(&self),
SERVER_KNOBS->TLOG_MAX_CREATE_DURATION));
}
// Disk errors need a chance to kill this actor.
wait(delay(0.000001));
// Disk errors need a chance to kill this actor.
wait(delay(0.000001));
if (recovered.canBeSet())
recovered.send(Void());
if (recovered.canBeSet())
recovered.send(Void());
if (!self.durableClusterId.isValid()) {
self.sharedActors.send(updateDurableClusterID(&self));
}
self.sharedActors.send(commitQueue(&self));
self.sharedActors.send(updateStorageLoop(&self));
self.sharedActors.send(traceRole(Role::SHARED_TRANSACTION_LOG, tlogId));
state Future<Void> activeSharedChange = Void();
self.sharedActors.send(commitQueue(&self));
self.sharedActors.send(updateStorageLoop(&self));
self.sharedActors.send(traceRole(Role::SHARED_TRANSACTION_LOG, tlogId));
state Future<Void> activeSharedChange = Void();
loop {
choose {
when(state InitializeTLogRequest req = waitNext(tlogRequests.getFuture())) {
if (!self.tlogCache.exists(req.recruitmentID)) {
self.tlogCache.set(req.recruitmentID, req.reply.getFuture());
self.sharedActors.send(
self.tlogCache.removeOnReady(req.recruitmentID, tLogStart(&self, req, locality)));
} else {
forwardPromise(req.reply, self.tlogCache.get(req.recruitmentID));
}
}
when(wait(error)) { throw internal_error(); }
when(wait(activeSharedChange)) {
if (activeSharedTLog->get() == tlogId) {
TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get());
self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD;
} else {
stopAllTLogs(&self, tlogId);
TraceEvent("SharedTLogQueueSpilling", self.dbgid)
.detail("NowActive", activeSharedTLog->get());
self.sharedActors.send(startSpillingInTenSeconds(&self, tlogId, activeSharedTLog));
}
activeSharedChange = activeSharedTLog->onChange();
loop {
choose {
when(state InitializeTLogRequest req = waitNext(tlogRequests.getFuture())) {
if (!self.tlogCache.exists(req.recruitmentID)) {
self.tlogCache.set(req.recruitmentID, req.reply.getFuture());
self.sharedActors.send(
self.tlogCache.removeOnReady(req.recruitmentID, tLogStart(&self, req, locality)));
} else {
forwardPromise(req.reply, self.tlogCache.get(req.recruitmentID));
}
}
when(wait(error)) { throw internal_error(); }
when(wait(activeSharedChange)) {
if (activeSharedTLog->get() == tlogId) {
TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get());
self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD;
} else {
stopAllTLogs(&self, tlogId);
TraceEvent("SharedTLogQueueSpilling", self.dbgid).detail("NowActive", activeSharedTLog->get());
self.sharedActors.send(startSpillingInTenSeconds(&self, tlogId, activeSharedTLog));
}
activeSharedChange = activeSharedTLog->onChange();
}
}
} catch (Error& e) {
throw;
// TODO: #5375
/*
if (e.code() != error_code_worker_removed) {
throw;
}
// Don't need to worry about deleting data if there is no durable
// cluster ID.
if (!self.durableClusterId.isValid()) {
throw;
}
// When a tlog joins a new cluster and has data for an old cluster,
// it should automatically exclude itself to avoid being used in
// the new cluster.
auto recoveryState = self.dbInfo->get().recoveryState;
if (recoveryState == RecoveryState::FULLY_RECOVERED && self.ccClusterId.isValid() &&
self.durableClusterId.isValid() && self.ccClusterId != self.durableClusterId) {
state NetworkAddress address = g_network->getLocalAddress();
wait(excludeServers(self.cx, { AddressExclusion{ address.ip, address.port } }));
TraceEvent(SevWarnAlways, "TLogBelongsToExistingCluster")
.detail("ClusterId", self.durableClusterId)
.detail("NewClusterId", self.ccClusterId);
}
// If the tlog has a valid durable cluster ID, we don't want it to
// wipe its data! Throw this error to signal to `tlogTerminated` to
// close the persistent data store instead of deleting it.
throw invalid_cluster_id();
*/
}
} catch (Error& e) {
self.terminated.send(Void());

View File

@ -1635,7 +1635,6 @@ Future<Void> TagPartitionedLogSystem::endEpoch() {
Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
RecruitFromConfigurationReply const& recr,
Future<RecruitRemoteFromConfigurationReply> const& fRemoteWorkers,
UID clusterId,
DatabaseConfiguration const& config,
LogEpoch recoveryCount,
Version recoveryTransactionVersion,
@ -1646,7 +1645,6 @@ Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
return newEpoch(Reference<TagPartitionedLogSystem>::addRef(this),
recr,
fRemoteWorkers,
clusterId,
config,
recoveryCount,
recoveryTransactionVersion,
@ -2546,7 +2544,6 @@ std::vector<Tag> TagPartitionedLogSystem::getLocalTags(int8_t locality, const st
ACTOR Future<Void> TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSystem* self,
Reference<TagPartitionedLogSystem> oldLogSystem,
Future<RecruitRemoteFromConfigurationReply> fRemoteWorkers,
UID clusterId,
DatabaseConfiguration configuration,
LogEpoch recoveryCount,
Version recoveryTransactionVersion,
@ -2690,7 +2687,6 @@ ACTOR Future<Void> TagPartitionedLogSystem::newRemoteEpoch(TagPartitionedLogSyst
req.startVersion = logSet->startVersion;
req.logRouterTags = 0;
req.txsTags = self->txsTags;
req.clusterId = clusterId;
req.recoveryTransactionVersion = recoveryTransactionVersion;
}
@ -2742,7 +2738,6 @@ ACTOR Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
Reference<TagPartitionedLogSystem> oldLogSystem,
RecruitFromConfigurationReply recr,
Future<RecruitRemoteFromConfigurationReply> fRemoteWorkers,
UID clusterId,
DatabaseConfiguration configuration,
LogEpoch recoveryCount,
Version recoveryTransactionVersion,
@ -2965,7 +2960,6 @@ ACTOR Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
req.startVersion = logSystem->tLogs[0]->startVersion;
req.logRouterTags = logSystem->logRouterTags;
req.txsTags = logSystem->txsTags;
req.clusterId = clusterId;
req.recoveryTransactionVersion = recoveryTransactionVersion;
}
@ -3035,7 +3029,6 @@ ACTOR Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
req.startVersion = oldLogSystem->knownCommittedVersion + 1;
req.logRouterTags = logSystem->logRouterTags;
req.txsTags = logSystem->txsTags;
req.clusterId = clusterId;
req.recoveryTransactionVersion = recoveryTransactionVersion;
}
@ -3094,7 +3087,6 @@ ACTOR Future<Reference<ILogSystem>> TagPartitionedLogSystem::newEpoch(
logSystem->remoteRecovery = TagPartitionedLogSystem::newRemoteEpoch(logSystem.getPtr(),
oldLogSystem,
fRemoteWorkers,
clusterId,
configuration,
recoveryCount,
recoveryTransactionVersion,

View File

@ -38,6 +38,7 @@
#include "flow/IRandom.h"
#include "flow/Knobs.h"
#include "flow/ObjectSerializer.h"
#include "flow/PriorityMultiLock.actor.h"
#include "flow/serialize.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
@ -105,210 +106,6 @@ std::string addPrefix(std::string prefix, std::string lines) {
return s;
}
#define PRIORITYMULTILOCK_DEBUG 0
// A multi user lock with a concurrent holder limit where waiters are granted the lock according to
// an integer priority from 0 to maxPriority, inclusive, where higher integers are given priority.
//
// The interface is similar to FlowMutex except that lock holders can drop the lock to release it.
//
// Usage:
// Lock lock = wait(prioritylock.lock(priorityLevel));
// lock.release(); // Explicit release, or
// // let lock and all copies of lock go out of scope to release
class PriorityMultiLock {
public:
	// Waiting on the lock returns a Lock, which is really just a Promise<Void>
	// Calling release() is not necessary, it exists in case the Lock holder wants to explicitly release
	// the Lock before it goes out of scope.
	struct Lock {
		// Returns this holder's slot; the runner actor observes the send via the promise's future.
		void release() { promise.send(Void()); }

		// This is exposed in case the caller wants to use/copy it directly
		Promise<Void> promise;
	};

private:
	// A queued lock request: the promise to fulfill with a Lock, plus the enqueue time
	// (queuedTime is only used for debug output).
	struct Waiter {
		Waiter() : queuedTime(now()) {}
		Promise<Lock> lockPromise;
		double queuedTime;
	};

	typedef Deque<Waiter> Queue;

#if PRIORITYMULTILOCK_DEBUG
#define prioritylock_printf(...) printf(__VA_ARGS__)
#else
#define prioritylock_printf(...)
#endif

public:
	// concurrency: maximum number of Locks that may be outstanding at once.
	// maxPriority: highest valid priority for lock(); one FIFO waiter queue is kept per priority 0..maxPriority.
	// launchLimit: bound on consecutive grants from a single priority queue before the runner rotates to the
	//              next priority, so that one busy priority cannot starve the others indefinitely.
	PriorityMultiLock(int concurrency, int maxPriority, int launchLimit = std::numeric_limits<int>::max())
	  : concurrency(concurrency), available(concurrency), waiting(0), launchLimit(launchLimit) {
		waiters.resize(maxPriority + 1);
		fRunner = runner(this);
	}

	~PriorityMultiLock() { prioritylock_printf("destruct"); }

	// Permanently shut down the lock: break the runner's error future, cancel the runner actor,
	// and drop all tracked runner futures and queued waiters. Outstanding lock() futures will
	// see broken_promise when their Waiter's promise is destroyed.
	void kill() {
		brokenOnDestruct.sendError(broken_promise());
		fRunner.cancel();
		runners.clear();
		for (auto& w : waiters) {
			w.clear();
		}
	}

	// Request the lock at the given priority. The returned future is fulfilled with a Lock once a
	// concurrency slot has been granted to this caller.
	// NOTE(review): priority is used to index waiters[] unchecked — callers must pass
	// 0 <= priority <= maxPriority; confirm all call sites respect this.
	Future<Lock> lock(int priority = 0) {
		prioritylock_printf("lock begin %s\n", toString().c_str());

		// This shortcut may enable a waiter to jump the line when the releaser loop yields
		if (available > 0) {
			--available;
			Lock p;
			addRunner(p);
			prioritylock_printf("lock exit immediate %s\n", toString().c_str());
			return p;
		}

		Waiter w;
		waiters[priority].push_back(w);
		++waiting;
		prioritylock_printf("lock exit queued %s\n", toString().c_str());
		return w.lockPromise.getFuture();
	}

	// Debug/status string: slot accounting, runner-queue state, and per-priority waiter counts.
	std::string toString() const {
		// Count runner futures that have completed but not yet been popped by the runner actor.
		int runnersDone = 0;
		for (int i = 0; i < runners.size(); ++i) {
			if (runners[i].isReady()) {
				++runnersDone;
			}
		}

		std::string s =
		    format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d runnersQueue=%d runnersDone=%d ",
		           this,
		           concurrency,
		           available,
		           concurrency - available,
		           waiting,
		           runners.size(),
		           runnersDone);

		for (int i = 0; i < waiters.size(); ++i) {
			s += format("p%d_waiters=%u ", i, waiters[i].size());
		}

		s += "}";
		return s;
	}

private:
	// Track the given Lock: when its promise is fulfilled (release()) or dropped (broken), the
	// mapped future returns the slot (++available) and wakes the runner if waiters are queued or
	// the runner-future queue has grown large enough to be worth draining.
	void addRunner(Lock& lock) {
		runners.push_back(map(ready(lock.promise.getFuture()), [=](Void) {
			prioritylock_printf("Lock released\n");
			++available;
			if (waiting > 0 || runners.size() > 100) {
				release.trigger();
			}
			return Void();
		}));
	}

	// Background actor that hands slots to waiters. It wakes when a holder releases a slot, then
	// scans the priority queues starting from the current priority and moving downward (wrapping
	// from 0 back to maxPriority), granting a bounded number of locks per priority per visit.
	ACTOR static Future<Void> runner(PriorityMultiLock* self) {
		state int sinceYield = 0;
		state Future<Void> error = self->brokenOnDestruct.getFuture();
		state int maxPriority = self->waiters.size() - 1;

		// Priority to try to run tasks from next
		state int priority = maxPriority;
		state Queue* pQueue = &self->waiters[maxPriority];

		// Track the number of waiters unlocked at the same priority in a row
		state int lastPriorityCount = 0;

		loop {
			// Cleanup finished runner futures at the front of the runner queue.
			while (!self->runners.empty() && self->runners.front().isReady()) {
				self->runners.pop_front();
			}

			// Wait for a runner to release its lock
			wait(self->release.onTrigger());
			prioritylock_printf("runner wakeup %s\n", self->toString().c_str());

			// Yield to the run loop periodically so this actor cannot monopolize it.
			if (++sinceYield == 1000) {
				sinceYield = 0;
				wait(delay(0));
			}

			// While there are available slots and there are waiters, launch tasks
			while (self->available > 0 && self->waiting > 0) {
				prioritylock_printf("Checking priority=%d lastPriorityCount=%d %s\n",
				                    priority,
				                    lastPriorityCount,
				                    self->toString().c_str());

				// Grant locks from the current queue until it is empty or the per-visit limit is hit.
				// NOTE(review): the pre-increment means at most launchLimit-1 consecutive grants per
				// visit, not launchLimit — confirm whether this off-by-one is intended.
				while (!pQueue->empty() && ++lastPriorityCount < self->launchLimit) {
					Waiter w = pQueue->front();
					pQueue->pop_front();
					--self->waiting;
					Lock lock;
					prioritylock_printf("  Running waiter priority=%d wait=%f %s\n",
					                    priority,
					                    now() - w.queuedTime,
					                    self->toString().c_str());
					w.lockPromise.send(lock);

					// Self may have been destructed during the lock callback
					if (error.isReady()) {
						throw error.getError();
					}

					// If the lock was not already released, add it to the runners future queue
					// (if it was released synchronously inside send(), the slot was never consumed,
					// so available is intentionally not decremented).
					if (lock.promise.canBeSet()) {
						self->addRunner(lock);

						// A slot has been consumed, so stop reading from this queue if there aren't any more
						if (--self->available == 0) {
							break;
						}
					}
				}

				// If there are no more slots available, then don't move to the next priority
				if (self->available == 0) {
					break;
				}

				// Decrease priority, wrapping around to max from 0
				if (priority == 0) {
					priority = maxPriority;
				} else {
					--priority;
				}

				pQueue = &self->waiters[priority];
				lastPriorityCount = 0;
			}
		}
	}

	int concurrency; // Total number of concurrency slots
	int available; // Slots not currently held by a Lock
	int waiting; // Total queued waiters across all priorities
	int launchLimit; // Bound on consecutive grants from one priority per runner visit
	std::vector<Queue> waiters; // One FIFO waiter queue per priority level
	Deque<Future<Void>> runners; // Futures that return slots when Locks are released
	Future<Void> fRunner; // The runner() actor
	AsyncTrigger release; // Pulsed by addRunner callbacks to wake runner()
	Promise<Void> brokenOnDestruct; // Broken by kill() so runner() can detect shutdown mid-grant
};
// Some convenience functions for debugging to stringify various structures
// Classes can add compatibility by either specializing toString<T> or implementing
// std::string toString() const;
@ -1677,6 +1474,8 @@ struct RedwoodMetrics {
kvSizeReadByGetRange = Reference<Histogram>(
new Histogram(Reference<HistogramRegistry>(), "kvSize", "ReadByGetRange", Histogram::Unit::bytes));
ioLock = nullptr;
// These histograms are used for Btree events, hence level > 0
unsigned int levelCounter = 0;
for (RedwoodMetrics::Level& level : levels) {
@ -1719,6 +1518,8 @@ struct RedwoodMetrics {
// btree levels and one extra level for non btree level.
Level levels[btreeLevels + 1];
metrics metric;
// pointer to the priority multi lock used in pager
PriorityMultiLock* ioLock;
Reference<Histogram> kvSizeWritten;
Reference<Histogram> kvSizeReadByGet;
@ -1773,9 +1574,12 @@ struct RedwoodMetrics {
// The string is a reasonably well formatted page of information
void getFields(TraceEvent* e, std::string* s = nullptr, bool skipZeroes = false);
void getIOLockFields(TraceEvent* e, std::string* s = nullptr);
std::string toString(bool clearAfter) {
std::string s;
getFields(nullptr, &s);
getIOLockFields(nullptr, &s);
if (clearAfter) {
clear();
@ -1810,6 +1614,7 @@ ACTOR Future<Void> redwoodMetricsLogger() {
double elapsed = now() - g_redwoodMetrics.startTime;
e.detail("Elapsed", elapsed);
g_redwoodMetrics.getFields(&e);
g_redwoodMetrics.getIOLockFields(&e);
g_redwoodMetrics.clear();
}
}
@ -2220,7 +2025,7 @@ public:
bool memoryOnly,
Reference<IPageEncryptionKeyProvider> keyProvider,
Promise<Void> errorPromise = {})
: keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, ioMaxPriority, FLOW_KNOBS->MAX_OUTSTANDING / 2),
: keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, SERVER_KNOBS->REDWOOD_PRIORITY_LAUNCHS),
pageCacheBytes(pageCacheSizeBytes), desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize),
filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise),
remapCleanupWindowBytes(remapCleanupWindowBytes), concurrentExtentReads(new FlowLock(concurrentExtentReads)) {
@ -2232,6 +2037,7 @@ public:
// This sets the page cache size for all PageCacheT instances using the same evictor
pageCache.evictor().sizeLimit = pageCacheBytes;
g_redwoodMetrics.ioLock = &ioLock;
if (!g_redwoodMetricsActor.isValid()) {
g_redwoodMetricsActor = redwoodMetricsLogger();
}
@ -8121,8 +7927,7 @@ RedwoodRecordRef VersionedBTree::dbEnd("\xff\xff\xff\xff\xff"_sr);
class KeyValueStoreRedwood : public IKeyValueStore {
public:
KeyValueStoreRedwood(std::string filename, UID logID, Reference<IPageEncryptionKeyProvider> encryptionKeyProvider)
: m_filename(filename), m_concurrentReads(SERVER_KNOBS->REDWOOD_KVSTORE_CONCURRENT_READS, 0),
prefetch(SERVER_KNOBS->REDWOOD_KVSTORE_RANGE_PREFETCH) {
: m_filename(filename), prefetch(SERVER_KNOBS->REDWOOD_KVSTORE_RANGE_PREFETCH) {
int pageSize =
BUGGIFY ? deterministicRandom()->randomInt(1000, 4096 * 4) : SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE;
@ -8187,6 +7992,8 @@ public:
ACTOR void shutdown(KeyValueStoreRedwood* self, bool dispose) {
TraceEvent(SevInfo, "RedwoodShutdown").detail("Filename", self->m_filename).detail("Dispose", dispose);
g_redwoodMetrics.ioLock = nullptr;
// In simulation, if the instance is being disposed of then sometimes run destructive sanity check.
if (g_network->isSimulated() && dispose && BUGGIFY) {
// Only proceed if the last commit is a success, but don't throw if it's not because shutdown
@ -8289,7 +8096,6 @@ public:
f.get();
} else {
CODE_PROBE(true, "Uncached forward range read seek");
wait(store(lock, self->m_concurrentReads.lock()));
wait(f);
}
@ -8345,7 +8151,6 @@ public:
f.get();
} else {
CODE_PROBE(true, "Uncached reverse range read seek");
wait(store(lock, self->m_concurrentReads.lock()));
wait(f);
}
@ -8412,9 +8217,6 @@ public:
wait(self->m_tree->initBTreeCursor(
&cur, self->m_tree->getLastCommittedVersion(), PagerEventReasons::PointRead, options));
// Not locking for point reads, instead relying on IO priority lock
// state PriorityMultiLock::Lock lock = wait(self->m_concurrentReads.lock());
++g_redwoodMetrics.metric.opGet;
wait(cur.seekGTE(key));
if (cur.isValid() && cur.get().key == key) {
@ -8450,7 +8252,6 @@ private:
Future<Void> m_init;
Promise<Void> m_closed;
Promise<Void> m_error;
PriorityMultiLock m_concurrentReads;
bool prefetch;
Version m_nextCommitVersion;
Reference<IPageEncryptionKeyProvider> m_keyProvider;
@ -9086,6 +8887,43 @@ void RedwoodMetrics::getFields(TraceEvent* e, std::string* s, bool skipZeroes) {
}
}
// Report the pager IO lock's per-priority activity, either as TraceEvent details (e) and/or
// appended to a human-readable status string (s). No-op when no pager has registered its lock.
void RedwoodMetrics::getIOLockFields(TraceEvent* e, std::string* s) {
	if (!ioLock) {
		return;
	}

	const int highestPriority = ioLock->maxPriority();

	// Structured output: overall totals plus one Active/Await pair per priority level.
	if (e) {
		e->detail("ActiveReads", ioLock->totalRunners());
		e->detail("AwaitReads", ioLock->totalWaiters());
		for (int p = 0; p <= highestPriority; ++p) {
			e->detail(format("ActiveP%d", p), ioLock->numRunners(p));
			e->detail(format("AwaitP%d", p), ioLock->numWaiters(p));
		}
	}

	// Text output: a totals line, then a line of Active counts and a line of Await counts.
	if (s) {
		*s += "\n";
		*s += format("%-15s %-8u ", "ActiveReads", ioLock->totalRunners());
		*s += format("%-15s %-8u ", "AwaitReads", ioLock->totalWaiters());
		*s += "\n";
		for (int p = 0; p <= highestPriority; ++p) {
			const std::string label = "ActiveP" + std::to_string(p);
			*s += format("%-15s %-8u ", label.c_str(), ioLock->numRunners(p));
		}
		*s += "\n";
		for (int p = 0; p <= highestPriority; ++p) {
			const std::string label = "AwaitP" + std::to_string(p);
			*s += format("%-15s %-8u ", label.c_str(), ioLock->numWaiters(p));
		}
	}
}
TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") {
ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[0] == 3);
ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[1] == 4);
@ -11569,3 +11407,57 @@ TEST_CASE(":/redwood/performance/histograms") {
return Void();
}
// Test helper: acquires pml at the given priority, holds the lock across a short random delay,
// then increments *pout so the caller can observe how much progress each priority made.
ACTOR Future<Void> waitLockIncrement(PriorityMultiLock* pml, int priority, int* pout) {
	// Held for the remainder of the actor; the slot is returned when `lock` goes out of scope.
	state PriorityMultiLock::Lock lock = wait(pml->lock(priority));
	wait(delay(deterministicRandom()->random01() * .1));
	++*pout;
	return Void();
}
// Stress test: saturate a PriorityMultiLock with waiters at several priorities, then release the
// initial holders and verify (by observing printed counts) that all queued tasks eventually run.
TEST_CASE("/redwood/PriorityMultiLock") {
	state std::vector<int> priorities = { 10, 20, 40 };
	state int concurrency = 25;
	// NOTE(review): the PriorityMultiLock definition visible in this file takes (int concurrency,
	// int maxPriority, int launchLimit); passing the priorities vector here assumes a different
	// constructor overload — confirm the intended signature.
	state PriorityMultiLock* pml = new PriorityMultiLock(concurrency, priorities);

	// One progress counter per priority level.
	state std::vector<int> counts;
	counts.resize(priorities.size(), 0);

	// Clog the lock by taking concurrency locks at each level
	state std::vector<Future<PriorityMultiLock::Lock>> lockFutures;
	for (int i = 0; i < priorities.size(); ++i) {
		for (int j = 0; j < concurrency; ++j) {
			lockFutures.push_back(pml->lock(i));
		}
	}

	// Wait for n = concurrency locks to be acquired
	wait(quorum(lockFutures, concurrency));

	// Queue 10,000 lock/increment tasks spread evenly across the priority levels.
	state std::vector<Future<Void>> futures;
	for (int i = 0; i < 10e3; ++i) {
		int p = i % priorities.size();
		futures.push_back(waitLockIncrement(pml, p, &counts[p]));
	}

	state Future<Void> f = waitForAll(futures);

	// Release the locks
	lockFutures.clear();

	// Print stats and wait for all futures to be ready
	loop {
		choose {
			when(wait(delay(1))) {
				printf("counts: ");
				for (auto c : counts) {
					printf("%d ", c);
				}
				printf("  pml: %s\n", pml->toString().c_str());
			}
			when(wait(f)) { break; }
		}
	}

	// NOTE(review): raw new/delete — f throwing would leak pml; acceptable in a test, but a
	// std::unique_ptr would be safer if the surrounding framework permits it.
	delete pml;
	return Void();
}

View File

@ -3341,6 +3341,7 @@ public:
AsyncVar<std::pair<bool, Optional<std::vector<Optional<Key>>>>>
changedDcIds; // current DC priorities to change second, and whether the cluster controller has been changed
UID id;
Reference<AsyncVar<Optional<UID>>> clusterId;
std::vector<Reference<RecruitWorkersInfo>> outstandingRecruitmentRequests;
std::vector<Reference<RecruitRemoteWorkersInfo>> outstandingRemoteRecruitmentRequests;
std::vector<std::pair<RecruitStorageRequest, double>> outstandingStorageRequests;
@ -3412,15 +3413,16 @@ public:
ClusterControllerData(ClusterControllerFullInterface const& ccInterface,
LocalityData const& locality,
ServerCoordinators const& coordinators)
ServerCoordinators const& coordinators,
Reference<AsyncVar<Optional<UID>>> clusterId)
: gotProcessClasses(false), gotFullyRecoveredConfig(false), shouldCommitSuicide(false),
clusterControllerProcessId(locality.processId()), clusterControllerDcId(locality.dcId()), id(ccInterface.id()),
ac(false), outstandingRequestChecker(Void()), outstandingRemoteRequestChecker(Void()), startTime(now()),
goodRecruitmentTime(Never()), goodRemoteRecruitmentTime(Never()), datacenterVersionDifference(0),
versionDifferenceUpdated(false), remoteDCMonitorStarted(false), remoteTransactionSystemDegraded(false),
recruitDistributor(false), recruitRatekeeper(false), recruitBlobManager(false), recruitBlobMigrator(false),
recruitEncryptKeyProxy(false), recruitConsistencyScan(false),
clusterControllerMetrics("ClusterController", id.toString()),
clusterId(clusterId), ac(false), outstandingRequestChecker(Void()), outstandingRemoteRequestChecker(Void()),
startTime(now()), goodRecruitmentTime(Never()), goodRemoteRecruitmentTime(Never()),
datacenterVersionDifference(0), versionDifferenceUpdated(false), remoteDCMonitorStarted(false),
remoteTransactionSystemDegraded(false), recruitDistributor(false), recruitRatekeeper(false),
recruitBlobManager(false), recruitBlobMigrator(false), recruitEncryptKeyProxy(false),
recruitConsistencyScan(false), clusterControllerMetrics("ClusterController", id.toString()),
openDatabaseRequests("OpenDatabaseRequests", clusterControllerMetrics),
registerWorkerRequests("RegisterWorkerRequests", clusterControllerMetrics),
getWorkersRequests("GetWorkersRequests", clusterControllerMetrics),

View File

@ -289,11 +289,10 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted<ClusterRecoveryData>
getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_DURATION_EVENT_NAME));
clusterRecoveryAvailableEventHolder = makeReference<EventCacheHolder>(
getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_AVAILABLE_EVENT_NAME));
logger = traceCounters(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME),
dbgid,
SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
&cc,
getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME));
logger = cc.traceCounters(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME),
dbgid,
SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME));
if (forceRecovery && !controllerData->clusterControllerDcId.present()) {
TraceEvent(SevError, "ForcedRecoveryRequiresDcID").log();
forceRecovery = false;

View File

@ -468,8 +468,6 @@ class DDTeamCollection : public ReferenceCounted<DDTeamCollection> {
bool recruitTss,
Reference<TSSPairState> tssState);
Future<UID> getClusterId();
// return the next ServerID in storageWiggler
Future<UID> getNextWigglingServerID();

View File

@ -117,6 +117,7 @@ public:
virtual Future<Void> moveKeys(const MoveKeysParams& params) = 0;
// metrics.second is the number of key-ranges (i.e., shards) in the 'keys' key-range
virtual Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(KeyRange const& keys,
StorageMetrics const& min,
StorageMetrics const& max,
@ -136,8 +137,6 @@ public:
virtual Future<Optional<Value>> readRebalanceDDIgnoreKey() const { return {}; }
virtual Future<UID> getClusterId() const { return {}; }
virtual Future<Void> waitDDTeamInfoPrintSignal() const { return Never(); }
virtual Future<std::vector<ProcessData>> getWorkers() const = 0;
@ -221,8 +220,6 @@ public:
Future<Optional<Value>> readRebalanceDDIgnoreKey() const override;
Future<UID> getClusterId() const override;
Future<Void> waitDDTeamInfoPrintSignal() const override;
Future<std::vector<ProcessData>> getWorkers() const override;

View File

@ -476,6 +476,8 @@ struct ShardSizeBounds {
bool operator==(ShardSizeBounds const& rhs) const {
return max == rhs.max && min == rhs.min && permittedError == rhs.permittedError;
}
static ShardSizeBounds shardSizeBoundsBeforeTrack();
};
// Gets the permitted size and IO bounds for a shard

View File

@ -46,6 +46,7 @@ class GrvProxyTransactionTagThrottler {
: req(req), startTime(now()), sequenceNumber(++lastSequenceNumber) {}
void updateProxyTagThrottledDuration();
bool isMaxThrottled() const;
};
struct TagQueue {
@ -56,6 +57,8 @@ class GrvProxyTransactionTagThrottler {
explicit TagQueue(double rate) : rateInfo(rate) {}
void setRate(double rate);
bool isMaxThrottled() const;
void rejectRequests();
};
// Track the budgets for each tag
@ -69,8 +72,8 @@ public:
// If a request is ready to be executed, it is sent to the deque
// corresponding to its priority. If not, the request remains queued.
void releaseTransactions(double elapsed,
SpannedDeque<GetReadVersionRequest>& outBatchPriority,
SpannedDeque<GetReadVersionRequest>& outDefaultPriority);
Deque<GetReadVersionRequest>& outBatchPriority,
Deque<GetReadVersionRequest>& outDefaultPriority);
void addRequest(GetReadVersionRequest const&);

View File

@ -29,7 +29,7 @@
#include "fdbserver/IClosable.h"
#include "fdbserver/IPageEncryptionKeyProvider.actor.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
struct CheckpointRequest {
const Version version; // The FDB version at which the checkpoint is created.

View File

@ -641,7 +641,6 @@ struct ILogSystem {
virtual Future<Reference<ILogSystem>> newEpoch(
RecruitFromConfigurationReply const& recr,
Future<struct RecruitRemoteFromConfigurationReply> const& fRemoteWorkers,
UID clusterId,
DatabaseConfiguration const& config,
LogEpoch recoveryCount,
Version recoveryTransactionVersion,

View File

@ -21,10 +21,11 @@
#ifndef FOUNDATIONDB_MOCKGLOBALSTATE_H
#define FOUNDATIONDB_MOCKGLOBALSTATE_H
#include "StorageMetrics.h"
#include "StorageMetrics.actor.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbclient/KeyLocationService.h"
#include "SimulatedCluster.h"
#include "ShardsAffectedByTeamFailure.h"
@ -51,9 +52,11 @@ inline bool isStatusTransitionValid(MockShardStatus from, MockShardStatus to) {
return false;
}
class MockStorageServer {
class MockStorageServer : public IStorageMetricsService {
friend struct MockGlobalStateTester;
ActorCollection actors;
public:
struct ShardInfo {
MockShardStatus status;
@ -73,8 +76,6 @@ public:
// size() and nthRange() would use the metrics as index instead
KeyRangeMap<ShardInfo> serverKeys;
// sampled metrics
StorageServerMetrics metrics;
CoalescedKeyRangeMap<bool, int64_t, KeyBytesMetric<int64_t>> byteSampleClears;
StorageServerInterface ssi; // serve RPC requests
@ -103,6 +104,35 @@ public:
uint64_t sumRangeSize(KeyRangeRef range) const;
void addActor(Future<Void> future) override;
void getSplitPoints(SplitRangeRequest const& req) override;
Future<Void> waitMetricsTenantAware(const WaitMetricsRequest& req) override;
void getStorageMetrics(const GetStorageMetricsRequest& req) override;
template <class Reply>
static constexpr bool isLoadBalancedReply = std::is_base_of_v<LoadBalancedReply, Reply>;
template <class Reply>
typename std::enable_if_t<isLoadBalancedReply<Reply>, void> sendErrorWithPenalty(const ReplyPromise<Reply>& promise,
const Error& err,
double penalty) {
Reply reply;
reply.error = err;
reply.penalty = penalty;
promise.send(reply);
}
template <class Reply>
typename std::enable_if_t<!isLoadBalancedReply<Reply>, void>
sendErrorWithPenalty(const ReplyPromise<Reply>& promise, const Error& err, double) {
promise.sendError(err);
}
Future<Void> run();
protected:
void threeWayShardSplitting(KeyRangeRef outerRange,
KeyRangeRef innerRange,
@ -112,8 +142,13 @@ protected:
void twoWayShardSplitting(KeyRangeRef range, KeyRef splitPoint, uint64_t rangeSize, bool restrictSize);
};
class MockGlobalState {
class MockGlobalStateImpl;
class MockGlobalState : public IKeyLocationService {
friend struct MockGlobalStateTester;
friend class MockGlobalStateImpl;
std::vector<StorageServerInterface> extractStorageServerInterfaces(const std::vector<UID>& ids) const;
public:
typedef ShardsAffectedByTeamFailure::Team Team;
@ -162,7 +197,37 @@ public:
* * mgs.shardMapping doesnt have any information about X
* * mgs.allServer[X] is existed
*/
bool allShardRemovedFromServer(const UID& serverId);
bool allShardsRemovedFromServer(const UID& serverId);
// SOMEDAY: NativeAPI::waitStorageMetrics should share the code in the future, this is a simpler version of it
Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(KeyRange const& keys,
StorageMetrics const& min,
StorageMetrics const& max,
StorageMetrics const& permittedError,
int shardLimit,
int expectedShardCount);
Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(const KeyRange& keys,
const StorageMetrics& limit,
const StorageMetrics& estimated,
const Optional<int>& minSplitBytes);
Future<KeyRangeLocationInfo> getKeyLocation(TenantInfo tenant,
Key key,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Reverse isBackward,
Version version) override;
Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations(TenantInfo tenant,
KeyRange keys,
int limit,
Reverse reverse,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Version version) override;
};
#endif // FOUNDATIONDB_MOCKGLOBALSTATE_H

View File

@ -156,10 +156,20 @@ struct ProxyStats {
specialCounter(cc, "NumTenants", [pTenantMap]() { return pTenantMap ? pTenantMap->size() : 0; });
specialCounter(cc, "MaxCompute", [this]() { return this->getAndResetMaxCompute(); });
specialCounter(cc, "MinCompute", [this]() { return this->getAndResetMinCompute(); });
logger = traceCounters("ProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ProxyMetrics");
logger = cc.traceCounters("ProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ProxyMetrics");
}
};
struct ExpectedIdempotencyIdCountForKey {
Version commitVersion = invalidVersion;
int16_t idempotencyIdCount = 0;
uint8_t batchIndexHighByte = 0;
ExpectedIdempotencyIdCountForKey() {}
ExpectedIdempotencyIdCountForKey(Version commitVersion, int16_t idempotencyIdCount, uint8_t batchIndexHighByte)
: commitVersion(commitVersion), idempotencyIdCount(idempotencyIdCount), batchIndexHighByte(batchIndexHighByte) {}
};
struct ProxyCommitData {
UID dbgid;
int64_t commitBatchesMemBytesCount;
@ -226,6 +236,9 @@ struct ProxyCommitData {
bool isEncryptionEnabled = false;
PromiseStream<ExpectedIdempotencyIdCountForKey> expectedIdempotencyIdCountForKey;
Standalone<VectorRef<MutationRef>> idempotencyClears;
// The tag related to a storage server rarely change, so we keep a vector of tags for each key range to be slightly
// more CPU efficient. When a tag related to a storage server does change, we empty out all of these vectors to
// signify they must be repopulated. We do not repopulate them immediately to avoid a slow task.

Some files were not shown because too many files have changed in this diff Show More