Merge branch 'main' of github.com:apple/foundationdb into tenant-list-filter

This commit is contained in:
Jon Fu 2022-10-26 15:01:46 -07:00
commit 886c286297
103 changed files with 2941 additions and 1381 deletions

View File

@ -59,6 +59,8 @@
#include "shm.hpp"
#include "stats.hpp"
#include "time.hpp"
#include "rapidjson/document.h"
#include "rapidjson/error/en.h"
namespace mako {
@ -88,14 +90,29 @@ Transaction createNewTransaction(Database db, Arguments const& args, int id = -1
}
// Create Tenant Transaction
int tenant_id = (id == -1) ? urand(0, args.active_tenants - 1) : id;
Transaction tr;
std::string tenantStr;
// If provided tenants array, use it
if (tenants) {
return tenants[tenant_id].createTransaction();
tr = tenants[tenant_id].createTransaction();
} else {
tenantStr = "tenant" + std::to_string(tenant_id);
BytesRef tenant_name = toBytesRef(tenantStr);
Tenant t = db.openTenant(tenant_name);
tr = t.createTransaction();
}
std::string tenantStr = "tenant" + std::to_string(tenant_id);
BytesRef tenant_name = toBytesRef(tenantStr);
Tenant t = db.openTenant(tenant_name);
return t.createTransaction();
if (!args.authorization_tokens.empty()) {
// look up the token by tenant name and, if found, set it as the transaction's authorization token
if (tenantStr.empty())
tenantStr = "tenant" + std::to_string(tenant_id);
auto tokenMapItr = args.authorization_tokens.find(tenantStr);
if (tokenMapItr != args.authorization_tokens.end()) {
tr.setOption(FDB_TR_OPTION_AUTHORIZATION_TOKEN, tokenMapItr->second);
} else {
logr.warn("Authorization token map is not empty, but could not find token for tenant '{}'", tenantStr);
}
}
return tr;
}
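For context, a minimal sketch of how the token map feeds this path (the tenant count and token value are invented, and the trailing tenants parameter is assumed to default to null as the signature above suggests):

    Arguments args;
    args.active_tenants = 10;                        // tenant ids drawn from [0, 9]
    args.total_tenants = 10;
    args.authorization_tokens["tenant3"] = "<JWT>";  // hypothetical token entry
    Transaction tr = createNewTransaction(db, args); // db: an open mako Database
    // If tenant3 was picked, FDB_TR_OPTION_AUTHORIZATION_TOKEN is now set on tr.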
uint64_t byteswapHelper(uint64_t input) {
@ -815,6 +832,18 @@ int workerProcessMain(Arguments const& args, int worker_id, shared_memory::Acces
logr.error("network::setOption(FDB_NET_OPTION_DISTRIBUTED_CLIENT_TRACER): {}", err.what());
}
if (args.tls_certificate_file.has_value()) {
network::setOption(FDB_NET_OPTION_TLS_CERT_PATH, args.tls_certificate_file.value());
}
if (args.tls_key_file.has_value()) {
network::setOption(FDB_NET_OPTION_TLS_KEY_PATH, args.tls_key_file.value());
}
if (args.tls_ca_file.has_value()) {
network::setOption(FDB_NET_OPTION_TLS_CA_PATH, args.tls_ca_file.value());
}
/* enable flatbuffers if specified */
if (args.flatbuffers) {
#ifdef FDB_NET_OPTION_USE_FLATBUFFERS
@ -982,57 +1011,55 @@ int workerProcessMain(Arguments const& args, int worker_id, shared_memory::Acces
}
/* initialize the parameters with default values */
int initArguments(Arguments& args) {
memset(&args, 0, sizeof(Arguments)); /* zero-out everything */
args.num_fdb_clusters = 0;
args.num_databases = 1;
args.api_version = maxApiVersion();
args.json = 0;
args.num_processes = 1;
args.num_threads = 1;
args.async_xacts = 0;
args.mode = MODE_INVALID;
args.rows = 100000;
args.load_factor = 1.0;
args.row_digits = digits(args.rows);
args.seconds = 30;
args.iteration = 0;
args.tpsmax = 0;
args.tpsmin = -1;
args.tpsinterval = 10;
args.tpschange = TPS_SIN;
args.sampling = 1000;
args.key_length = 32;
args.value_length = 16;
args.active_tenants = 0;
args.total_tenants = 0;
args.tenant_batch_size = 10000;
args.zipf = 0;
args.commit_get = 0;
args.verbose = 1;
args.flatbuffers = 0; /* internal */
args.knobs[0] = '\0';
args.log_group[0] = '\0';
args.prefixpadding = 0;
args.trace = 0;
args.tracepath[0] = '\0';
args.traceformat = 0; /* default to client's default (XML) */
args.streaming_mode = FDB_STREAMING_MODE_WANT_ALL;
args.txntrace = 0;
args.txntagging = 0;
memset(args.txntagging_prefix, 0, TAGPREFIXLENGTH_MAX);
Arguments::Arguments() {
num_fdb_clusters = 0;
num_databases = 1;
api_version = maxApiVersion();
json = 0;
num_processes = 1;
num_threads = 1;
async_xacts = 0;
mode = MODE_INVALID;
rows = 100000;
load_factor = 1.0;
row_digits = digits(rows);
seconds = 30;
iteration = 0;
tpsmax = 0;
tpsmin = -1;
tpsinterval = 10;
tpschange = TPS_SIN;
sampling = 1000;
key_length = 32;
value_length = 16;
active_tenants = 0;
total_tenants = 0;
tenant_batch_size = 10000;
zipf = 0;
commit_get = 0;
verbose = 1;
flatbuffers = 0; /* internal */
knobs[0] = '\0';
log_group[0] = '\0';
prefixpadding = 0;
trace = 0;
tracepath[0] = '\0';
traceformat = 0; /* default to client's default (XML) */
streaming_mode = FDB_STREAMING_MODE_WANT_ALL;
txntrace = 0;
txntagging = 0;
memset(txntagging_prefix, 0, TAGPREFIXLENGTH_MAX);
for (auto i = 0; i < MAX_OP; i++) {
args.txnspec.ops[i][OP_COUNT] = 0;
txnspec.ops[i][OP_COUNT] = 0;
}
args.client_threads_per_version = 0;
args.disable_client_bypass = false;
args.disable_ryw = 0;
args.json_output_path[0] = '\0';
args.stats_export_path[0] = '\0';
args.bg_materialize_files = false;
args.bg_file_path[0] = '\0';
args.distributed_tracer_client = 0;
return 0;
client_threads_per_version = 0;
disable_client_bypass = false;
disable_ryw = 0;
json_output_path[0] = '\0';
stats_export_path[0] = '\0';
bg_materialize_files = false;
bg_file_path[0] = '\0';
distributed_tracer_client = 0;
}
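With initArguments() gone, default construction and validation now live on the struct itself; a minimal sketch of the new flow (option values invented, parseArguments() normally fills them in):

    Arguments args;          // constructor applies all the defaults above
    args.mode = MODE_RUN;
    args.seconds = 10;       // MODE_RUN requires either seconds or iteration
    if (args.validate() < 0) // replaces the free validateArguments(args)
        return -1;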
/* parse transaction specification */
@ -1279,6 +1306,10 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
{ "bg_file_path", required_argument, NULL, ARG_BG_FILE_PATH },
{ "stats_export_path", optional_argument, NULL, ARG_EXPORT_PATH },
{ "distributed_tracer_client", required_argument, NULL, ARG_DISTRIBUTED_TRACER_CLIENT },
{ "tls_certificate_file", required_argument, NULL, ARG_TLS_CERTIFICATE_FILE },
{ "tls_key_file", required_argument, NULL, ARG_TLS_KEY_FILE },
{ "tls_ca_file", required_argument, NULL, ARG_TLS_CA_FILE },
{ "authorization_token_file", required_argument, NULL, ARG_AUTHORIZATION_TOKEN_FILE },
{ NULL, 0, NULL, 0 }
};
idx = 0;
@ -1515,6 +1546,45 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
args.distributed_tracer_client = -1;
}
break;
case ARG_TLS_CERTIFICATE_FILE:
args.tls_certificate_file = std::string(optarg);
break;
case ARG_TLS_KEY_FILE:
args.tls_key_file = std::string(optarg);
break;
case ARG_TLS_CA_FILE:
args.tls_ca_file = std::string(optarg);
break;
case ARG_AUTHORIZATION_TOKEN_FILE: {
std::string tokenFilename(optarg);
std::ifstream ifs(tokenFilename);
std::ostringstream oss;
oss << ifs.rdbuf();
rapidjson::Document d;
d.Parse(oss.str().c_str());
if (d.HasParseError()) {
logr.error("Failed to parse authorization token JSON file '{}': {} at offset {}",
tokenFilename,
GetParseError_En(d.GetParseError()),
d.GetErrorOffset());
return -1;
} else if (!d.IsObject()) {
logr.error("Authorization token JSON file '{}' must contain a JSON object", tokenFilename);
return -1;
}
for (auto itr = d.MemberBegin(); itr != d.MemberEnd(); ++itr) {
if (!itr->value.IsString()) {
logr.error("Token '{}' is not a string", itr->name.GetString());
return -1;
}
args.authorization_tokens.insert_or_assign(
std::string(itr->name.GetString(), itr->name.GetStringLength()),
std::string(itr->value.GetString(), itr->value.GetStringLength()));
}
logr.info("Added {} tenant authorization tokens to map from file '{}'",
args.authorization_tokens.size(),
tokenFilename);
} break;
}
}
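Condensed, the ARG_AUTHORIZATION_TOKEN_FILE case boils down to the sketch below, using the same rapidjson calls; the helper name and the empty-map error handling are simplifications for illustration, not mako's API:

    #include <fstream>
    #include <map>
    #include <sstream>
    #include <string>
    #include "rapidjson/document.h"

    std::map<std::string, std::string> loadTokenMap(const std::string& path) {
        std::ifstream ifs(path);
        std::ostringstream oss;
        oss << ifs.rdbuf(); // slurp the whole file, as parseArguments() does
        rapidjson::Document d;
        d.Parse(oss.str().c_str());
        std::map<std::string, std::string> tokens;
        if (d.HasParseError() || !d.IsObject())
            return tokens; // mako logs the error and returns -1 instead
        for (auto itr = d.MemberBegin(); itr != d.MemberEnd(); ++itr)
            if (itr->value.IsString())
                tokens.emplace(itr->name.GetString(), itr->value.GetString());
        return tokens;
    }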
@ -1525,93 +1595,97 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
return 0;
}
int validateArguments(Arguments const& args) {
if (args.mode == MODE_INVALID) {
int Arguments::validate() {
if (mode == MODE_INVALID) {
logr.error("--mode has to be set");
return -1;
}
if (args.verbose < VERBOSE_NONE || args.verbose > VERBOSE_DEBUG) {
if (verbose < VERBOSE_NONE || verbose > VERBOSE_DEBUG) {
logr.error("--verbose must be between 0 and 3");
return -1;
}
if (args.rows <= 0) {
if (rows <= 0) {
logr.error("--rows must be a positive integer");
return -1;
}
if (args.load_factor <= 0 || args.load_factor > 1) {
if (load_factor <= 0 || load_factor > 1) {
logr.error("--load_factor must be in range (0, 1]");
return -1;
}
if (args.key_length < 0) {
if (key_length < 0) {
logr.error("--keylen must be a positive integer");
return -1;
}
if (args.value_length < 0) {
if (value_length < 0) {
logr.error("--vallen must be a positive integer");
return -1;
}
if (args.num_fdb_clusters > NUM_CLUSTERS_MAX) {
if (num_fdb_clusters > NUM_CLUSTERS_MAX) {
logr.error("Mako is not supported to do work to more than {} clusters", NUM_CLUSTERS_MAX);
return -1;
}
if (args.num_databases > NUM_DATABASES_MAX) {
if (num_databases > NUM_DATABASES_MAX) {
logr.error("Mako is not supported to do work to more than {} databases", NUM_DATABASES_MAX);
return -1;
}
if (args.num_databases < args.num_fdb_clusters) {
logr.error("--num_databases ({}) must be >= number of clusters({})", args.num_databases, args.num_fdb_clusters);
if (num_databases < num_fdb_clusters) {
logr.error("--num_databases ({}) must be >= number of clusters({})", num_databases, num_fdb_clusters);
return -1;
}
if (args.num_threads < args.num_databases) {
logr.error("--threads ({}) must be >= number of databases ({})", args.num_threads, args.num_databases);
if (num_threads < num_databases) {
logr.error("--threads ({}) must be >= number of databases ({})", num_threads, num_databases);
return -1;
}
if (args.key_length < 4 /* "mako" */ + args.row_digits) {
if (key_length < 4 /* "mako" */ + row_digits) {
logr.error("--keylen must be larger than {} to store \"mako\" prefix "
"and maximum row number",
4 + args.row_digits);
4 + row_digits);
return -1;
}
if (args.active_tenants > args.total_tenants) {
if (active_tenants > total_tenants) {
logr.error("--active_tenants must be less than or equal to --total_tenants");
return -1;
}
if (args.tenant_batch_size < 1) {
if (tenant_batch_size < 1) {
logr.error("--tenant_batch_size must be at least 1");
return -1;
}
if (args.mode == MODE_RUN) {
if ((args.seconds > 0) && (args.iteration > 0)) {
if (mode == MODE_RUN) {
if ((seconds > 0) && (iteration > 0)) {
logr.error("Cannot specify seconds and iteration together");
return -1;
}
if ((args.seconds == 0) && (args.iteration == 0)) {
if ((seconds == 0) && (iteration == 0)) {
logr.error("Must specify either seconds or iteration");
return -1;
}
if (args.txntagging < 0) {
if (txntagging < 0) {
logr.error("--txntagging must be a non-negative integer");
return -1;
}
}
// ensure that all of the files provided to mako are valid and exist
if (args.mode == MODE_REPORT) {
if (!args.num_report_files) {
if (mode == MODE_REPORT) {
if (!num_report_files) {
logr.error("No files to merge");
}
for (int i = 0; i < args.num_report_files; i++) {
for (int i = 0; i < num_report_files; i++) {
struct stat buffer;
if (stat(args.report_files[i], &buffer) != 0) {
logr.error("Couldn't open file {}", args.report_files[i]);
if (stat(report_files[i], &buffer) != 0) {
logr.error("Couldn't open file {}", report_files[i]);
return -1;
}
}
}
if (args.distributed_tracer_client < 0) {
logr.error("--disibuted_tracer_client must specify either (disabled, network_lossy, log_file)");
if (distributed_tracer_client < 0) {
logr.error("--distributed_tracer_client must specify either (disabled, network_lossy, log_file)");
return -1;
}
if (!authorization_tokens.empty() && !tls_ca_file.has_value()) {
logr.warn("Authorization tokens are being used without explicit TLS CA file configured");
}
return 0;
}
@ -2262,11 +2336,6 @@ int main(int argc, char* argv[]) {
auto rc = int{};
auto args = Arguments{};
rc = initArguments(args);
if (rc < 0) {
logr.error("initArguments failed");
return -1;
}
rc = parseArguments(argc, argv, args);
if (rc < 0) {
/* usage printed */
@ -2282,7 +2351,7 @@ int main(int argc, char* argv[]) {
args.total_tenants = args.active_tenants;
}
rc = validateArguments(args);
rc = args.validate();
if (rc < 0)
return -1;
logr.setVerbosity(args.verbose);

View File

@ -30,6 +30,7 @@
#include <cassert>
#include <chrono>
#include <list>
#include <map>
#include <vector>
#include <string_view>
#include <fdb_api.hpp>
@ -79,7 +80,11 @@ enum ArgKind {
ARG_JSON_REPORT,
ARG_BG_FILE_PATH, // if blob granule files are stored locally, mako will read and materialize them if this is set
ARG_EXPORT_PATH,
ARG_DISTRIBUTED_TRACER_CLIENT
ARG_DISTRIBUTED_TRACER_CLIENT,
ARG_TLS_CERTIFICATE_FILE,
ARG_TLS_KEY_FILE,
ARG_TLS_CA_FILE,
ARG_AUTHORIZATION_TOKEN_FILE,
};
constexpr const int OP_COUNT = 0;
@ -131,6 +136,9 @@ constexpr const int MAX_REPORT_FILES = 200;
/* benchmark parameters */
struct Arguments {
Arguments();
int validate();
int api_version;
int json;
int num_processes;
@ -180,6 +188,10 @@ struct Arguments {
char report_files[MAX_REPORT_FILES][PATH_MAX];
int num_report_files;
int distributed_tracer_client;
std::optional<std::string> tls_certificate_file;
std::optional<std::string> tls_key_file;
std::optional<std::string> tls_ca_file;
std::map<std::string, std::string> authorization_tokens; // maps tenant name to token string
};
} // namespace mako

View File

@ -38,7 +38,7 @@ Arguments
| - ``build``: Populate data
| - ``run``: Run the benchmark
- | ``-c | --cluster <cluster file>``
- | ``-c | --cluster <cluster_file>``
| FDB cluster files (Required, comma-separated)
- | ``-d | --num_databases <num_databases>``
@ -125,9 +125,21 @@ Arguments
| Disable snapshot read-your-writes
- | ``--json_report`` defaults to ``mako.json``
| ``--json_report=PATH``
| ``--json_report <path>``
| Output stats to the specified JSON file
- | ``--tls_certificate_file <path>``
| Use TLS certificate located in ``<path>``
- | ``--tls_key_file <path>``
| Use TLS key file located in ``<path>``
- | ``--tls_ca_file <path>``
| Use TLS CA file located in ``<path>``
- | ``--authorization_token_file <path>``
| Use authorization token JSON file located in ``<path>``
| Expected content is a JSON object where each key is a tenant name and the mapped value is a token string
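A hypothetical token file (tenant names and token strings are invented) would look like::

    {
        "tenant0": "<token string for tenant0>",
        "tenant1": "<token string for tenant1>"
    }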
Transaction Specification
=========================

View File

@ -76,38 +76,11 @@ function(generate_coverage_xml)
add_dependencies(coverage_${target_name} coveragetool)
endfunction()
# This function asserts that `versions.h` does not exist in the source
# directory. It does this in the prebuild phase of the target.
# This is an ugly hack that should make sure that cmake isn't used with
# a source directory in which FDB was previously built with `make`.
function(assert_no_version_h target)
message(STATUS "Check versions.h on ${target}")
set(target_name "${target}_versions_h_check")
if (DEFINED ENV{VERBOSE})
add_custom_target("${target_name}"
COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h"
-P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
COMMAND echo
"${CMAKE_COMMAND}" -P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
-DFILE="${CMAKE_SOURCE_DIR}/versions.h"
COMMENT "Check old build system wasn't used in source dir")
else()
add_custom_target("${target_name}"
COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h"
-P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
COMMENT "Check old build system wasn't used in source dir")
endif()
add_dependencies(${target} ${target_name})
endfunction()
add_custom_target(strip_targets)
add_dependencies(packages strip_targets)
function(strip_debug_symbols target)
if (WIN32)
if(WIN32)
return()
endif()
get_target_property(target_type ${target} TYPE)
@ -146,7 +119,7 @@ function(strip_debug_symbols target)
COMMAND objcopy --verbose --only-keep-debug $<TARGET_FILE:${target}> "${out_file}.debug"
COMMAND objcopy --verbose --add-gnu-debuglink="${out_file}.debug" "${out_file}"
COMMENT "Copy debug symbols to ${out_name}.debug")
add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
else()
add_custom_target(strip_${target})
add_dependencies(strip_${target} strip_only_${target})
@ -171,7 +144,7 @@ function(copy_headers)
foreach(f IN LISTS CP_SRCS)
is_prefix(bd "${CMAKE_CURRENT_BINARY_DIR}" "${f}")
is_prefix(sd "${CMAKE_CURRENT_SOURCE_DIR}" "${f}")
if (bd OR sd)
if(bd OR sd)
continue()
endif()
is_header(hdr "${f}")
@ -180,7 +153,7 @@ function(copy_headers)
endif()
get_filename_component(fname ${f} NAME)
get_filename_component(dname ${f} DIRECTORY)
if (dname)
if(dname)
make_directory(${incl_dir}/${dname})
endif()
set(fpath "${incl_dir}/${dname}/${fname}")
@ -309,9 +282,6 @@ function(add_flow_target)
add_custom_target(${AFT_NAME}_actors DEPENDS ${generated_files})
add_dependencies(${AFT_NAME} ${AFT_NAME}_actors)
if(NOT WIN32)
assert_no_version_h(${AFT_NAME}_actors)
endif()
generate_coverage_xml(${AFT_NAME})
if(strip_target)
strip_debug_symbols(${AFT_NAME})

View File

@ -8,40 +8,43 @@ endif()
include(ExternalProject)
ExternalProject_Add(awssdk_project
GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
GIT_CONFIG advice.detachedHead=false
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs
-DENABLE_TESTING=OFF
-DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries
-DSIMPLE_INSTALL=ON
-DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
-DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
-DBUILD_CURL=ON
-DBUILD_ZLIB=ON
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
TEST_COMMAND ""
GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
GIT_CONFIG advice.detachedHead=false
# it seems advice.detachedHead breaks something, which causes the aws sdk to always be rebuilt.
# This option forces cmake to build the aws sdk only once and never attempt to update it
UPDATE_DISCONNECTED ON
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs
-DENABLE_TESTING=OFF
-DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries
-DSIMPLE_INSTALL=ON
-DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
-DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
-DBUILD_CURL=ON
-DBUILD_ZLIB=ON
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
TEST_COMMAND ""
# the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must be linked in
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a"
)
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a"
)
add_library(awssdk_core STATIC IMPORTED)
add_dependencies(awssdk_core awssdk_project)

View File

@ -159,13 +159,20 @@ class Parser:
pass
class XmlParser(Parser, xml.sax.handler.ContentHandler):
class XmlParser(Parser, xml.sax.handler.ContentHandler, xml.sax.handler.ErrorHandler):
def __init__(self):
super().__init__()
self.handler: ParseHandler | None = None
def parse(self, file: TextIO, handler: ParseHandler) -> None:
xml.sax.parse(file, self)
self.handler = handler
xml.sax.parse(file, self, errorHandler=self)
def error(self, exception):
# tolerate recoverable XML parse errors; trace files may be truncated mid-run
pass
def fatalError(self, exception):
# likewise swallow fatal parse errors so an incomplete file doesn't abort parsing
pass
def startElement(self, name, attrs) -> None:
attributes: Dict[str, str] = {}
@ -276,6 +283,7 @@ class TraceFiles:
raise StopIteration
self.current += 1
return self.trace_files[self.current - 1]
return TraceFilesIterator(self)
@ -426,7 +434,8 @@ class Summary:
lines = self.error_out.splitlines()
stderr_bytes = 0
for line in lines:
if line.endswith("WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"):
if line.endswith(
"WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"):
# When running ASAN we expect to see this message. Boost coroutine should be using the correct ASan annotations, so it shouldn't produce any false positives.
continue
if line.endswith("Warning: unimplemented fcntl command: 1036"):
@ -560,6 +569,9 @@ class Summary:
self.handler.add_handler(('Severity', '30'), parse_warning)
def parse_error(attrs: Dict[str, str]):
if 'ErrorIsInjectedFault' in attrs and attrs['ErrorIsInjectedFault'].lower() in ['1', 'true']:
# ignore injected errors. In newer fdb versions these will have a lower severity
return
self.errors += 1
self.error = True
if self.errors > config.max_errors:
@ -606,6 +618,7 @@ class Summary:
child.attributes['File'] = attrs['File']
child.attributes['Line'] = attrs['Line']
self.out.append(child)
self.handler.add_handler(('Type', 'BuggifySection'), buggify_section)
self.handler.add_handler(('Type', 'FaultInjected'), buggify_section)
@ -614,9 +627,11 @@ class Summary:
child.attributes['Name'] = attrs['Name']
child.attributes['File'] = attrs['File']
child.attributes['Line'] = attrs['Line']
self.handler.add_handler(('Type', 'RunningUnitTest'), running_unit_test)
def stderr_severity(attrs: Dict[str, str]):
if 'NewSeverity' in attrs:
self.stderr_severity = attrs['NewSeverity']
self.handler.add_handler(('Type', 'StderrSeverity'), stderr_severity)

View File

@ -2365,6 +2365,7 @@ ACTOR Future<Void> runRestore(Database db,
KeyRef(addPrefix),
KeyRef(removePrefix),
LockDB::True,
UnlockDB::True,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,

View File

@ -83,7 +83,7 @@ BlobCipherMetrics::BlobCipherMetrics()
CounterSet(cc, "Backup"),
CounterSet(cc, "Test") }) {
specialCounter(cc, "CacheSize", []() { return BlobCipherKeyCache::getInstance()->getSize(); });
traceFuture = traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, &cc);
traceFuture = cc.traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL);
}
std::string toString(BlobCipherMetrics::UsageType type) {

View File

@ -142,7 +142,6 @@ bool isRangeFullyCovered(KeyRange range, Standalone<VectorRef<BlobGranuleChunkRe
for (const BlobGranuleChunkRef& chunk : blobChunks) {
blobRanges.push_back(chunk.keyRange);
}
return range.isCovered(blobRanges);
}
@ -194,7 +193,7 @@ TEST_CASE("/fdbserver/blobgranule/isRangeCoveredByBlob") {
testAddChunkRange("key_a1"_sr, "key_a9"_sr, continuedChunks);
testAddChunkRange("key_a9"_sr, "key_b1"_sr, continuedChunks);
testAddChunkRange("key_b1"_sr, "key_b9"_sr, continuedChunks);
ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks) == false);
ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks));
}
return Void();
}

View File

@ -272,6 +272,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( TAG_THROTTLE_EXPIRATION_INTERVAL, 60.0 ); if( randomize && BUGGIFY ) TAG_THROTTLE_EXPIRATION_INTERVAL = 1.0;
init( WRITE_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096;
init( READ_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096;
init( PROXY_MAX_TAG_THROTTLE_DURATION, 5.0 ); if( randomize && BUGGIFY ) PROXY_MAX_TAG_THROTTLE_DURATION = 0.5;
// busyness reporting
init( BUSYNESS_SPIKE_START_THRESHOLD, 0.100 );

View File

@ -22,6 +22,16 @@
#include "fdbclient/Knobs.h"
#include "fdbclient/NativeAPI.actor.h"
KeyRangeRef toPrefixRelativeRange(KeyRangeRef range, KeyRef prefix) {
if (prefix.empty()) {
return range;
} else {
KeyRef begin = range.begin.startsWith(prefix) ? range.begin.removePrefix(prefix) : allKeys.begin;
KeyRef end = range.end.startsWith(prefix) ? range.end.removePrefix(prefix) : allKeys.end;
return KeyRangeRef(begin, end);
}
}
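Illustrative only (key values invented): a range expressed in absolute keys is re-based under the tenant prefix, and ends that fall outside the prefix clamp to allKeys:

    KeyRef prefix = "\x01tenant0"_sr;                     // hypothetical tenant prefix
    KeyRangeRef r("\x01tenant0aaa"_sr, "\x01tenant0zzz"_sr);
    KeyRangeRef rel = toPrefixRelativeRange(r, prefix);   // -> ["aaa", "zzz")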
KeyRef keyBetween(const KeyRangeRef& keys) {
int pos = 0; // will be the position of the first difference between keys.begin and keys.end
int minSize = std::min(keys.begin.size(), keys.end.size());

View File

@ -167,6 +167,7 @@ public:
KeyBackedProperty<Key> removePrefix() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<bool> onlyApplyMutationLogs() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<bool> inconsistentSnapshotOnly() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<bool> unlockDBAfterRestore() { return configSpace.pack(__FUNCTION__sr); }
// XXX: Remove restoreRange() once it is safe to remove. It has been changed to restoreRanges
KeyBackedProperty<KeyRange> restoreRange() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<std::vector<KeyRange>> restoreRanges() { return configSpace.pack(__FUNCTION__sr); }
@ -591,12 +592,11 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
}
ACTOR static Future<StringRef> decryptImpl(Database cx,
StringRef headerS,
BlobCipherEncryptHeader header,
const uint8_t* dataP,
int64_t dataLen,
Arena* arena) {
Reference<AsyncVar<ClientDBInfo> const> dbInfo = cx->clientInfo;
state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS);
TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, header, BlobCipherMetrics::BACKUP));
ASSERT(cipherKeys.cipherHeaderKey.isValid() && cipherKeys.cipherTextKey.isValid());
validateEncryptionHeader(cipherKeys.cipherHeaderKey, cipherKeys.cipherTextKey, header);
@ -606,7 +606,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
}
static Future<StringRef> decrypt(Database cx,
StringRef headerS,
BlobCipherEncryptHeader headerS,
const uint8_t* dataP,
int64_t dataLen,
Arena* arena) {
@ -651,7 +651,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
}
ACTOR static Future<Void> updateEncryptionKeysCtx(EncryptedRangeFileWriter* self, KeyRef key) {
state std::pair<int64_t, TenantName> curTenantInfo = wait(getEncryptionDomainDetails(key, self));
state std::pair<int64_t, TenantName> curTenantInfo = wait(getEncryptionDomainDetails(key, self->tenantCache));
state Reference<AsyncVar<ClientDBInfo> const> dbInfo = self->cx->clientInfo;
// Get text and header cipher key
@ -693,12 +693,13 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
static bool isSystemKey(KeyRef key) { return key.size() && key[0] == systemKeys.begin[0]; }
ACTOR static Future<std::pair<int64_t, TenantName>>
getEncryptionDomainDetailsImpl(KeyRef key, Reference<TenantEntryCache<Void>> tenantCache, bool useTenantCache) {
ACTOR static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetailsImpl(
KeyRef key,
Reference<TenantEntryCache<Void>> tenantCache) {
if (isSystemKey(key)) {
return std::make_pair(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
if (key.size() < TENANT_PREFIX_SIZE || !useTenantCache) {
if (key.size() < TENANT_PREFIX_SIZE) {
return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
}
KeyRef tenantPrefix = KeyRef(key.begin(), TENANT_PREFIX_SIZE);
@ -710,21 +711,10 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
}
static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetails(KeyRef key,
EncryptedRangeFileWriter* self) {
// If tenants are disabled on a cluster then don't use the TenantEntryCache as it will result in a lot of
// unnecessary cache misses. For a cluster configured in TenantMode::Optional, the backup performance may
// degrade if most of the mutations belong to an invalid tenant
TenantMode mode = self->cx->clientInfo->get().tenantMode;
bool useTenantCache = mode != TenantMode::DISABLED;
if (g_network->isSimulated() && mode == TenantMode::OPTIONAL_TENANT) {
// TODO: Currently simulation tests run with optional tenant mode but most data does not belong to any
// tenant. This results in many timeouts so disable using the tenant cache until optional tenant mode
// support with backups is more performant
useTenantCache = false;
}
CODE_PROBE(useTenantCache, "using tenant cache");
return getEncryptionDomainDetailsImpl(key, self->tenantCache, useTenantCache);
static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetails(
KeyRef key,
Reference<TenantEntryCache<Void>> tenantCache) {
return getEncryptionDomainDetailsImpl(key, tenantCache);
}
// Handles the first block and internal blocks. Ends current block if needed.
@ -816,6 +806,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
curKeyTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
endKey = StringRef(k.begin(), TENANT_PREFIX_SIZE);
}
state ValueRef newValue = StringRef();
self->lastKey = k;
self->lastValue = v;
@ -834,9 +825,9 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
if (self->lastKey.size() == 0 || k.size() == 0) {
return false;
}
state std::pair<int64_t, TenantName> curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self));
state std::pair<int64_t, TenantName> prevKeyTenantInfo = wait(getEncryptionDomainDetails(self->lastKey, self));
// crossing tenant boundaries so finish the current block using only the tenant prefix of the new key
state std::pair<int64_t, TenantName> curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self->tenantCache));
state std::pair<int64_t, TenantName> prevKeyTenantInfo =
wait(getEncryptionDomainDetails(self->lastKey, self->tenantCache));
if (curKeyTenantInfo.first != prevKeyTenantInfo.first) {
CODE_PROBE(true, "crossed tenant boundaries");
wait(handleTenantBondary(self, k, v, writeValue, curKeyTenantInfo));
@ -1040,11 +1031,18 @@ private:
Key lastValue;
};
void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>* results) {
ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
Standalone<VectorRef<KeyValueRef>>* results,
bool encryptedBlock,
Optional<Reference<TenantEntryCache<Void>>> tenantCache,
Optional<BlobCipherEncryptHeader> encryptHeader) {
// Read begin key, if this fails then block was invalid.
uint32_t kLen = reader->consumeNetworkUInt32();
const uint8_t* k = reader->consume(kLen);
state uint32_t kLen = reader->consumeNetworkUInt32();
state const uint8_t* k = reader->consume(kLen);
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
state KeyRef prevKey = KeyRef(k, kLen);
state bool done = false;
state Optional<std::pair<int64_t, TenantName>> prevTenantInfo;
// Read kv pairs and end key
while (1) {
@ -1052,6 +1050,35 @@ void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>*
kLen = reader->consumeNetworkUInt32();
k = reader->consume(kLen);
// make sure that all keys in a block belong to exactly one tenant,
// unless it's the last key, in which case it can be a truncated (different) tenant prefix
if (encryptedBlock && g_network && g_network->isSimulated()) {
ASSERT(tenantCache.present());
ASSERT(encryptHeader.present());
state KeyRef curKey = KeyRef(k, kLen);
if (!prevTenantInfo.present()) {
std::pair<int64_t, TenantName> tenantInfo =
wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(prevKey, tenantCache.get()));
prevTenantInfo = tenantInfo;
}
std::pair<int64_t, TenantName> curTenantInfo =
wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(curKey, tenantCache.get()));
if (!curKey.empty() && !prevKey.empty() && prevTenantInfo.get().first != curTenantInfo.first) {
ASSERT(!done);
if (curTenantInfo.first != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID &&
curTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(curKey.size() == TENANT_PREFIX_SIZE);
}
done = true;
}
// make sure that all keys (except possibly the last key) in a block are encrypted using the correct key
if (!prevKey.empty()) {
ASSERT(prevTenantInfo.get().first == encryptHeader.get().cipherTextDetails.encryptDomainId);
}
prevKey = curKey;
prevTenantInfo = curTenantInfo;
}
// If eof reached or first value len byte is 0xFF then a valid block end was reached.
if (reader->eof() || *reader->rptr == 0xFF) {
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
@ -1072,6 +1099,8 @@ void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>*
for (auto b : reader->remainder())
if (b != 0xFF)
throw restore_corrupted_data_padding();
return Void();
}
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
@ -1094,7 +1123,11 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
int32_t file_version = reader.consume<int32_t>();
if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
decodeKVPairs(&reader, &results);
wait(decodeKVPairs(&reader,
&results,
false,
Optional<Reference<TenantEntryCache<Void>>>(),
Optional<BlobCipherEncryptHeader>()));
} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
CODE_PROBE(true, "decoding encrypted block");
ASSERT(cx.present());
@ -1108,7 +1141,8 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// read encryption header
const uint8_t* headerStart = reader.consume(BlobCipherEncryptHeader::headerSize);
StringRef header = StringRef(headerStart, BlobCipherEncryptHeader::headerSize);
StringRef headerS = StringRef(headerStart, BlobCipherEncryptHeader::headerSize);
state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS);
const uint8_t* dataPayloadStart = headerStart + BlobCipherEncryptHeader::headerSize;
// calculate the total bytes read up to (and including) the header
int64_t bytesRead = sizeof(int32_t) + sizeof(uint32_t) + optionsLen + BlobCipherEncryptHeader::headerSize;
@ -1117,7 +1151,12 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
StringRef decryptedData =
wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena()));
reader = StringRefReader(decryptedData, restore_corrupted_data());
decodeKVPairs(&reader, &results);
state Optional<Reference<TenantEntryCache<Void>>> tenantCache;
if (g_network && g_simulator->isSimulated()) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx.get(), TenantEntryCacheRefreshMode::WATCH);
wait(tenantCache.get()->init());
}
wait(decodeKVPairs(&reader, &results, true, tenantCache, header));
} else {
throw restore_unsupported_file_version();
}
@ -3398,6 +3437,8 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase {
state RestoreConfig restore(task);
restore.stateEnum().set(tr, ERestoreState::COMPLETED);
state bool unlockDB = wait(restore.unlockDBAfterRestore().getD(tr, Snapshot::False, true));
tr->atomicOp(metadataVersionKey, metadataVersionRequiredValue, MutationRef::SetVersionstampedValue);
// Clear the file map now since it could be huge.
restore.fileSet().clear(tr);
@ -3413,7 +3454,9 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase {
restore.clearApplyMutationsKeys(tr);
wait(taskBucket->finish(tr, task));
wait(unlockDatabase(tr, restore.getUid()));
if (unlockDB) {
wait(unlockDatabase(tr, restore.getUid()));
}
return Void();
}
@ -5172,6 +5215,7 @@ public:
Key addPrefix,
Key removePrefix,
LockDB lockDB,
UnlockDB unlockDB,
OnlyApplyMutationLogs onlyApplyMutationLogs,
InconsistentSnapshotOnly inconsistentSnapshotOnly,
Version beginVersion,
@ -5245,6 +5289,7 @@ public:
restore.onlyApplyMutationLogs().set(tr, onlyApplyMutationLogs);
restore.inconsistentSnapshotOnly().set(tr, inconsistentSnapshotOnly);
restore.beginVersion().set(tr, beginVersion);
restore.unlockDBAfterRestore().set(tr, unlockDB);
if (BUGGIFY && restoreRanges.size() == 1) {
restore.restoreRange().set(tr, restoreRanges[0]);
} else {
@ -5836,6 +5881,7 @@ public:
Key addPrefix,
Key removePrefix,
LockDB lockDB,
UnlockDB unlockDB,
OnlyApplyMutationLogs onlyApplyMutationLogs,
InconsistentSnapshotOnly inconsistentSnapshotOnly,
Version beginVersion,
@ -5892,6 +5938,7 @@ public:
addPrefix,
removePrefix,
lockDB,
unlockDB,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,
@ -6017,7 +6064,7 @@ public:
}
}
Reference<IBackupContainer> bc = wait(backupConfig.backupContainer().getOrThrow(cx.getReference()));
state Reference<IBackupContainer> bc = wait(backupConfig.backupContainer().getOrThrow(cx.getReference()));
if (fastRestore) {
TraceEvent("AtomicParallelRestoreStartRestore").log();
@ -6043,24 +6090,80 @@ public:
return -1;
} else {
TraceEvent("AS_StartRestore").log();
Version ver = wait(restore(backupAgent,
cx,
cx,
tagName,
KeyRef(bc->getURL()),
bc->getProxy(),
ranges,
WaitForComplete::True,
::invalidVersion,
Verbose::True,
addPrefix,
removePrefix,
LockDB::True,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
{},
randomUid));
state Standalone<VectorRef<KeyRangeRef>> restoreRange;
state Standalone<VectorRef<KeyRangeRef>> systemRestoreRange;
bool encryptionEnabled = cx->clientInfo->get().isEncryptionEnabled;
for (auto r : ranges) {
if (!encryptionEnabled || !r.intersects(getSystemBackupRanges())) {
restoreRange.push_back_deep(restoreRange.arena(), r);
} else {
KeyRangeRef normalKeyRange = r & normalKeys;
KeyRangeRef systemKeyRange = r & systemKeys;
if (!normalKeyRange.empty()) {
restoreRange.push_back_deep(restoreRange.arena(), normalKeyRange);
}
if (!systemKeyRange.empty()) {
systemRestoreRange.push_back_deep(systemRestoreRange.arena(), systemKeyRange);
}
}
}
if (!systemRestoreRange.empty()) {
// restore system keys
wait(success(restore(backupAgent,
cx,
cx,
"system_restore"_sr,
KeyRef(bc->getURL()),
bc->getProxy(),
systemRestoreRange,
WaitForComplete::True,
::invalidVersion,
Verbose::True,
addPrefix,
removePrefix,
LockDB::True,
UnlockDB::False,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
{},
randomUid)));
state Reference<ReadYourWritesTransaction> rywTransaction =
Reference<ReadYourWritesTransaction>(new ReadYourWritesTransaction(cx));
// clear old restore config associated with system keys
loop {
try {
rywTransaction->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
rywTransaction->setOption(FDBTransactionOptions::LOCK_AWARE);
state RestoreConfig oldRestore(randomUid);
oldRestore.clear(rywTransaction);
wait(rywTransaction->commit());
break;
} catch (Error& e) {
wait(rywTransaction->onError(e));
}
}
}
// restore user data
state Version ver = wait(restore(backupAgent,
cx,
cx,
tagName,
KeyRef(bc->getURL()),
bc->getProxy(),
restoreRange,
WaitForComplete::True,
::invalidVersion,
Verbose::True,
addPrefix,
removePrefix,
LockDB::True,
UnlockDB::True,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
{},
randomUid));
return ver;
}
}
@ -6120,6 +6223,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
Key addPrefix,
Key removePrefix,
LockDB lockDB,
UnlockDB unlockDB,
OnlyApplyMutationLogs onlyApplyMutationLogs,
InconsistentSnapshotOnly inconsistentSnapshotOnly,
Version beginVersion,
@ -6137,6 +6241,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
addPrefix,
removePrefix,
lockDB,
unlockDB,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,
@ -6178,6 +6283,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
addPrefix,
removePrefix,
lockDB,
UnlockDB::True,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,

View File

@ -2559,19 +2559,19 @@ bool schemaMatch(json_spirit::mValue const& schemaValue,
}
}
void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota) {
void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota) {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
auto key = storageQuotaKey(tenantName);
tr.set(key, BinaryWriter::toValue<uint64_t>(quota, Unversioned()));
tr.set(key, BinaryWriter::toValue<int64_t>(quota, Unversioned()));
}
ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) {
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantName)));
if (!v.present()) {
return Optional<uint64_t>();
return Optional<int64_t>();
}
return BinaryReader::fromStringRef<uint64_t>(v.get(), Unversioned());
return BinaryReader::fromStringRef<int64_t>(v.get(), Unversioned());
}
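A minimal usage sketch of the now-signed quota API (tenant name and quota value invented; the read must run inside an ACTOR since getStorageQuota returns a Future):

    Transaction tr(db);                               // db: an open Database (assumed)
    setStorageQuota(tr, "tenant0"_sr, 1'000'000'000); // 1 GB, stored as int64_t
    Optional<int64_t> quota = wait(getStorageQuota(&tr, "tenant0"_sr));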
std::string ManagementAPI::generateErrorMessage(const CoordinatorsResult& res) {

View File

@ -1479,16 +1479,6 @@ Future<RangeResult> HealthMetricsRangeImpl::getRange(ReadYourWritesTransaction*
return healthMetricsGetRangeActor(ryw, kr);
}
KeyRangeRef toRelativeRange(KeyRangeRef range, KeyRef prefix) {
if (prefix.empty()) {
return range;
} else {
KeyRef begin = range.begin.startsWith(prefix) ? range.begin.removePrefix(prefix) : allKeys.begin;
KeyRef end = range.end.startsWith(prefix) ? range.end.removePrefix(prefix) : allKeys.end;
return KeyRangeRef(begin, end);
}
}
ACTOR Future<UID> getClusterId(Database db) {
while (!db->clientInfo->get().clusterId.isValid()) {
wait(db->clientInfo->onChange());
@ -1925,7 +1915,8 @@ Optional<KeyRangeLocationInfo> DatabaseContext::getCachedLocation(const Optional
auto range =
isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey);
if (range->value()) {
return KeyRangeLocationInfo(tenantEntry, toRelativeRange(range->range(), tenantEntry.prefix), range->value());
return KeyRangeLocationInfo(
tenantEntry, toPrefixRelativeRange(range->range(), tenantEntry.prefix), range->value());
}
return Optional<KeyRangeLocationInfo>();
@ -1962,7 +1953,8 @@ bool DatabaseContext::getCachedLocations(const Optional<TenantNameRef>& tenantNa
result.clear();
return false;
}
result.emplace_back(tenantEntry, toRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value());
result.emplace_back(
tenantEntry, toPrefixRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value());
if (result.size() == limit || begin == end) {
break;
}
@ -2978,7 +2970,7 @@ ACTOR Future<KeyRangeLocationInfo> getKeyLocation_internal(Database cx,
return KeyRangeLocationInfo(
rep.tenantEntry,
KeyRange(toRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena),
KeyRange(toPrefixRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena),
locationInfo);
}
}
@ -3123,7 +3115,7 @@ ACTOR Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations_internal(
// efficient to save the map pairs and insert them all at once.
results.emplace_back(
rep.tenantEntry,
(toRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys),
(toPrefixRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys),
cx->setCachedLocation(
tenant.name, rep.tenantEntry, rep.results[shard].first, rep.results[shard].second));
wait(yield());
@ -6558,7 +6550,7 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
e.code() != error_code_grv_proxy_memory_limit_exceeded &&
e.code() != error_code_batch_transaction_throttled && e.code() != error_code_tag_throttled &&
e.code() != error_code_process_behind && e.code() != error_code_future_version &&
e.code() != error_code_tenant_not_found) {
e.code() != error_code_tenant_not_found && e.code() != error_code_proxy_tag_throttled) {
TraceEvent(SevError, "TryCommitError").error(e);
}
if (trState->trLogInfo)
@ -6999,6 +6991,8 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanContext parentSpa
&GrvProxyInterface::getConsistentReadVersion,
req,
cx->taskID))) {
CODE_PROBE(v.proxyTagThrottledDuration > 0.0,
"getConsistentReadVersion received GetReadVersionReply delayed by proxy tag throttling");
if (tags.size() != 0) {
auto& priorityThrottledTags = cx->throttledTags[priority];
for (auto& tag : tags) {
@ -7033,7 +7027,7 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanContext parentSpa
}
} catch (Error& e) {
if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled &&
e.code() != error_code_grv_proxy_memory_limit_exceeded)
e.code() != error_code_grv_proxy_memory_limit_exceeded && e.code() != error_code_proxy_tag_throttled)
TraceEvent(SevError, "GetConsistentReadVersionError").error(e);
if (e.code() == error_code_batch_transaction_throttled && !cx->apiVersionAtLeast(630)) {
wait(delayJittered(5.0));
@ -7484,7 +7478,7 @@ Future<Void> Transaction::onError(Error const& e) {
e.code() == error_code_database_locked || e.code() == error_code_commit_proxy_memory_limit_exceeded ||
e.code() == error_code_grv_proxy_memory_limit_exceeded || e.code() == error_code_process_behind ||
e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled ||
e.code() == error_code_blob_granule_request_failed) {
e.code() == error_code_blob_granule_request_failed || e.code() == error_code_proxy_tag_throttled) {
if (e.code() == error_code_not_committed)
++trState->cx->transactionsNotCommitted;
else if (e.code() == error_code_commit_unknown_result)
@ -7724,6 +7718,35 @@ ACTOR Future<Standalone<VectorRef<ReadHotRangeWithMetrics>>> getReadHotRanges(Da
}
}
ACTOR Future<Optional<StorageMetrics>> waitStorageMetricsWithLocation(TenantInfo tenantInfo,
KeyRange keys,
std::vector<KeyRangeLocationInfo> locations,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError) {
try {
Future<StorageMetrics> fx;
if (locations.size() > 1) {
fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
} else {
WaitMetricsRequest req(tenantInfo, keys, min, max);
fx = loadBalance(locations[0].locations->locations(),
&StorageServerInterface::waitMetrics,
req,
TaskPriority::DataDistribution);
}
StorageMetrics x = wait(fx);
return x;
} catch (Error& e) {
TraceEvent(SevDebug, "WaitStorageMetricsError").error(e);
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
}
return Optional<StorageMetrics>();
}
ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
Database cx,
KeyRange keys,
@ -7753,38 +7776,26 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
}
// SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better
// solution to this.
if (locations.size() < shardLimit) {
try {
Future<StorageMetrics> fx;
if (locations.size() > 1) {
fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
} else {
WaitMetricsRequest req(tenantInfo, keys, min, max);
fx = loadBalance(locations[0].locations->locations(),
&StorageServerInterface::waitMetrics,
req,
TaskPriority::DataDistribution);
}
StorageMetrics x = wait(fx);
return std::make_pair(x, -1);
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
} else {
// solution to this. How could this happen?
if (locations.size() >= shardLimit) {
TraceEvent(SevWarn, "WaitStorageMetricsPenalty")
.detail("Keys", keys)
.detail("Limit", CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT)
.detail("Limit", shardLimit)
.detail("LocationSize", locations.size())
.detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY);
wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
// make sure that the next getKeyRangeLocations() call will actually re-fetch the range
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
continue;
}
Optional<StorageMetrics> res =
wait(waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
if (res.present()) {
return std::make_pair(res, -1);
}
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
}
@ -8645,6 +8656,56 @@ Future<Void> DatabaseContext::splitStorageMetricsStream(const PromiseStream<Key>
resultStream, Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated, minSplitBytes);
}
ACTOR Future<Optional<Standalone<VectorRef<KeyRef>>>> splitStorageMetricsWithLocations(
std::vector<KeyRangeLocationInfo> locations,
KeyRange keys,
StorageMetrics limit,
StorageMetrics estimated,
Optional<int> minSplitBytes) {
state StorageMetrics used;
state Standalone<VectorRef<KeyRef>> results;
results.push_back_deep(results.arena(), keys.begin);
//TraceEvent("SplitStorageMetrics").detail("Locations", locations.size());
try {
state int i = 0;
for (; i < locations.size(); i++) {
SplitMetricsRequest req(
locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
&StorageServerInterface::splitMetrics,
req,
TaskPriority::DataDistribution));
if (res.splits.size() && res.splits[0] <= results.back()) { // split points are out of order, possibly
// because of moving data, throw error to retry
ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing
throw all_alternatives_failed();
}
if (res.splits.size()) {
results.append(results.arena(), res.splits.begin(), res.splits.size());
results.arena().dependsOn(res.splits.arena());
}
used = res.used;
//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
}
if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) && results.size() > 1) {
results.resize(results.arena(), results.size() - 1);
}
if (keys.end <= locations.back().range.end) {
results.push_back_deep(results.arena(), keys.end);
}
return results;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "SplitStorageMetricsError").error(e);
throw;
}
}
return Optional<Standalone<VectorRef<KeyRef>>>();
}
ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
KeyRange keys,
StorageMetrics limit,
@ -8663,61 +8724,24 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
Optional<UID>(),
UseProvisionalProxies::False,
latestVersion));
state StorageMetrics used;
state Standalone<VectorRef<KeyRef>> results;
// SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better
// solution to this.
if (locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) {
wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
cx->invalidateCache(Key(), keys);
} else {
results.push_back_deep(results.arena(), keys.begin);
try {
//TraceEvent("SplitStorageMetrics").detail("Locations", locations.size());
state int i = 0;
for (; i < locations.size(); i++) {
SplitMetricsRequest req(
locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
&StorageServerInterface::splitMetrics,
req,
TaskPriority::DataDistribution));
if (res.splits.size() &&
res.splits[0] <= results.back()) { // split points are out of order, possibly because of
// moving data, throw error to retry
ASSERT_WE_THINK(
false); // FIXME: This seems impossible and doesn't seem to be covered by testing
throw all_alternatives_failed();
}
if (res.splits.size()) {
results.append(results.arena(), res.splits.begin(), res.splits.size());
results.arena().dependsOn(res.splits.arena());
}
used = res.used;
//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
}
if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) &&
results.size() > 1) {
results.resize(results.arena(), results.size() - 1);
}
if (keys.end <= locations.back().range.end) {
results.push_back_deep(results.arena(), keys.end);
}
return results;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "SplitStorageMetricsError").error(e);
throw;
}
cx->invalidateCache(Key(), keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
continue;
}
Optional<Standalone<VectorRef<KeyRef>>> results =
wait(splitStorageMetricsWithLocations(locations, keys, limit, estimated, minSplitBytes));
if (results.present()) {
return results.get();
}
cx->invalidateCache(Key(), keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
}
@ -10540,6 +10564,76 @@ Reference<DatabaseContext::TransactionT> DatabaseContext::createTransaction() {
}
// BlobGranule API.
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Transaction* tr, KeyRange range, int batchLimit) {
state Standalone<VectorRef<KeyRangeRef>> blobRanges;
state Key beginKey = range.begin;
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state RangeResult results = wait(
krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));
blobRanges.arena().dependsOn(results.arena());
for (int i = 0; i < results.size() - 1; i++) {
if (results[i].value == blobRangeActive) {
blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
}
if (blobRanges.size() == batchLimit) {
return blobRanges;
}
}
if (!results.more) {
return blobRanges;
}
beginKey = results.back().key;
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobbifiedRanges(Transaction* tr,
KeyRange range,
int rangeLimit,
Optional<TenantName> tenantName) {
state TenantMapEntry tme;
loop {
try {
if (tenantName.present()) {
wait(store(tme, blobGranuleGetTenantEntry(tr, range.begin, tenantName)));
range = range.withPrefix(tme.prefix);
}
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
state Standalone<VectorRef<KeyRangeRef>> blobRanges = wait(getBlobRanges(tr, range, rangeLimit));
if (!tenantName.present()) {
return blobRanges;
}
// Strip tenant prefix out.
state Standalone<VectorRef<KeyRangeRef>> tenantBlobRanges;
for (auto& blobRange : blobRanges) {
// Filter out blob ranges that unexpectedly span tenants.
if (!blobRange.begin.startsWith(tme.prefix) || !blobRange.end.startsWith(tme.prefix)) {
TraceEvent("ListBlobbifiedRangeSpansTenants")
.suppressFor(/*seconds=*/5)
.detail("Tenant", tenantName.get())
.detail("Range", blobRange);
continue;
}
tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tme.prefix));
}
return tenantBlobRanges;
}
ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
KeyRange range,
Version purgeVersion,
@ -10582,10 +10676,13 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
}
// must be aligned to blob range(s)
state Future<Optional<Value>> beginPresent = tr.get(purgeRange.begin.withPrefix(blobRangeKeys.begin));
state Future<Optional<Value>> endPresent = tr.get(purgeRange.end.withPrefix(blobRangeKeys.begin));
wait(success(beginPresent) && success(endPresent));
if (!beginPresent.get().present() || !endPresent.get().present()) {
state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedBegin =
getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.begin, purgeRange.begin), 2, {});
state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedEnd =
getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.end, purgeRange.end), 2, {});
wait(success(blobbifiedBegin) && success(blobbifiedEnd));
if ((!blobbifiedBegin.get().empty() && blobbifiedBegin.get().front().begin < purgeRange.begin) ||
(!blobbifiedEnd.get().empty() && blobbifiedEnd.get().back().end > purgeRange.end)) {
TraceEvent("UnalignedPurge")
.detail("Range", range)
.detail("Version", purgeVersion)
@ -10662,39 +10759,6 @@ Future<Void> DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) {
return waitPurgeGranulesCompleteActor(Reference<DatabaseContext>::addRef(this), purgeKey);
}
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Reference<ReadYourWritesTransaction> tr,
KeyRange range,
int batchLimit) {
state Standalone<VectorRef<KeyRangeRef>> blobRanges;
state Key beginKey = range.begin;
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state RangeResult results = wait(
krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));
blobRanges.arena().dependsOn(results.arena());
for (int i = 0; i < results.size() - 1; i++) {
if (results[i].value == blobRangeActive) {
blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
}
if (blobRanges.size() == batchLimit) {
return blobRanges;
}
}
if (!results.more) {
return blobRanges;
}
beginKey = results.back().key;
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
KeyRange range,
bool active,
@ -10716,7 +10780,7 @@ ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
range = range.withPrefix(tenantEntry.prefix);
}
Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(tr, range, 1));
Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(&tr->getTransaction(), range, 1));
if (active) {
// Idempotent request.
@ -10764,47 +10828,19 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRangesActor(Refer
KeyRange range,
int rangeLimit,
Optional<TenantName> tenantName) {
state Database db(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
state TenantMapEntry tme;
state Transaction tr(db);
loop {
try {
if (tenantName.present()) {
wait(store(tme, blobGranuleGetTenantEntry(&tr->getTransaction(), range.begin, tenantName)));
range = range.withPrefix(tme.prefix);
}
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
Standalone<VectorRef<KeyRangeRef>> blobbifiedRanges = wait(getBlobbifiedRanges(&tr, range, rangeLimit, tenantName));
state Standalone<VectorRef<KeyRangeRef>> blobRanges = wait(getBlobRanges(tr, range, rangeLimit));
if (!tenantName.present()) {
return blobRanges;
}
// Strip tenant prefix out.
state Standalone<VectorRef<KeyRangeRef>> tenantBlobRanges;
for (auto& blobRange : blobRanges) {
// Filter out blob ranges that span tenants for some reason.
if (!blobRange.begin.startsWith(tme.prefix) || !blobRange.end.startsWith(tme.prefix)) {
TraceEvent("ListBlobbifiedRangeSpansTenants")
.suppressFor(/*seconds=*/5)
.detail("Tenant", tenantName.get())
.detail("Range", blobRange);
continue;
}
tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tme.prefix));
}
return tenantBlobRanges;
return blobbifiedRanges;
}
Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges(KeyRange range,
int rowLimit,
int rangeLimit,
Optional<TenantName> tenantName) {
return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rowLimit, tenantName);
return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rangeLimit, tenantName);
}
int64_t getMaxKeySize(KeyRef const& key) {

View File

@ -297,7 +297,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120;
init( DD_TENANT_AWARENESS_ENABLED, false );
init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
// TeamRemover
init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
@ -726,8 +727,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL, 30.0 ); if(randomize && BUGGIFY) TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL = 1.0;
init( AUTO_TAG_THROTTLING_ENABLED, true ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLING_ENABLED = false;
init( SS_THROTTLE_TAGS_TRACKED, 1 ); if(randomize && BUGGIFY) SS_THROTTLE_TAGS_TRACKED = deterministicRandom()->randomInt(1, 10);
init( GLOBAL_TAG_THROTTLING, false );
init( ENFORCE_TAG_THROTTLING_ON_PROXIES, false );
init( GLOBAL_TAG_THROTTLING, false ); if(isSimulated) GLOBAL_TAG_THROTTLING = deterministicRandom()->coinflip();
init( ENFORCE_TAG_THROTTLING_ON_PROXIES, GLOBAL_TAG_THROTTLING );
init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 );
init( GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 );
init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO, 5.0 );
@ -966,6 +967,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BG_CONSISTENCY_CHECK_ENABLED, true ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_ENABLED = false;
init( BG_CONSISTENCY_CHECK_TARGET_SPEED_KB, 1000 ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_TARGET_SPEED_KB *= (deterministicRandom()->randomInt(2, 50) / 10);
init( BG_KEY_TUPLE_TRUNCATE_OFFSET, 0 );
init( BG_ENABLE_READ_DRIVEN_COMPACTION, true ); if (randomize && BUGGIFY) BG_ENABLE_READ_DRIVEN_COMPACTION = false;
init( BG_RDC_BYTES_FACTOR, 2 ); if (randomize && BUGGIFY) BG_RDC_BYTES_FACTOR = deterministicRandom()->randomInt(1, 10);
init( BG_RDC_READ_FACTOR, 3 ); if (randomize && BUGGIFY) BG_RDC_READ_FACTOR = deterministicRandom()->randomInt(1, 10);
init( BG_ENABLE_MERGING, true ); if (randomize && BUGGIFY) BG_ENABLE_MERGING = false;
init( BG_MERGE_CANDIDATE_THRESHOLD_SECONDS, isSimulated ? 20.0 : 30 * 60 ); if (randomize && BUGGIFY) BG_MERGE_CANDIDATE_THRESHOLD_SECONDS = 5.0;
@ -974,6 +978,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1;
init( BLOB_WORKER_RESNAPSHOT_PARALLELISM, 40 ); if( randomize && BUGGIFY ) BLOB_WORKER_RESNAPSHOT_PARALLELISM = deterministicRandom()->randomInt(1, 10);
init( BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM, 2000 ); if( randomize && BUGGIFY ) BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM = deterministicRandom()->randomInt(10, 100);
init( BLOB_WORKER_RDC_PARALLELISM, 2 ); if( randomize && BUGGIFY ) BLOB_WORKER_RDC_PARALLELISM = deterministicRandom()->randomInt(1, 6);
init( BLOB_WORKER_TIMEOUT, 10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0;
init( BLOB_WORKER_REQUEST_TIMEOUT, 5.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_REQUEST_TIMEOUT = 1.0;
init( BLOB_WORKERLIST_FETCH_INTERVAL, 1.0 );

View File

@ -579,8 +579,8 @@ public:
int maxConcurrentTasks) {
state Reference<AsyncVar<bool>> paused = makeReference<AsyncVar<bool>>(true);
state Future<Void> watchPausedFuture = watchPaused(cx, taskBucket, paused);
taskBucket->metricLogger = traceCounters(
"TaskBucketMetrics", taskBucket->dbgid, CLIENT_KNOBS->TASKBUCKET_LOGGING_DELAY, &taskBucket->cc);
taskBucket->metricLogger = taskBucket->cc.traceCounters(
"TaskBucketMetrics", taskBucket->dbgid, CLIENT_KNOBS->TASKBUCKET_LOGGING_DELAY);
loop {
while (paused->get()) {
wait(paused->onChange() || watchPausedFuture);

View File

@ -196,6 +196,7 @@ public:
Key addPrefix = Key(),
Key removePrefix = Key(),
LockDB = LockDB::True,
UnlockDB = UnlockDB::True,
OnlyApplyMutationLogs = OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly = InconsistentSnapshotOnly::False,
Version beginVersion = ::invalidVersion,

View File

@ -45,6 +45,7 @@ struct BlobWorkerStats {
Counter compressionBytesFinal;
Counter fullRejections;
Counter forceFlushCleanups;
Counter readDrivenCompactions;
int numRangesAssigned;
int mutationBytesBuffered;
@ -83,10 +84,11 @@ struct BlobWorkerStats {
readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc),
flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc),
compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc),
forceFlushCleanups("ForceFlushCleanups", cc), numRangesAssigned(0), mutationBytesBuffered(0),
activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), cfVersionLag(0),
notAtLatestChangeFeeds(0), lastResidentMemory(0), estimatedMaxResidentMemory(0),
initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), deltaWritesLock(deltaWritesLock) {
forceFlushCleanups("ForceFlushCleanups", cc), readDrivenCompactions("ReadDrivenCompactions", cc),
numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0),
minimumCFVersion(0), cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0),
estimatedMaxResidentMemory(0), initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock),
deltaWritesLock(deltaWritesLock) {
specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; });
specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; });
specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; });
@ -103,8 +105,8 @@ struct BlobWorkerStats {
specialCounter(cc, "DeltaFileWritesActive", [this]() { return this->deltaWritesLock->activePermits(); });
specialCounter(cc, "DeltaFileWritesWaiting", [this]() { return this->deltaWritesLock->waiters(); });
logger = traceCounters("BlobWorkerMetrics", id, interval, &cc, "BlobWorkerMetrics");
logger = cc.traceCounters("BlobWorkerMetrics", id, interval, "BlobWorkerMetrics");
}
};
#endif
#endif

View File

@ -262,6 +262,8 @@ public:
double TAG_THROTTLE_EXPIRATION_INTERVAL;
int64_t WRITE_COST_BYTE_FACTOR; // Used to round up the cost of write operations
int64_t READ_COST_BYTE_FACTOR; // Used to round up the cost of read operations
double PROXY_MAX_TAG_THROTTLE_DURATION; // Maximum duration that a transaction can be tag throttled by proxy before
// being rejected
// busyness reporting
double BUSYNESS_SPIKE_START_THRESHOLD;

View File

@ -336,12 +336,13 @@ struct KeyRangeRef {
bool isCovered(std::vector<KeyRangeRef>& ranges) {
ASSERT(std::is_sorted(ranges.begin(), ranges.end(), KeyRangeRef::ArbitraryOrder()));
KeyRangeRef clone(begin, end);
for (auto r : ranges) {
if (begin < r.begin)
if (clone.begin < r.begin)
return false; // uncovered gap between clone.begin and r.begin
if (end <= r.end)
if (clone.end <= r.end)
return true; // range is fully covered
if (end > r.begin)
if (clone.end > r.begin)
// {clone.begin, r.end} is covered. need to check coverage for {r.end, clone.end}
clone = KeyRangeRef(r.end, clone.end);
}
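// A standalone sketch of the corrected coverage walk (plain std::string ranges,
// hypothetical names): advance the uncovered remainder past each sorted range
// until a gap appears or the remainder is exhausted.
#include <string>
#include <utility>
#include <vector>

using StrRange = std::pair<std::string, std::string>; // [begin, end)

bool isCoveredBy(StrRange remainder, const std::vector<StrRange>& sortedRanges) {
	for (const auto& r : sortedRanges) {
		if (remainder.first < r.first)
			return false; // uncovered gap between remainder.first and r.first
		if (remainder.second <= r.second)
			return true; // remainder fully covered
		if (remainder.second > r.first)
			remainder.first = r.second; // [remainder.first, r.second) covered; check the rest
	}
	return false; // ranges exhausted with part of the remainder uncovered
}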
@ -589,6 +590,8 @@ inline KeyRange prefixRange(KeyRef prefix) {
// The returned reference is valid as long as keys is valid.
KeyRef keyBetween(const KeyRangeRef& keys);
KeyRangeRef toPrefixRelativeRange(KeyRangeRef range, KeyRef prefix);
struct KeySelectorRef {
private:
KeyRef key; // Find the last item less than key

View File

@ -0,0 +1,48 @@
/*
* KeyLocationService.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FOUNDATIONDB_KEYLOCATIONSERVICE_H
#define FOUNDATIONDB_KEYLOCATIONSERVICE_H
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/DatabaseContext.h"
class IKeyLocationService {
// If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key).
// Otherwise returns the shard containing key. It's possible the returned location is a failed interface.
virtual Future<KeyRangeLocationInfo> getKeyLocation(TenantInfo tenant,
Key key,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Reverse isBackward,
Version version) = 0;
virtual Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations(TenantInfo tenant,
KeyRange keys,
int limit,
Reverse reverse,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Version version) = 0;
};
#endif // FOUNDATIONDB_KEYLOCATIONSERVICE_H

View File

@ -164,8 +164,8 @@ bool schemaMatch(json_spirit::mValue const& schema,
ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);
// Set and get the storage quota per tenant
void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota);
ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName);
void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota);
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName);
#include "flow/unactorcompiler.h"
#endif

View File

@ -591,6 +591,26 @@ int64_t getMaxWriteKeySize(KeyRef const& key, bool hasRawAccess);
// Returns the maximum legal size of a key that can be cleared. Keys larger than this will be assumed not to exist.
int64_t getMaxClearKeySize(KeyRef const& key);
struct KeyRangeLocationInfo;
// Return the aggregated StorageMetrics of range keys to the caller. The locations tell which interface should
// serve the request. The final result is within (min - permittedError/2, max + permittedError/2) if valid.
ACTOR Future<Optional<StorageMetrics>> waitStorageMetricsWithLocation(TenantInfo tenantInfo,
KeyRange keys,
std::vector<KeyRangeLocationInfo> locations,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError);
// Return the suggested split points from the storage server. The locations tell which interface should
// serve the request. `limit` is the target metrics per split, and `estimated` is the current estimated
// storage metrics of `keys`. The returned points, if present, guarantee the metrics of each split are within limit.
ACTOR Future<Optional<Standalone<VectorRef<KeyRef>>>> splitStorageMetricsWithLocations(
std::vector<KeyRangeLocationInfo> locations,
KeyRange keys,
StorageMetrics limit,
StorageMetrics estimated,
Optional<int> minSplitBytes);
namespace NativeAPI {
ACTOR Future<std::vector<std::pair<StorageServerInterface, ProcessClass>>> getServerListAndProcessClasses(
Transaction* tr);

View File

@ -237,8 +237,10 @@ public:
DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled
bool DD_TENANT_AWARENESS_ENABLED;
int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed
int TENANT_CACHE_STORAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant in the TenantCache is
// refreshed
int TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant is refreshed
// in the TenantCache
int TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL; // How often the storage quota allocated to each tenant is
// refreshed in the TenantCache
// TeamRemover to remove redundant teams
bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor
@ -948,10 +950,14 @@ public:
int BG_MERGE_CANDIDATE_THRESHOLD_SECONDS;
int BG_MERGE_CANDIDATE_DELAY_SECONDS;
int BG_KEY_TUPLE_TRUNCATE_OFFSET;
bool BG_ENABLE_READ_DRIVEN_COMPACTION;
int BG_RDC_BYTES_FACTOR;
int BG_RDC_READ_FACTOR;
int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM;
int BLOB_WORKER_RESNAPSHOT_PARALLELISM;
int BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM;
int BLOB_WORKER_RDC_PARALLELISM;
double BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure
double BLOB_WORKER_REQUEST_TIMEOUT; // Blob Worker's server-side request timeout

View File

@ -68,6 +68,10 @@ using TenantEntryCachePayloadFunc = std::function<TenantEntryCachePayload<T>(con
// 1. Lookup by 'TenantId'
// 2. Lookup by 'TenantPrefix'
// 3. Lookup by 'TenantName'
// TODO: Currently this cache performs poorly when there are accesses to unknown tenants, which happens most
// frequently in optional tenant mode but can also happen in required mode if a lot of tenants are created. Further,
// as a consequence of the design, we cannot be sure that the state of a given tenant is accurate even if it is
// present in the cache.
template <class T>
class TenantEntryCache : public ReferenceCounted<TenantEntryCache<T>>, NonCopyable {

View File

@ -273,17 +273,4 @@ struct ITracer {
virtual void trace(Span const& span) = 0;
};
void openTracer(TracerType type);
template <class T>
struct SpannedDeque : Deque<T> {
Span span;
explicit SpannedDeque(Location loc) : span(loc) {}
SpannedDeque(SpannedDeque&& other) : Deque<T>(std::move(other)), span(std::move(other.span)) {}
SpannedDeque(SpannedDeque const&) = delete;
SpannedDeque& operator=(SpannedDeque const&) = delete;
SpannedDeque& operator=(SpannedDeque&& other) {
*static_cast<Deque<T>*>(this) = std::move(other);
span = std::move(other.span);
}
};
void openTracer(TracerType type);

View File

@ -5,9 +5,8 @@ get_target_property(fdbclient_target_includes fdbclient INCLUDE_DIRECTORIES)
target_link_libraries(fdbmonitor PUBLIC SimpleOpt)
target_include_directories(fdbmonitor PUBLIC "${fdbclient_target_includes}")
strip_debug_symbols(fdbmonitor)
assert_no_version_h(fdbmonitor)
if(UNIX AND NOT APPLE)
target_link_libraries(fdbmonitor PRIVATE rt)
target_link_libraries(fdbmonitor PRIVATE rt)
endif()
# FIXME: This include directory is an ugly hack. We probably want to fix this.
# as soon as we get rid of the old build system
@ -17,17 +16,17 @@ target_link_libraries(fdbmonitor PUBLIC Threads::Threads)
# appears to change its behavior (it no longer seems to restart killed
# processes). fdbmonitor is single-threaded anyway.
get_target_property(fdbmonitor_options fdbmonitor COMPILE_OPTIONS)
if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread")
set_property(TARGET fdbmonitor PROPERTY COMPILE_OPTIONS ${fdbmonitor_options})
endif ()
endif()
get_target_property(fdbmonitor_options fdbmonitor LINK_OPTIONS)
if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread")
set_property(TARGET fdbmonitor PROPERTY LINK_OPTIONS ${fdbmonitor_options})
endif ()
endif()
if(GENERATE_DEBUG_PACKAGES)
fdb_install(TARGETS fdbmonitor DESTINATION fdbmonitor COMPONENT server)
@ -51,7 +50,7 @@ add_custom_target(clean_sandbox
add_custom_target(start_sandbox
COMMAND ${CMAKE_BINARY_DIR}/bin/fdbmonitor --conffile ${CMAKE_BINARY_DIR}/sandbox/foundationdb.conf
--lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)
--lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)
add_dependencies(start_sandbox fdbmonitor fdbserver)
@ -61,6 +60,6 @@ if(NOT EXISTS ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh)
endif()
add_custom_target(generate_profile
COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})
COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})
add_dependencies(generate_profile fdbmonitor fdbserver mako fdbcli)

View File

@ -24,8 +24,8 @@
Counter::Counter(std::string const& name, CounterCollection& collection)
: name(name), interval_start(0), last_event(0), interval_sq_time(0), roughness_interval_start(0), interval_delta(0),
interval_start_value(0) {
metric.init(collection.name + "." + (char)toupper(name.at(0)) + name.substr(1), collection.id);
collection.counters.push_back(this);
metric.init(collection.getName() + "." + (char)toupper(name.at(0)) + name.substr(1), collection.getId());
collection.addCounter(this);
}
void Counter::operator+=(Value delta) {
@ -88,36 +88,48 @@ void CounterCollection::logToTraceEvent(TraceEvent& te) const {
}
}
ACTOR Future<Void> traceCounters(std::string traceEventName,
UID traceEventID,
double interval,
CounterCollection* counters,
std::string trackLatestName,
std::function<void(TraceEvent&)> decorator) {
wait(delay(0)); // Give an opportunity for all members used in special counters to be initialized
class CounterCollectionImpl {
public:
ACTOR static Future<Void> traceCounters(CounterCollection* counters,
std::string traceEventName,
UID traceEventID,
double interval,
std::string trackLatestName,
std::function<void(TraceEvent&)> decorator) {
wait(delay(0)); // Give an opportunity for all members used in special counters to be initialized
for (ICounter* c : counters->counters)
c->resetInterval();
state Reference<EventCacheHolder> traceEventHolder;
if (!trackLatestName.empty()) {
traceEventHolder = makeReference<EventCacheHolder>(trackLatestName);
}
state double last_interval = now();
loop {
TraceEvent te(traceEventName.c_str(), traceEventID);
te.detail("Elapsed", now() - last_interval);
counters->logToTraceEvent(te);
decorator(te);
for (ICounter* c : counters->counters)
c->resetInterval();
state Reference<EventCacheHolder> traceEventHolder;
if (!trackLatestName.empty()) {
te.trackLatest(traceEventHolder->trackingKey);
traceEventHolder = makeReference<EventCacheHolder>(trackLatestName);
}
last_interval = now();
wait(delay(interval, TaskPriority::FlushTrace));
state double last_interval = now();
loop {
TraceEvent te(traceEventName.c_str(), traceEventID);
te.detail("Elapsed", now() - last_interval);
counters->logToTraceEvent(te);
decorator(te);
if (!trackLatestName.empty()) {
te.trackLatest(traceEventHolder->trackingKey);
}
last_interval = now();
wait(delay(interval, TaskPriority::FlushTrace));
}
}
};
Future<Void> CounterCollection::traceCounters(std::string const& traceEventName,
UID traceEventID,
double interval,
std::string const& trackLatestName,
std::function<void(TraceEvent&)> const& decorator) {
return CounterCollectionImpl::traceCounters(
this, traceEventName, traceEventID, interval, trackLatestName, decorator);
}

View File

@ -67,17 +67,37 @@ struct Traceable<ICounter*> : std::true_type {
}
};
struct CounterCollection {
CounterCollection(std::string name, std::string id = std::string()) : name(name), id(id) {}
std::vector<struct ICounter*> counters, counters_to_remove;
~CounterCollection() {
for (auto c : counters_to_remove)
c->remove();
}
class CounterCollection {
friend class CounterCollectionImpl;
std::string name;
std::string id;
std::vector<struct ICounter*> counters, countersToRemove;
public:
CounterCollection(std::string const& name, std::string const& id = std::string()) : name(name), id(id) {}
~CounterCollection() {
for (auto c : countersToRemove)
c->remove();
}
void addCounter(ICounter* counter) { counters.push_back(counter); }
// Call remove method on this counter in ~CounterCollection
void markForRemoval(ICounter* counter) { countersToRemove.push_back(counter); }
std::string const& getName() const { return name; }
std::string const& getId() const { return id; }
void logToTraceEvent(TraceEvent& te) const;
Future<Void> traceCounters(
std::string const& traceEventName,
UID traceEventID,
double interval,
std::string const& trackLatestName = std::string(),
std::function<void(TraceEvent&)> const& decorator = [](auto& te) {});
};
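// A self-contained sketch (hypothetical names) of the friend-impl idiom adopted
// here: the public class exposes a thin member function forwarding to a static
// method of a friend implementation class, which may read private state.
#include <cstdio>
#include <string>
#include <utility>

class WidgetImpl;

class Widget {
	friend class WidgetImpl;
	std::string name;

public:
	explicit Widget(std::string n) : name(std::move(n)) {}
	void log() const;
};

class WidgetImpl {
public:
	static void log(const Widget* w) { std::printf("%s\n", w->name.c_str()); } // private access via friendship
};

void Widget::log() const { WidgetImpl::log(this); }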
struct Counter final : ICounter, NonCopyable {
@ -131,8 +151,8 @@ struct Traceable<Counter> : std::true_type {
template <class F>
struct SpecialCounter final : ICounter, FastAllocated<SpecialCounter<F>>, NonCopyable {
SpecialCounter(CounterCollection& collection, std::string const& name, F&& f) : name(name), f(f) {
collection.counters.push_back(this);
collection.counters_to_remove.push_back(this);
collection.addCounter(this);
collection.markForRemoval(this);
}
void remove() override { delete this; }
@ -162,14 +182,6 @@ static void specialCounter(CounterCollection& collection, std::string const& nam
new SpecialCounter<F>(collection, name, std::move(f));
}
Future<Void> traceCounters(
std::string const& traceEventName,
UID const& traceEventID,
double const& interval,
CounterCollection* const& counters,
std::string const& trackLatestName = std::string(),
std::function<void(TraceEvent&)> const& decorator = [](TraceEvent& te) {});
class LatencyBands {
public:
LatencyBands(std::string name, UID id, double loggingInterval)
@ -180,7 +192,7 @@ public:
if (bands.size() == 0) {
ASSERT(!cc && !filteredCount);
cc = std::make_unique<CounterCollection>(name, id.toString());
logger = traceCounters(name, id, loggingInterval, cc.get(), id.toString() + "/" + name);
logger = cc->traceCounters(name, id, loggingInterval, id.toString() + "/" + name);
filteredCount = std::make_unique<Counter>("Filtered", *cc);
insertBand(std::numeric_limits<double>::infinity());
}

View File

@ -42,8 +42,6 @@ struct TenantInfo {
// Is set during deserialization. It will be set to true if the tenant
// name is set and the client is authorized to use this tenant.
bool tenantAuthorized = false;
// Number of storage bytes currently used by this tenant.
int64_t storageUsage = 0;
// Helper function for most endpoints that read/write data. This returns true iff
// the client is either a) a trusted peer or b) is accessing keyspace belonging to a tenant,

View File

@ -290,8 +290,8 @@ struct BackupData {
specialCounter(cc, "MsgQ", [this]() { return this->messages.size(); });
specialCounter(cc, "BufferedBytes", [this]() { return this->lock->activePermits(); });
specialCounter(cc, "AvailableBytes", [this]() { return this->lock->available(); });
logger = traceCounters(
"BackupWorkerMetrics", myId, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "BackupWorkerMetrics");
logger =
cc.traceCounters("BackupWorkerMetrics", myId, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "BackupWorkerMetrics");
}
bool pullFinished() const { return endVersion.present() && pulledVersion.get() > endVersion.get(); }

View File

@ -296,7 +296,7 @@ struct BlobManagerStats {
specialCounter(cc, "HardBoundaries", [mergeHardBoundaries]() { return mergeHardBoundaries->size(); });
specialCounter(cc, "SoftBoundaries", [mergeBoundaries]() { return mergeBoundaries->size(); });
specialCounter(cc, "BlockedAssignments", [this]() { return this->blockedAssignments; });
logger = traceCounters("BlobManagerMetrics", id, interval, &cc, "BlobManagerMetrics");
logger = cc.traceCounters("BlobManagerMetrics", id, interval, "BlobManagerMetrics");
}
};
@ -3537,7 +3537,7 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
}
// skip the rest of the algorithm for the first blob manager
if (bmData->epoch == 1) {
if (bmData->epoch == 1 && !isFullRestoreMode()) {
bmData->doneRecovering.send(Void());
return Void();
}

View File

@ -26,6 +26,7 @@
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbserver/Knobs.h"
#include "flow/FastRef.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/BlobConnectionProvider.h"
@ -189,23 +190,6 @@ private:
static const int sMaxCount_{ 5 }; // max number of manifest file to keep
};
// Defines granule info that interests full restore
struct BlobGranuleVersion {
// Two constructors required by VectorRef
BlobGranuleVersion() {}
BlobGranuleVersion(Arena& a, const BlobGranuleVersion& copyFrom)
: granuleID(copyFrom.granuleID), keyRange(a, copyFrom.keyRange), version(copyFrom.version),
sizeInBytes(copyFrom.sizeInBytes) {}
UID granuleID;
KeyRangeRef keyRange;
Version version;
int64_t sizeInBytes;
};
// Defines a vector for BlobGranuleVersion
typedef Standalone<VectorRef<BlobGranuleVersion>> BlobGranuleVersionVector;
// Defines filename, version, size for each granule file that interests full restore
struct GranuleFileVersion {
Version version;
@ -226,16 +210,53 @@ public:
Value data = wait(readFromFile(self));
Standalone<BlobManifest> manifest = decode(data);
wait(writeSystemKeys(self, manifest.rows));
BlobGranuleVersionVector _ = wait(listGranules(self));
BlobGranuleRestoreVersionVector _ = wait(listGranules(self));
} catch (Error& e) {
dprint("WARNING: unexpected manifest loader error {}\n", e.what()); // skip error handling so far
}
return Void();
}
// Iterate active granules and return their version/sizes
ACTOR static Future<BlobGranuleRestoreVersionVector> listGranules(Reference<BlobManifestLoader> self) {
state Transaction tr(self->db_);
loop {
state BlobGranuleRestoreVersionVector results;
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
std::vector<KeyRangeRef> granules;
state int i = 0;
auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED;
state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit));
for (i = 0; i < blobRanges.size() - 1; i++) {
Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin);
Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin);
state KeyRange granuleRange = KeyRangeRef(startKey, endKey);
try {
Standalone<BlobGranuleRestoreVersion> granule = wait(getGranule(&tr, granuleRange));
results.push_back_deep(results.arena(), granule);
} catch (Error& e) {
if (e.code() == error_code_restore_missing_data) {
dprint("missing data for key range {} \n", granuleRange.toString());
TraceEvent("BlobRestoreMissingData").detail("KeyRange", granuleRange.toString());
} else {
throw;
}
}
}
return results;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Print out a summary for blob granules
ACTOR static Future<Void> print(Reference<BlobManifestLoader> self) {
state BlobGranuleVersionVector granules = wait(listGranules(self));
state BlobGranuleRestoreVersionVector granules = wait(listGranules(self));
for (auto granule : granules) {
wait(checkGranuleFiles(self, granule));
}
@ -285,41 +306,9 @@ private:
}
}
// Iterate active granules and return their version/sizes
ACTOR static Future<BlobGranuleVersionVector> listGranules(Reference<BlobManifestLoader> self) {
state Transaction tr(self->db_);
loop {
state BlobGranuleVersionVector results;
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
std::vector<KeyRangeRef> granules;
state int i = 0;
auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED;
state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit));
for (i = 0; i < blobRanges.size() - 1; i++) {
Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin);
Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin);
state KeyRange granuleRange = KeyRangeRef(startKey, endKey);
try {
Standalone<BlobGranuleVersion> granule = wait(getGranule(&tr, granuleRange));
results.push_back_deep(results.arena(), granule);
} catch (Error& e) {
dprint("missing data for key range {} \n", granuleRange.toString());
}
}
return results;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Find the newest granule for a key range. The newest granule has the max version and relevant files
ACTOR static Future<Standalone<BlobGranuleVersion>> getGranule(Transaction* tr, KeyRangeRef range) {
state Standalone<BlobGranuleVersion> granuleVersion;
ACTOR static Future<Standalone<BlobGranuleRestoreVersion>> getGranule(Transaction* tr, KeyRangeRef range) {
state Standalone<BlobGranuleRestoreVersion> granuleVersion;
KeyRange historyKeyRange = blobGranuleHistoryKeyRangeFor(range);
// reverse lookup so that the first row is the newest version
state RangeResult results =
@ -389,7 +378,7 @@ private:
}
// Read data from granules and print out summary
ACTOR static Future<Void> checkGranuleFiles(Reference<BlobManifestLoader> self, BlobGranuleVersion granule) {
ACTOR static Future<Void> checkGranuleFiles(Reference<BlobManifestLoader> self, BlobGranuleRestoreVersion granule) {
state KeyRangeRef range = granule.keyRange;
state Version readVersion = granule.version;
state Transaction tr(self->db_);
@ -441,3 +430,11 @@ ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProv
wait(BlobManifestLoader::print(loader));
return Void();
}
// API to list blob granules
ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db,
Reference<BlobConnectionProvider> blobConn) {
Reference<BlobManifestLoader> loader = makeReference<BlobManifestLoader>(db, blobConn);
BlobGranuleRestoreVersionVector result = wait(BlobManifestLoader::listGranules(loader));
return result;
}

View File

@ -30,54 +30,312 @@
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/ServerDBInfo.actor.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "flow/actorcompiler.h" // has to be last include
#include "flow/network.h"
#include <algorithm>
#include <string>
#define ENABLE_DEBUG_MG true
template <typename... T>
static inline void dprint(fmt::format_string<T...> fmt, T&&... args) {
if (ENABLE_DEBUG_MG)
fmt::print(fmt, std::forward<T>(args)...);
}
// BlobMigrator manages data migration from blob storage to storage servers. It implements a minimal set of
// StorageServerInterface APIs, which are needed for DataDistributor to start data migration.
class BlobMigrator : public NonCopyable, public ReferenceCounted<BlobMigrator> {
public:
BlobMigrator(Reference<AsyncVar<ServerDBInfo> const> dbInfo, BlobMigratorInterface interf)
: blobMigratorInterf(interf), actors(false) {
if (!blobConn.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
blobConn = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
: interf_(interf), actors_(false) {
if (!blobConn_.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
blobConn_ = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
}
db = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
db_ = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
}
~BlobMigrator() {}
// Start migration
ACTOR static Future<Void> start(Reference<BlobMigrator> self) {
self->actors.add(waitFailureServer(self->blobMigratorInterf.waitFailure.getFuture()));
if (!isFullRestoreMode()) {
return Void();
}
wait(delay(10)); // TODO need to wait for a signal for readiness of blob manager
BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_));
self->blobGranules_ = granules;
wait(prepare(self, normalKeys));
wait(serverLoop(self));
return Void();
}
private:
// Prepare for data migration for given key range.
ACTOR static Future<Void> prepare(Reference<BlobMigrator> self, KeyRangeRef keys) {
// Register as a storage server, so that DataDistributor can start data movement afterwards
std::pair<Version, Tag> verAndTag = wait(addStorageServer(self->db_, self->interf_.ssi));
dprint("Started storage server interface {} {}\n", verAndTag.first, verAndTag.second.toString());
// Reassign key ranges to the storage server.
// This restarts DataDistributor so that internal data structures like ShardTracker and
// ShardsAffectedByTeamFailure are re-initialized. Ideally this would happen inside
// DataDistributor itself, which would make the restart unnecessary.
state int oldMode = wait(setDDMode(self->db_, 0));
wait(unassignServerKeys(self, keys));
wait(assignKeysToServer(self, keys, self->interf_.ssi.id()));
wait(success(setDDMode(self->db_, oldMode)));
return Void();
}
// Assign the given key range to the specified storage server. Subsequent data movement is then driven by
// DataDistributor.
ACTOR static Future<Void> assignKeysToServer(Reference<BlobMigrator> self, KeyRangeRef keys, UID serverUID) {
state Transaction tr(self->db_);
loop {
choose {
when(HaltBlobMigratorRequest req = waitNext(self->blobMigratorInterf.haltBlobMigrator.getFuture())) {
req.reply.send(Void());
TraceEvent("BlobMigratorHalted", self->blobMigratorInterf.id()).detail("ReqID", req.requesterID);
break;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
state Value value = keyServersValue(std::vector<UID>({ serverUID }), std::vector<UID>(), UID(), UID());
wait(krmSetRange(&tr, keyServersPrefix, keys, value));
wait(krmSetRange(&tr, serverKeysPrefixFor(serverUID), keys, serverKeysTrue));
wait(tr.commit());
dprint("Assign {} to server {}\n", keys.toString(), serverUID.toString());
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Unassign given key range from its current storage servers
ACTOR static Future<Void> unassignServerKeys(Reference<BlobMigrator> self, KeyRangeRef keys) {
state Transaction tr(self->db_);
loop {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
state RangeResult serverList = wait(tr.getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY);
for (auto& server : serverList) {
state UID id = decodeServerListValue(server.value).id();
RangeResult ranges = wait(krmGetRanges(&tr, serverKeysPrefixFor(id), keys));
bool owning = false;
for (auto& r : ranges) {
if (r.value == serverKeysTrue) {
owning = true;
break;
}
}
if (owning) {
dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString());
wait(krmSetRange(&tr, serverKeysPrefixFor(id), keys, serverKeysFalse));
}
}
when(wait(self->actors.getResult())) {}
wait(tr.commit());
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Main server loop
ACTOR static Future<Void> serverLoop(Reference<BlobMigrator> self) {
self->actors_.add(waitFailureServer(self->interf_.ssi.waitFailure.getFuture()));
self->actors_.add(handleRequest(self));
self->actors_.add(handleUnsupportedRequest(self));
loop {
try {
choose {
when(HaltBlobMigratorRequest req = waitNext(self->interf_.haltBlobMigrator.getFuture())) {
req.reply.send(Void());
TraceEvent("BlobMigratorHalted", self->interf_.id()).detail("ReqID", req.requesterID);
break;
}
when(wait(self->actors_.getResult())) {}
}
} catch (Error& e) {
dprint("Unexpected serverLoop error {}\n", e.what());
throw;
}
}
return Void();
}
// Handle StorageServerInterface APIs
ACTOR static Future<Void> handleRequest(Reference<BlobMigrator> self) {
state StorageServerInterface ssi = self->interf_.ssi;
loop {
try {
choose {
when(GetShardStateRequest req = waitNext(ssi.getShardState.getFuture())) {
dprint("Handle GetShardStateRequest\n");
Version version = maxVersion(self);
GetShardStateReply rep(version, version);
req.reply.send(rep); // return empty shards
}
when(WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) {
// dprint("Handle WaitMetricsRequest\n");
self->actors_.add(processWaitMetricsRequest(self, req));
}
when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) {
dprint("Handle SplitMetrics {}\n", req.keys.toString());
SplitMetricsReply rep;
for (auto granule : self->blobGranules_) {
// TODO: Use granule boundaries as split points. A better approach would be to split by size.
if (granule.keyRange.begin > req.keys.begin && granule.keyRange.end < req.keys.end)
rep.splits.push_back_deep(rep.splits.arena(), granule.keyRange.begin);
}
req.reply.send(rep);
}
when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
dprint("Handle GetStorageMetrics\n");
StorageMetrics metrics;
metrics.bytes = sizeInBytes(self);
GetStorageMetricsReply resp;
resp.load = metrics;
req.reply.send(resp);
}
when(ReplyPromise<KeyValueStoreType> reply = waitNext(ssi.getKeyValueStoreType.getFuture())) {
dprint("Handle KeyValueStoreType\n");
reply.send(KeyValueStoreType::MEMORY);
}
}
} catch (Error& e) {
dprint("Unexpected blob migrator request error {}\n", e.what());
throw;
}
}
}
// Handle StorageServerInterface APIs that are not supported. Simply log and return error
ACTOR static Future<Void> handleUnsupportedRequest(Reference<BlobMigrator> self) {
state StorageServerInterface ssi = self->interf_.ssi;
loop {
try {
choose {
when(SplitRangeRequest req = waitNext(ssi.getRangeSplitPoints.getFuture())) {
dprint("Unsupported SplitRangeRequest\n");
req.reply.sendError(unsupported_operation());
}
when(StorageQueuingMetricsRequest req = waitNext(ssi.getQueuingMetrics.getFuture())) {
self->actors_.add(processStorageQueuingMetricsRequest(req));
}
when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) {
dprint("Unsupported ReadHotSubRange\n");
req.reply.sendError(unsupported_operation());
}
when(GetKeyValuesStreamRequest req = waitNext(ssi.getKeyValuesStream.getFuture())) {
dprint("Unsupported GetKeyValuesStreamRequest\n");
req.reply.sendError(unsupported_operation());
}
when(GetKeyRequest req = waitNext(ssi.getKey.getFuture())) {
dprint("Unsupported GetKeyRequest\n");
req.reply.sendError(unsupported_operation());
}
when(GetKeyValuesRequest req = waitNext(ssi.getKeyValues.getFuture())) {
/* dprint("Unsupported GetKeyValuesRequest {} - {} @ {}\n",
req.begin.getKey().printable(),
req.end.getKey().printable(),
req.version); */
req.reply.sendError(unsupported_operation());
}
when(GetValueRequest req = waitNext(ssi.getValue.getFuture())) {
dprint("Unsupported GetValueRequest\n");
req.reply.sendError(unsupported_operation());
}
when(GetCheckpointRequest req = waitNext(ssi.checkpoint.getFuture())) {
dprint("Unsupported GetCheckpoint \n");
req.reply.sendError(unsupported_operation());
}
when(FetchCheckpointRequest req = waitNext(ssi.fetchCheckpoint.getFuture())) {
dprint("Unsupported FetchCheckpointRequest\n");
req.reply.sendError(unsupported_operation());
}
when(UpdateCommitCostRequest req = waitNext(ssi.updateCommitCostRequest.getFuture())) {
dprint("Unsupported UpdateCommitCostRequest\n");
req.reply.sendError(unsupported_operation());
}
when(FetchCheckpointKeyValuesRequest req = waitNext(ssi.fetchCheckpointKeyValues.getFuture())) {
dprint("Unsupported FetchCheckpointKeyValuesRequest\n");
req.reply.sendError(unsupported_operation());
}
}
} catch (Error& e) {
dprint("Unexpected request handling error {}\n", e.what());
throw;
}
}
}
ACTOR static Future<Void> processWaitMetricsRequest(Reference<BlobMigrator> self, WaitMetricsRequest req) {
state WaitMetricsRequest waitMetricsRequest = req;
// FIXME: get rid of this delay. It's a temporary workaround to avoid starvation in the scheduling of DD
// processes
wait(delay(1));
StorageMetrics metrics;
metrics.bytes = sizeInBytes(self, waitMetricsRequest.keys);
waitMetricsRequest.reply.send(metrics);
return Void();
}
ACTOR static Future<Void> processStorageQueuingMetricsRequest(StorageQueuingMetricsRequest req) {
dprint("Unsupported StorageQueuingMetricsRequest\n");
// FIXME: get rid of this delay. It's a temporary workaround to avoid starvation in the scheduling of DD
// processes
wait(delay(1));
req.reply.sendError(unsupported_operation());
return Void();
}
// Return total storage size in bytes for migration
static int64_t sizeInBytes(Reference<BlobMigrator> self) { return sizeInBytes(self, normalKeys); }
// Return storage size in bytes for given key range
static int64_t sizeInBytes(Reference<BlobMigrator> self, KeyRangeRef range) {
int64_t bytes = 0;
for (auto granule : self->blobGranules_) {
if (range.intersects(granule.keyRange))
bytes += granule.sizeInBytes;
}
return bytes;
}
// Return max version for all blob granules
static Version maxVersion(Reference<BlobMigrator> self) {
Version max = 0;
for (auto granule : self->blobGranules_) {
max = std::max(granule.version, max);
}
return max;
}
private:
Database db;
Reference<BlobConnectionProvider> blobConn;
BlobMigratorInterface blobMigratorInterf;
ActorCollection actors;
Database db_;
Reference<BlobConnectionProvider> blobConn_;
BlobGranuleRestoreVersionVector blobGranules_;
BlobMigratorInterface interf_;
ActorCollection actors_;
};
// Main entry point
ACTOR Future<Void> blobMigrator(BlobMigratorInterface ssi, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
fmt::print("Start blob migrator {} \n", ssi.id().toString());
ACTOR Future<Void> blobMigrator(BlobMigratorInterface interf, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
fmt::print("Start blob migrator {} \n", interf.id().toString());
try {
Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, ssi);
Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, interf);
wait(BlobMigrator::start(self));
} catch (Error& e) {
fmt::print("unexpected blob migrator error {}\n", e.what());
dprint("Unexpected blob migrator error {}\n", e.what());
TraceEvent("BlobMigratorError", interf.id()).error(e);
}
return Void();
}

View File

@ -84,6 +84,15 @@ struct GranuleStartState {
Optional<GranuleHistory> history;
};
// TODO: add more (blob file request cost, in-memory mutations vs blob delta file, etc...)
struct GranuleReadStats {
int64_t deltaBytesRead;
void reset() { deltaBytesRead = 0; }
GranuleReadStats() { reset(); }
};
struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {
KeyRange keyRange;
@ -120,11 +129,74 @@ struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {
AssignBlobRangeRequest originalReq;
GranuleReadStats readStats;
bool rdcCandidate;
Promise<Void> runRDC;
void resume() {
if (resumeSnapshot.canBeSet()) {
resumeSnapshot.send(Void());
}
}
void resetReadStats() {
rdcCandidate = false;
readStats.reset();
runRDC.reset();
}
// determine eligibility (weight > 1) and priority for re-snapshotting this granule
double weightRDC() {
// ratio of read amp to write amp that would be incurred by re-snapshotting now
int64_t lastSnapshotSize = (files.snapshotFiles.empty()) ? 0 : files.snapshotFiles.back().length;
int64_t minSnapshotSize = SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES / 2;
lastSnapshotSize = std::max(minSnapshotSize, lastSnapshotSize);
int64_t writeAmp = lastSnapshotSize + bufferedDeltaBytes + bytesInNewDeltaFiles;
// read amp is deltaBytesRead. Read amp must be READ_FACTOR times larger than write amp
return (1.0 * readStats.deltaBytesRead) / (writeAmp * SERVER_KNOBS->BG_RDC_READ_FACTOR);
}
bool isEligibleRDC() {
// granule should be reasonably read-hot to be eligible
int64_t bytesWritten = bufferedDeltaBytes + bytesInNewDeltaFiles;
return bytesWritten * SERVER_KNOBS->BG_RDC_READ_FACTOR < readStats.deltaBytesRead;
}
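// A worked example with hypothetical numbers: with a 5 MB last snapshot (assumed to
// exceed BG_SNAPSHOT_FILE_TARGET_BYTES / 2), 1 MB of buffered deltas, 2 MB in new
// delta files, and BG_RDC_READ_FACTOR = 3, the write amp is 8 MB; weightRDC()
// exceeds 1.0 once more than 24 MB of deltas have been read, while isEligibleRDC()
// already holds after 9 MB (3 MB written * factor 3), so 24 MB is the binding bound.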
bool updateReadStats(Version readVersion, const BlobGranuleChunkRef& chunk) {
// Only update stats for re-compacting for at-latest reads that have to do snapshot + delta merge
if (!SERVER_KNOBS->BG_ENABLE_READ_DRIVEN_COMPACTION || !chunk.snapshotFile.present() ||
pendingSnapshotVersion != durableSnapshotVersion.get() || readVersion <= pendingSnapshotVersion) {
return false;
}
if (chunk.newDeltas.empty() && chunk.deltaFiles.empty()) {
return false;
}
readStats.deltaBytesRead += chunk.newDeltas.expectedSize();
for (auto& it : chunk.deltaFiles) {
readStats.deltaBytesRead += it.length;
}
if (rdcCandidate) {
return false;
}
if (isEligibleRDC() && weightRDC() > 1.0) {
rdcCandidate = true;
CODE_PROBE(true, "Granule read triggering read-driven compaction");
if (BW_DEBUG) {
fmt::print("Triggering read-driven compaction of [{0} - {1})\n",
keyRange.begin.printable(),
keyRange.end.printable());
}
return true;
}
return false;
}
inline bool doReadDrivenCompaction() { return runRDC.isSet(); }
};
struct GranuleRangeMetadata {
@ -200,6 +272,7 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
NotifiedVersion grvVersion;
Promise<Void> fatalError;
Promise<Void> simInjectFailure;
Promise<Void> doReadDrivenCompaction;
Reference<FlowLock> initialSnapshotLock;
Reference<FlowLock> resnapshotLock;
@ -293,6 +366,13 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
return stats.estimatedMaxResidentMemory >= memoryFullThreshold;
}
void triggerReadDrivenCompaction() {
Promise<Void> doRDC = doReadDrivenCompaction;
if (doRDC.canBeSet()) {
doRDC.send(Void());
}
}
bool maybeInjectTargetedRestart() {
// inject a BW restart at most once per test
if (g_network->isSimulated() && !g_simulator->speedUpSimulation &&
@ -1107,7 +1187,6 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
}
retries++;
CODE_PROBE(true, "Granule initial snapshot failed");
// FIXME: why can't we supress error event?
TraceEvent(retries < 10 ? SevDebug : SevWarn, "BlobGranuleInitialSnapshotRetry", bwData->id)
.error(err)
.detail("Granule", metadata->keyRange)
@ -2043,6 +2122,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
metadata->pendingDeltaVersion = startVersion;
metadata->bufferedDeltaVersion = startVersion;
metadata->knownCommittedVersion = startVersion;
metadata->resetReadStats();
Reference<ChangeFeedData> cfData = makeReference<ChangeFeedData>(bwData->db.getPtr());
@ -2185,6 +2265,10 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
nextForceFlush = metadata->forceFlushVersion.whenAtLeast(lastForceFlushVersion + 1);
}
when(wait(metadata->runRDC.getFuture())) {
// return control flow back to the triggering actor before continuing
wait(delay(0));
}
}
} catch (Error& e) {
// only error we should expect here is when we finish consuming old change feed
@ -2311,6 +2395,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
startState.granuleID,
inFlightFiles.empty() ? Future<Void>(Void())
: success(inFlightFiles.back().future));
metadata->resetReadStats();
}
// reset force flush state, requests should retry and add it back once feed is ready
forceFlushVersions.clear();
@ -2419,20 +2504,20 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// The force flush contract is a version cannot be put in forceFlushVersion unless the change feed
// is already whenAtLeast that version
bool forceFlush = !forceFlushVersions.empty() && forceFlushVersions.back() > metadata->pendingDeltaVersion;
bool doReadDrivenFlush = !metadata->currentDeltas.empty() && metadata->doReadDrivenCompaction();
CODE_PROBE(forceFlush, "Force flushing granule");
if (metadata->bufferedDeltaBytes >= SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES || forceFlush) {
if (metadata->bufferedDeltaBytes >= SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES || forceFlush ||
doReadDrivenFlush) {
TraceEvent(SevDebug, "BlobGranuleDeltaFile", bwData->id)
.detail("Granule", metadata->keyRange)
.detail("Version", lastDeltaVersion);
// sanity check for version order
if (forceFlush) {
if (forceFlush || doReadDrivenFlush) {
if (lastDeltaVersion == invalidVersion) {
lastDeltaVersion = metadata->currentDeltas.empty() ? metadata->pendingDeltaVersion
: metadata->currentDeltas.back().version;
lastDeltaVersion = metadata->bufferedDeltaVersion;
}
if (lastDeltaVersion < forceFlushVersions.back()) {
if (!forceFlushVersions.empty() && lastDeltaVersion < forceFlushVersions.back()) {
if (BW_DEBUG) {
fmt::print("Granule [{0} - {1}) force flushing delta version {2} -> {3}\n",
metadata->keyRange.begin.printable(),
@ -2444,13 +2529,6 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
}
if (!metadata->currentDeltas.empty()) {
if (lastDeltaVersion < metadata->currentDeltas.back().version) {
fmt::print("Granule [{0} - {1}) LDV {2} < DeltaBack {3}\n",
metadata->keyRange.begin.printable(),
metadata->keyRange.end.printable(),
lastDeltaVersion,
metadata->currentDeltas.back().version);
}
ASSERT(lastDeltaVersion >= metadata->currentDeltas.back().version);
ASSERT(metadata->pendingDeltaVersion < metadata->currentDeltas.front().version);
} else {
@ -2507,6 +2585,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// add new pending delta file
ASSERT(metadata->pendingDeltaVersion < lastDeltaVersion);
metadata->pendingDeltaVersion = lastDeltaVersion;
ASSERT(metadata->bufferedDeltaVersion <= lastDeltaVersion);
metadata->bufferedDeltaVersion = lastDeltaVersion; // In case flush was forced at non-mutation version
metadata->bytesInNewDeltaFiles += metadata->bufferedDeltaBytes;
@ -2528,6 +2607,9 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// Wait on delta file starting here. If we have too many pending delta file writes, we need to not
// continue to consume from the change feed, as that will pile on even more delta files to write
wait(startDeltaFileWrite);
} else if (metadata->doReadDrivenCompaction()) {
ASSERT(metadata->currentDeltas.empty());
snapshotEligible = true;
}
// FIXME: if we're still reading from old change feed, we should probably compact if we're
@ -2535,7 +2617,8 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// yet
// If we have enough delta files, try to re-snapshot
if (snapshotEligible && metadata->bytesInNewDeltaFiles >= SERVER_KNOBS->BG_DELTA_BYTES_BEFORE_COMPACT) {
if (snapshotEligible && (metadata->doReadDrivenCompaction() ||
metadata->bytesInNewDeltaFiles >= SERVER_KNOBS->BG_DELTA_BYTES_BEFORE_COMPACT)) {
if (BW_DEBUG && !inFlightFiles.empty()) {
fmt::print("Granule [{0} - {1}) ready to re-snapshot at {2} after {3} > {4} bytes, "
"waiting for "
@ -2583,6 +2666,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// reset metadata
metadata->bytesInNewDeltaFiles = 0;
metadata->resetReadStats();
// If we have more than one snapshot file and that file is unblocked (committedVersion >=
// snapshotVersion), wait for it to finish
@ -3740,6 +3824,11 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
}
}
}
// don't update read stats on a summarize read
if (metadata->updateReadStats(req.readVersion, chunk)) {
bwData->triggerReadDrivenCompaction();
}
}
rep.chunks.push_back(rep.arena, chunk);
@ -3961,7 +4050,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
}
}
if (createChangeFeed) {
if (createChangeFeed && !isFullRestoreMode()) {
// create new change feed for new version of granule
wait(updateChangeFeed(
&tr, granuleIDToCFKey(info.granuleID), ChangeFeedStatus::CHANGE_FEED_CREATE, req.keyRange));
@ -4554,6 +4643,74 @@ ACTOR Future<Void> runGRVChecks(Reference<BlobWorkerData> bwData) {
}
}
struct RDCEntry {
double weight;
Reference<GranuleMetadata> granule;
RDCEntry(double weight, Reference<GranuleMetadata> granule) : weight(weight), granule(granule) {}
};
// for a top-k algorithm, we actually want a min-heap, so reverse the sort order
struct OrderForTopK {
	bool operator()(RDCEntry const& a, RDCEntry const& b) const { return a.weight > b.weight; } // greater-than yields a min-heap
};
typedef std::priority_queue<RDCEntry, std::vector<RDCEntry>, OrderForTopK> TopKPQ;
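// A self-contained sketch of this top-k selection (standard C++, hypothetical values):
// a size-k min-heap keeps the k largest weights, evicting the smallest on overflow.
#include <cstdio>
#include <functional>
#include <queue>
#include <vector>

int main() {
	const size_t k = 2; // stand-in for BLOB_WORKER_RDC_PARALLELISM
	// std::greater turns std::priority_queue (a max-heap by default) into a min-heap
	std::priority_queue<double, std::vector<double>, std::greater<double>> topK;
	for (double w : { 1.5, 0.7, 3.2, 2.1, 1.1 }) {
		if (w <= 1.0)
			continue; // only weights > 1.0 are candidates, as above
		if (topK.size() < k) {
			topK.push(w);
		} else if (w > topK.top()) {
			topK.pop(); // evict the smallest of the current top k
			topK.push(w);
		}
	}
	while (!topK.empty()) { // prints 2.1 then 3.2
		std::printf("selected weight %.1f\n", topK.top());
		topK.pop();
	}
	return 0;
}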
ACTOR Future<Void> runReadDrivenCompaction(Reference<BlobWorkerData> bwData) {
state bool processedAll = true;
loop {
if (processedAll) {
wait(bwData->doReadDrivenCompaction.getFuture());
bwData->doReadDrivenCompaction.reset();
wait(delay(0));
}
TopKPQ topK;
// FIXME: possible to scan candidates instead of all granules?
int candidates = 0;
auto allRanges = bwData->granuleMetadata.intersectingRanges(normalKeys);
for (auto& it : allRanges) {
if (it.value().activeMetadata.isValid() && it.value().activeMetadata->cancelled.canBeSet()) {
auto metadata = it.value().activeMetadata;
if (metadata->rdcCandidate && metadata->isEligibleRDC() && metadata->runRDC.canBeSet() &&
metadata->pendingSnapshotVersion == metadata->durableSnapshotVersion.get()) {
candidates++;
double weight = metadata->weightRDC();
if (weight > 1.0 &&
(topK.size() < SERVER_KNOBS->BLOB_WORKER_RDC_PARALLELISM || weight > topK.top().weight)) {
if (topK.size() == SERVER_KNOBS->BLOB_WORKER_RDC_PARALLELISM) {
topK.pop();
}
topK.push(RDCEntry(weight, metadata));
}
}
}
}
CODE_PROBE(candidates > topK.size(), "Too many read-driven compaction candidates for one cycle");
std::vector<Future<Void>> futures;
futures.reserve(topK.size());
while (!topK.empty()) {
++bwData->stats.readDrivenCompactions;
Promise<Void> runRDC = topK.top().granule->runRDC;
ASSERT(runRDC.canBeSet());
Future<Void> waitForSnapshotComplete = topK.top().granule->durableSnapshotVersion.whenAtLeast(
topK.top().granule->durableSnapshotVersion.get() + 1) ||
topK.top().granule->cancelled.getFuture();
futures.push_back(waitForSnapshotComplete);
topK.pop();
runRDC.send(Void());
}
processedAll = futures.empty();
if (!futures.empty()) {
// wait at least one second to throttle this actor a bit
wait(waitForAll(futures) && delay(1.0));
}
}
}
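// Two flow idioms in the loop above are worth noting: each selected granule is woken
// through its one-shot runRDC Promise, and completion is observed via
// durableSnapshotVersion.whenAtLeast(current + 1) ("the durable snapshot version
// advanced at least once"), ORed with the granule's cancellation future so a dying
// granule cannot wedge this actor. waitForAll(...) && delay(1.0) then throttles the
// loop to at most one batch per second.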
// FIXME: better way to do this?
// monitor system keyspace for new tenants
ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData) {
@ -4891,6 +5048,7 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
self->addActor.send(waitFailureServer(bwInterf.waitFailure.getFuture()));
self->addActor.send(runGRVChecks(self));
self->addActor.send(monitorTenants(self));
self->addActor.send(runReadDrivenCompaction(self));
state Future<Void> selfRemoved = monitorRemoval(self);
if (g_network->isSimulated() && BUGGIFY_WITH_PROB(0.25)) {
self->addActor.send(simForceFileWriteContention(self));
@ -5024,13 +5182,22 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
ASSERT(false);
throw internal_error();
}
when(wait(selfRemoved || self->simInjectFailure.getFuture())) {
when(wait(selfRemoved)) {
if (BW_DEBUG) {
printf("Blob worker detected removal. Exiting...\n");
}
TraceEvent("BlobWorkerRemoved", self->id);
break;
}
when(wait(self->simInjectFailure.getFuture())) {
// wait to let the triggering actor finish, to prevent weird shutdown races
wait(delay(0));
if (BW_DEBUG) {
printf("Blob worker simulation injected failure. Exiting...\n");
}
TraceEvent("BlobWorkerSimRemoved", self->id);
break;
}
when(wait(self->fatalError.getFuture())) {
TraceEvent(SevError, "BlobWorkerActorCollectionFatalErrorNotError", self->id);
ASSERT(false);

View File

@ -2615,8 +2615,9 @@ ACTOR Future<Void> monitorBlobMigrator(ClusterControllerData* self) {
}
loop {
if (self->db.serverInfo->get().blobMigrator.present() && !self->recruitBlobMigrator.get()) {
state Future<Void> wfClient = waitFailureClient(self->db.serverInfo->get().blobMigrator.get().waitFailure,
SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
state Future<Void> wfClient =
waitFailureClient(self->db.serverInfo->get().blobMigrator.get().ssi.waitFailure,
SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
loop {
choose {
when(wait(wfClient)) {
@ -3006,11 +3007,10 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
self.addActor.send(monitorConsistencyScan(&self));
self.addActor.send(metaclusterMetricsUpdater(&self));
self.addActor.send(dbInfoUpdater(&self));
self.addActor.send(traceCounters("ClusterControllerMetrics",
self.id,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&self.clusterControllerMetrics,
self.id.toString() + "/ClusterControllerMetrics"));
self.addActor.send(self.clusterControllerMetrics.traceCounters("ClusterControllerMetrics",
self.id,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
self.id.toString() + "/ClusterControllerMetrics"));
self.addActor.send(traceRole(Role::CLUSTER_CONTROLLER, interf.id()));
// printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str());

View File

@ -183,8 +183,8 @@ class ConfigBroadcasterImpl {
id(deterministicRandom()->randomUniqueID()), cc("ConfigBroadcaster"), compactRequest("CompactRequest", cc),
successfulChangeRequest("SuccessfulChangeRequest", cc), failedChangeRequest("FailedChangeRequest", cc),
snapshotRequest("SnapshotRequest", cc) {
logger = traceCounters(
"ConfigBroadcasterMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigBroadcasterMetrics");
logger = cc.traceCounters(
"ConfigBroadcasterMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigBroadcasterMetrics");
}
void addChanges(Standalone<VectorRef<VersionedConfigMutationRef>> const& changes,

View File

@ -812,7 +812,7 @@ public:
successfulCommits("SuccessfulCommits", cc), failedCommits("FailedCommits", cc),
setMutations("SetMutations", cc), clearMutations("ClearMutations", cc),
getValueRequests("GetValueRequests", cc), getGenerationRequests("GetGenerationRequests", cc) {
logger = traceCounters("ConfigNodeMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigNode");
logger = cc.traceCounters("ConfigNodeMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigNode");
TraceEvent(SevInfo, "StartingConfigNode", id).detail("KVStoreAlreadyExists", kvStore.exists());
}

View File

@ -29,7 +29,7 @@
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "fdbserver/DataDistribution.actor.h"
#include "fdbserver/RatekeeperInterface.h"
#include "fdbserver/ServerDBInfo.h"
@ -393,6 +393,7 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
state double rateLimiterStartTime = now();
state int64_t bytesReadInthisRound = 0;
state bool resume = !(restart || shuffleShards);
state bool testResult = true;
state double dbSize = 100e12;
if (g_network->isSimulated()) {
@ -710,7 +711,7 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
(!storageServerInterfaces[j].isTss() &&
!storageServerInterfaces[firstValidServer].isTss())) {
testFailure("Data inconsistent", performQuiescentChecks, true);
return false;
testResult = false;
}
}
}
@ -949,7 +950,7 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
}
*bytesReadInPrevRound = bytesReadInthisRound;
return true;
return testResult;
}
ACTOR Future<Void> runDataValidationCheck(ConsistencyScanData* self) {

View File

@ -689,6 +689,17 @@ struct DDQueue : public IDDRelocationQueue {
int moveReusePhysicalShard;
int moveCreateNewPhysicalShard;
enum RetryFindDstReason {
None = 0,
RemoteBestTeamNotReady,
PrimaryNoHealthyTeam,
RemoteNoHealthyTeam,
RemoteTeamIsFull,
RemoteTeamIsNotHealthy,
NoAvailablePhysicalShard,
NumberOfTypes,
};
std::vector<int> retryFindDstReasonCount;
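// A minimal sketch of this enum-indexed counter pattern (hypothetical Reason enum):
// the trailing NumberOfTypes member sizes a flat vector, so each reason can be
// tallied and reset by plain indexing, e.g.
//
//   enum Reason { None = 0, Timeout, NoTeam, NumberOfTypes };
//   std::vector<int> counts(static_cast<int>(NumberOfTypes), 0);
//   counts[Timeout]++;                          // record one occurrence
//   std::fill(counts.begin(), counts.end(), 0); // reset after reporting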
void startRelocation(int priority, int healthPriority) {
// Although PRIORITY_TEAM_REDUNDANT has lower priority than split and merge shard movement,
@ -754,7 +765,8 @@ struct DDQueue : public IDDRelocationQueue {
suppressIntervals(0), rawProcessingUnhealthy(new AsyncVar<bool>(false)),
rawProcessingWiggle(new AsyncVar<bool>(false)), unhealthyRelocations(0),
movedKeyServersEventHolder(makeReference<EventCacheHolder>("MovedKeyServers")), moveReusePhysicalShard(0),
moveCreateNewPhysicalShard(0) {}
moveCreateNewPhysicalShard(0), retryFindDstReasonCount(static_cast<int>(RetryFindDstReason::NumberOfTypes), 0) {
}
DDQueue() = default;
void validate() {
@ -1467,6 +1479,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
loop {
destOverloadedCount = 0;
stuckCount = 0;
state DDQueue::RetryFindDstReason retryFindDstReason = DDQueue::RetryFindDstReason::None;
// state int bestTeamStuckThreshold = 50;
loop {
state int tciIndex = 0;
@ -1493,10 +1506,13 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
.detail("TeamCollectionIndex", tciIndex)
.detail("RestoreDataMoveForDest",
describe(tciIndex == 0 ? rd.dataMove->primaryDest : rd.dataMove->remoteDest));
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady;
foundTeams = false;
break;
}
if (!bestTeam.first.present() || !bestTeam.first.get()->isHealthy()) {
retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam
: DDQueue::RetryFindDstReason::RemoteNoHealthyTeam;
foundTeams = false;
break;
}
@ -1549,12 +1565,15 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
// getting the destination team or we could miss failure notifications for the storage
// servers in the destination team
TraceEvent("BestTeamNotReady");
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady;
foundTeams = false;
break;
}
// If a DC has no healthy team, we stop checking the other DCs until
// the unhealthy DC is healthy again or is excluded.
if (!bestTeam.first.present()) {
retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam
: DDQueue::RetryFindDstReason::RemoteNoHealthyTeam;
foundTeams = false;
break;
}
@ -1578,6 +1597,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
if (tciIndex == 1 && !forceToUseNewPhysicalShard) {
double minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true);
if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsFull;
foundTeams = false;
break;
}
@ -1620,6 +1640,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
bestTeams.size() > 1 && !forceToUseNewPhysicalShard) {
if (!bestTeams[1].first->isHealthy()) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
foundTeams = false;
}
}
@ -1684,6 +1705,14 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
self->moveReusePhysicalShard++;
} else {
self->moveCreateNewPhysicalShard++;
if (retryFindDstReason == DDQueue::RetryFindDstReason::None) {
// If we are creating a new physical shard but no retry reason was recorded, this can
// only happen when determinePhysicalShardIDGivenPrimaryTeam() finds that there is no
// available physical shard.
self->retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]++;
} else {
self->retryFindDstReasonCount[retryFindDstReason]++;
}
}
rd.dataMoveId = newShardId(physicalShardIDCandidate, AssignEmptyRange::False);
auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
@ -2485,9 +2514,25 @@ ACTOR Future<Void> dataDistributionQueue(Reference<IDDTxnProcessor> db,
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
TraceEvent("PhysicalShardMoveStats")
.detail("MoveCreateNewPhysicalShard", self.moveCreateNewPhysicalShard)
.detail("MoveReusePhysicalShard", self.moveReusePhysicalShard);
.detail("MoveReusePhysicalShard", self.moveReusePhysicalShard)
.detail("RemoteBestTeamNotReady",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteBestTeamNotReady])
.detail("PrimaryNoHealthyTeam",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam])
.detail("RemoteNoHealthyTeam",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteNoHealthyTeam])
.detail("RemoteTeamIsFull",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsFull])
.detail("RemoteTeamIsNotHealthy",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy])
.detail(
"NoAvailablePhysicalShard",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]);
self.moveCreateNewPhysicalShard = 0;
self.moveReusePhysicalShard = 0;
for (int i = 0; i < self.retryFindDstReasonCount.size(); ++i) {
self.retryFindDstReasonCount[i] = 0;
}
}
}
when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator

View File

@ -212,7 +212,7 @@ ShardSizeBounds calculateShardSizeBounds(const KeyRange& keys,
const Reference<AsyncVar<Optional<ShardMetrics>>>& shardMetrics,
const BandwidthStatus& bandwidthStatus,
PromiseStream<KeyRange> readHotShard) {
ShardSizeBounds bounds;
ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack();
if (shardMetrics->get().present()) {
auto bytes = shardMetrics->get().get().metrics.bytes;
auto readBandwidthStatus = getReadBandwidthStatus(shardMetrics->get().get().metrics);
@ -259,21 +259,7 @@ ShardSizeBounds calculateShardSizeBounds(const KeyRange& keys,
} else {
ASSERT(false);
}
} else {
bounds.max.bytes = -1;
bounds.min.bytes = -1;
bounds.permittedError.bytes = -1;
bounds.max.bytesPerKSecond = bounds.max.infinity;
bounds.min.bytesPerKSecond = 0;
bounds.permittedError.bytesPerKSecond = bounds.permittedError.infinity;
bounds.max.bytesReadPerKSecond = bounds.max.infinity;
bounds.min.bytesReadPerKSecond = 0;
bounds.permittedError.bytesReadPerKSecond = bounds.permittedError.infinity;
}
bounds.max.iosPerKSecond = bounds.max.infinity;
bounds.min.iosPerKSecond = 0;
bounds.permittedError.iosPerKSecond = bounds.permittedError.infinity;
return bounds;
}

View File

@ -895,7 +895,7 @@ public:
if (maxPriority < SERVER_KNOBS->PRIORITY_TEAM_FAILED) {
std::pair<std::vector<ShardsAffectedByTeamFailure::Team>,
std::vector<ShardsAffectedByTeamFailure::Team>>
teams = self->shardsAffectedByTeamFailure->getTeamsFor(shards[i]);
teams = self->shardsAffectedByTeamFailure->getTeamsForFirstShard(shards[i]);
for (int j = 0; j < teams.first.size() + teams.second.size(); j++) {
// t is the team in primary DC or the remote DC
auto& t =

View File

@ -763,7 +763,7 @@ std::vector<DDShardInfo> DDMockTxnProcessor::getDDShardInfos() const {
KeyRangeRef curRange = it->range();
DDShardInfo info(curRange.begin);
auto teams = mgs->shardMapping->getTeamsFor(curRange);
auto teams = mgs->shardMapping->getTeamsForFirstShard(curRange);
if (!teams.first.empty() && !teams.second.empty()) {
CODE_PROBE(true, "Mock InitialDataDistribution In-Flight shard");
info.hasDest = true;
@ -816,7 +816,7 @@ Future<Void> DDMockTxnProcessor::removeStorageServer(const UID& serverID,
const Optional<UID>& tssPairID,
const MoveKeysLock& lock,
const DDEnabledState* ddEnabledState) const {
ASSERT(mgs->allShardRemovedFromServer(serverID));
ASSERT(mgs->allShardsRemovedFromServer(serverID));
mgs->allServers.erase(serverID);
return Void();
}
@ -862,16 +862,14 @@ Future<HealthMetrics> DDMockTxnProcessor::getHealthMetrics(bool detailed) const
return Future<HealthMetrics>();
}
// FIXME: finish implementation
Future<Standalone<VectorRef<KeyRef>>> DDMockTxnProcessor::splitStorageMetrics(
const KeyRange& keys,
const StorageMetrics& limit,
const StorageMetrics& estimated,
const Optional<int>& minSplitBytes) const {
return Future<Standalone<VectorRef<KeyRef>>>();
return mgs->splitStorageMetrics(keys, limit, estimated, minSplitBytes);
}
// FIXME: finish implementation
Future<std::pair<Optional<StorageMetrics>, int>> DDMockTxnProcessor::waitStorageMetrics(
const KeyRange& keys,
const StorageMetrics& min,
@ -879,7 +877,7 @@ Future<std::pair<Optional<StorageMetrics>, int>> DDMockTxnProcessor::waitStorage
const StorageMetrics& permittedError,
int shardLimit,
int expectedShardCount) const {
return Future<std::pair<Optional<StorageMetrics>, int>>();
return mgs->waitStorageMetrics(keys, min, max, permittedError, shardLimit, expectedShardCount);
}
// FIXME: finish implementation
@ -910,7 +908,7 @@ void DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params,
ASSERT(params.finishMoveKeysParallelismLock->take().isReady());
// get source and dest teams
auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsFor(params.keys);
auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsForFirstShard(params.keys);
ASSERT_NE(destTeams.size(), 0);
if (destTeams.front() != ShardsAffectedByTeamFailure::Team{ params.destinationTeam, true }) {

View File

@ -53,6 +53,20 @@
#include "fdbserver/DDSharedContext.h"
#include "flow/actorcompiler.h" // This must be the last #include.
ShardSizeBounds ShardSizeBounds::shardSizeBoundsBeforeTrack() {
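// Presumably bytes == -1 acts as a "size not yet sampled" sentinel: any real
// measurement (>= 0) falls outside [min, max] and immediately triggers a metrics
// refresh, while the infinite rate bounds disable bandwidth/IO-driven decisions
// until a real size is known.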
return ShardSizeBounds{
.max = StorageMetrics{ .bytes = -1,
.bytesPerKSecond = StorageMetrics::infinity,
.iosPerKSecond = StorageMetrics::infinity,
.bytesReadPerKSecond = StorageMetrics::infinity },
.min = StorageMetrics{ .bytes = -1, .bytesPerKSecond = 0, .iosPerKSecond = 0, .bytesReadPerKSecond = 0 },
.permittedError = StorageMetrics{ .bytes = -1,
.bytesPerKSecond = StorageMetrics::infinity,
.iosPerKSecond = StorageMetrics::infinity,
.bytesReadPerKSecond = StorageMetrics::infinity }
};
}
struct DDAudit {
DDAudit(UID id, KeyRange range, AuditType type)
: id(id), range(range), type(type), auditMap(AuditPhase::Invalid, allKeys.end), actors(true) {}
@ -286,8 +300,6 @@ public:
PromiseStream<RelocateShard> relocationProducer, relocationConsumer;
Reference<PhysicalShardCollection> physicalShardCollection;
StorageQuotaInfo storageQuotaInfo;
Promise<Void> initialized;
std::unordered_map<AuditType, std::vector<std::shared_ptr<DDAudit>>> audits;
@ -542,27 +554,6 @@ public:
}
};
ACTOR Future<Void> storageQuotaTracker(Database cx, StorageQuotaInfo* storageQuotaInfo) {
loop {
state Transaction tr(cx);
loop {
try {
state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
TraceEvent("StorageQuota_ReadCurrentQuotas").detail("Size", currentQuotas.size());
for (auto const kv : currentQuotas) {
Key const key = kv.key.removePrefix(storageQuotaPrefix);
uint64_t const quota = BinaryReader::fromStringRef<uint64_t>(kv.value, Unversioned());
storageQuotaInfo->quotaMap[key] = quota;
}
wait(delay(5.0));
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
}
// Periodically check and log the physicalShard status; clean up empty physicalShard;
ACTOR Future<Void> monitorPhysicalShardStatus(Reference<PhysicalShardCollection> self) {
ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA);
@ -683,16 +674,15 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
self->ddId,
&normalDDQueueErrors()));
actors.push_back(reportErrorsExcept(storageQuotaTracker(cx, &self->storageQuotaInfo),
"StorageQuotaTracker",
self->ddId,
&normalDDQueueErrors()));
if (ddIsTenantAware) {
actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorTenantMap(),
"DDTenantCacheMonitor",
self->ddId,
&normalDDQueueErrors()));
actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageQuota(),
"StorageQuotaTracker",
self->ddId,
&normalDDQueueErrors()));
actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageUsage(),
"StorageUsageTracker",
self->ddId,

View File

@ -202,7 +202,8 @@ class GlobalTagThrottlerImpl {
for (const auto& [id, _] : throughput) {
result += getCurrentCost(id, tag).orDefault(0);
}
TraceEvent("GlobalTagThrottler_GetCurrentCost").detail("Tag", printable(tag)).detail("Cost", result);
// FIXME: Disabled due to noisy trace events. Fix the noise and re-enable.
//TraceEvent("GlobalTagThrottler_GetCurrentCost").detail("Tag", printable(tag)).detail("Cost", result);
return result;
}
@ -235,10 +236,13 @@ class GlobalTagThrottlerImpl {
return 1.0;
}
auto const transactionRate = stats.get().getTransactionRate();
// FIXME: Disabled due to noisy trace events. Fix the noise and re-enable.
/*
TraceEvent("GlobalTagThrottler_GetAverageTransactionCost")
.detail("Tag", tag)
.detail("TransactionRate", transactionRate)
.detail("Cost", cost);
*/
if (transactionRate == 0.0) {
return 1.0;
} else {

View File

@ -154,7 +154,7 @@ struct GrvProxyStats {
return int64_t(100 * this->percentageOfBatchGRVQueueProcessed);
});
logger = traceCounters("GrvProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "GrvProxyMetrics");
logger = cc.traceCounters("GrvProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "GrvProxyMetrics");
for (int i = 0; i < FLOW_KNOBS->BASIC_LOAD_BALANCE_BUCKETS; i++) {
requestBuckets.push_back(0);
}
@ -459,9 +459,9 @@ void dropRequestFromQueue(Deque<GetReadVersionRequest>* queue, GrvProxyStats* st
// Put a GetReadVersion request into the queue corresponding to its priority.
ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo> const> db,
SpannedDeque<GetReadVersionRequest>* systemQueue,
SpannedDeque<GetReadVersionRequest>* defaultQueue,
SpannedDeque<GetReadVersionRequest>* batchQueue,
Deque<GetReadVersionRequest>* systemQueue,
Deque<GetReadVersionRequest>* defaultQueue,
Deque<GetReadVersionRequest>* batchQueue,
FutureStream<GetReadVersionRequest> readVersionRequests,
PromiseStream<Void> GRVTimer,
double* lastGRVTime,
@ -531,7 +531,6 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
stats->txnSystemPriorityStartIn += req.transactionCount;
++stats->systemGRVQueueSize;
systemQueue->push_back(req);
// systemQueue->span.addParent(req.spanContext);
} else if (req.priority >= TransactionPriority::DEFAULT) {
++stats->txnRequestIn;
stats->txnStartIn += req.transactionCount;
@ -542,7 +541,6 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
} else {
defaultQueue->push_back(req);
}
// defaultQueue->span.addParent(req.spanContext);
} else {
// Return error for batch_priority GRV requests
int64_t proxiesCount = std::max((int)db->get().client.grvProxies.size(), 1);
@ -559,7 +557,6 @@ ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo>
} else {
batchQueue->push_back(req);
}
// batchQueue->span.addParent(req.spanContext);
}
}
}
@ -607,7 +604,7 @@ ACTOR Future<Void> lastCommitUpdater(GrvProxyData* self, PromiseStream<Future<Vo
}
}
ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(SpanContext parentSpan,
ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(std::vector<SpanContext> spanContexts,
GrvProxyData* grvProxyData,
uint32_t flags,
Optional<UID> debugID,
@ -620,7 +617,10 @@ ACTOR Future<GetReadVersionReply> getLiveCommittedVersion(SpanContext parentSpan
// before the request returns, so it is committed. (2) No proxy on our list reported committed a higher version
// before this request was received, because then its committedVersion would have been higher,
// and no other proxy could have already committed anything without first ending the epoch
state Span span("GP:getLiveCommittedVersion"_loc, parentSpan);
state Span span("GP:getLiveCommittedVersion"_loc);
for (const SpanContext& spanContext : spanContexts) {
span.addLink(spanContext);
}
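// A batched GRV covers many independent client transactions, so the server-side
// span cannot have a single parent; each client's SpanContext is attached as a
// link instead, preserving causality for every request in the batch.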
++grvProxyData->stats.txnStartBatch;
state double grvStart = now();
@ -826,15 +826,14 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
state GrvTransactionRateInfo batchRateInfo(0);
state GrvProxyTransactionTagThrottler tagThrottler;
state SpannedDeque<GetReadVersionRequest> systemQueue("GP:transactionStarterSystemQueue"_loc);
state SpannedDeque<GetReadVersionRequest> defaultQueue("GP:transactionStarterDefaultQueue"_loc);
state SpannedDeque<GetReadVersionRequest> batchQueue("GP:transactionStarterBatchQueue"_loc);
state Deque<GetReadVersionRequest> systemQueue;
state Deque<GetReadVersionRequest> defaultQueue;
state Deque<GetReadVersionRequest> batchQueue;
state TransactionTagMap<uint64_t> transactionTagCounter;
state PrioritizedTransactionTagMap<ClientTagThrottleLimits> clientThrottledTags;
state PromiseStream<double> normalGRVLatency;
// state Span span;
state int64_t midShardSize = SERVER_KNOBS->MIN_SHARD_BYTES;
getCurrentLineage()->modify(&TransactionLineage::operation) =
@ -911,7 +910,7 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
uint32_t defaultQueueSize = defaultQueue.size();
uint32_t batchQueueSize = batchQueue.size();
while (requestsToStart < SERVER_KNOBS->START_TRANSACTION_MAX_REQUESTS_TO_START) {
SpannedDeque<GetReadVersionRequest>* transactionQueue;
Deque<GetReadVersionRequest>* transactionQueue;
if (!systemQueue.empty()) {
transactionQueue = &systemQueue;
} else if (!defaultQueue.empty()) {
@ -921,7 +920,6 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
} else {
break;
}
// transactionQueue->span.swap(span);
auto& req = transactionQueue->front();
int tc = req.transactionCount;
@ -1017,7 +1015,13 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
int batchGRVProcessed = 0;
for (int i = 0; i < start.size(); i++) {
if (start[i].size()) {
Future<GetReadVersionReply> readVersionReply = getLiveCommittedVersion(SpanContext(),
std::vector<SpanContext> spanContexts;
spanContexts.reserve(start[i].size());
for (const GetReadVersionRequest& request : start[i]) {
spanContexts.push_back(request.spanContext);
}
Future<GetReadVersionReply> readVersionReply = getLiveCommittedVersion(spanContexts,
grvProxyData,
i,
debugID,
@ -1041,7 +1045,6 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
batchGRVProcessed += batchPriTransactionsStarted[i];
}
}
// span = Span(span.location);
grvProxyData->stats.percentageOfDefaultGRVQueueProcessed =
defaultQueueSize ? (double)defaultGRVProcessed / defaultQueueSize : 1;

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "fdbclient/Knobs.h"
#include "fdbserver/GrvProxyTransactionTagThrottler.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // must be last include
@ -28,6 +29,10 @@ void GrvProxyTransactionTagThrottler::DelayedRequest::updateProxyTagThrottledDur
req.proxyTagThrottledDuration = now() - startTime;
}
bool GrvProxyTransactionTagThrottler::DelayedRequest::isMaxThrottled() const {
return now() - startTime > CLIENT_KNOBS->PROXY_MAX_TAG_THROTTLE_DURATION;
}
void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) {
if (rateInfo.present()) {
rateInfo.get().setRate(rate);
@ -36,6 +41,20 @@ void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) {
}
}
bool GrvProxyTransactionTagThrottler::TagQueue::isMaxThrottled() const {
return !requests.empty() && requests.front().isMaxThrottled();
}
void GrvProxyTransactionTagThrottler::TagQueue::rejectRequests() {
CODE_PROBE(true, "GrvProxyTransactionTagThrottler rejecting requests");
while (!requests.empty()) {
auto& delayedReq = requests.front();
delayedReq.updateProxyTagThrottledDuration();
delayedReq.req.reply.sendError(proxy_tag_throttled());
requests.pop_front();
}
}
void GrvProxyTransactionTagThrottler::updateRates(TransactionTagMap<double> const& newRates) {
for (const auto& [tag, rate] : newRates) {
auto it = queues.find(tag);
@ -73,6 +92,7 @@ void GrvProxyTransactionTagThrottler::addRequest(GetReadVersionRequest const& re
// SERVER_KNOBS->ENFORCE_TAG_THROTTLING_ON_PROXIES is enabled, there may be
// unexpected behaviour, because only one tag is used for throttling.
TraceEvent(SevWarnAlways, "GrvProxyTransactionTagThrottler_MultipleTags")
.suppressFor(1.0)
.detail("NumTags", req.tags.size())
.detail("UsingTag", printable(tag));
}
@ -80,8 +100,8 @@ void GrvProxyTransactionTagThrottler::addRequest(GetReadVersionRequest const& re
}
void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed,
SpannedDeque<GetReadVersionRequest>& outBatchPriority,
SpannedDeque<GetReadVersionRequest>& outDefaultPriority) {
Deque<GetReadVersionRequest>& outBatchPriority,
Deque<GetReadVersionRequest>& outDefaultPriority) {
// Pointer to a TagQueue with some extra metadata stored alongside
struct TagQueueHandle {
// Store pointers here to avoid frequent std::unordered_map lookups
@ -140,6 +160,11 @@ void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed,
// Cannot release any more transaction from this tag (don't push the tag queue handle back into
// pqOfQueues)
CODE_PROBE(true, "GrvProxyTransactionTagThrottler throttling transaction");
if (tagQueueHandle.queue->isMaxThrottled()) {
// Requests in this queue have been throttled too long and errors
// should be sent to clients.
tagQueueHandle.queue->rejectRequests();
}
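// Rejecting instead of queueing indefinitely bounds client-visible latency: once
// the oldest request has waited past PROXY_MAX_TAG_THROTTLE_DURATION, the whole
// queue is failed with proxy_tag_throttled so clients can back off and retry.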
break;
} else {
if (tagQueueHandle.nextSeqNo < nextQueueSeqNo) {
@ -255,8 +280,8 @@ ACTOR static Future<Void> mockFifoClient(GrvProxyTransactionTagThrottler* thrott
}
ACTOR static Future<Void> mockServer(GrvProxyTransactionTagThrottler* throttler) {
state SpannedDeque<GetReadVersionRequest> outBatchPriority("TestGrvProxyTransactionTagThrottler_Batch"_loc);
state SpannedDeque<GetReadVersionRequest> outDefaultPriority("TestGrvProxyTransactionTagThrottler_Default"_loc);
state Deque<GetReadVersionRequest> outBatchPriority;
state Deque<GetReadVersionRequest> outDefaultPriority;
loop {
state double elapsed = (0.009 + 0.002 * deterministicRandom()->random01());
wait(delay(elapsed));
@ -379,8 +404,8 @@ TEST_CASE("/GrvProxyTransactionTagThrottler/Cleanup2") {
throttler.updateRates(TransactionTagMap<double>{});
ASSERT_EQ(throttler.size(), 1);
{
SpannedDeque<GetReadVersionRequest> outBatchPriority("TestGrvProxyTransactionTagThrottler_Batch"_loc);
SpannedDeque<GetReadVersionRequest> outDefaultPriority("TestGrvProxyTransactionTagThrottler_Default"_loc);
Deque<GetReadVersionRequest> outBatchPriority;
Deque<GetReadVersionRequest> outDefaultPriority;
throttler.releaseTransactions(0.1, outBatchPriority, outDefaultPriority);
}
// Calling updates cleans up the queues in throttler

View File

@ -347,8 +347,8 @@ public:
Randomize::False,
g_network->isSimulated() ? IsSimulated::True : IsSimulated::False);
}
logger = traceCounters(
"LocalConfigurationMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "LocalConfigurationMetrics");
logger = cc.traceCounters(
"LocalConfigurationMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "LocalConfigurationMetrics");
}
Future<Void> addChanges(Standalone<VectorRef<VersionedConfigMutationRef>> changes,

View File

@ -190,15 +190,14 @@ struct LogRouterData {
});
specialCounter(cc, "Generation", [this]() { return this->generation; });
specialCounter(cc, "ActivePeekStreams", [this]() { return this->activePeekStreams; });
logger = traceCounters("LogRouterMetrics",
dbgid,
SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
&cc,
"LogRouterMetrics",
[this](TraceEvent& te) {
te.detail("PrimaryPeekLocation", this->primaryPeekLocation);
te.detail("RouterTag", this->routerTag.toString());
});
logger = cc.traceCounters("LogRouterMetrics",
dbgid,
SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
"LogRouterMetrics",
[this](TraceEvent& te) {
te.detail("PrimaryPeekLocation", this->primaryPeekLocation);
te.detail("RouterTag", this->routerTag.toString());
});
}
};

View File

@ -0,0 +1,623 @@
/*
* MockGlobalState.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/MockGlobalState.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/DataDistribution.actor.h"
#include "flow/actorcompiler.h"
class MockGlobalStateImpl {
public:
ACTOR static Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(MockGlobalState* mgs,
KeyRange keys,
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError,
int shardLimit,
int expectedShardCount) {
state TenantInfo tenantInfo;
loop {
auto locations = mgs->getKeyRangeLocations(tenantInfo,
keys,
shardLimit,
Reverse::False,
SpanContext(),
Optional<UID>(),
UseProvisionalProxies::False,
0)
.get();
TraceEvent(SevDebug, "MGSWaitStorageMetrics").detail("Phase", "GetLocation");
// NOTE(xwang): in the native API, there's code handling the non-equal situation, but I think in the mock
// world there shouldn't be any delay in updating the locations.
ASSERT_EQ(expectedShardCount, locations.size());
Optional<StorageMetrics> res =
wait(::waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
if (res.present()) {
return std::make_pair(res, -1);
}
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
}
// SOMEDAY: reuse the NativeAPI implementation
ACTOR static Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(MockGlobalState* mgs,
KeyRange keys,
StorageMetrics limit,
StorageMetrics estimated,
Optional<int> minSplitBytes) {
state TenantInfo tenantInfo;
loop {
state std::vector<KeyRangeLocationInfo> locations =
mgs->getKeyRangeLocations(tenantInfo,
keys,
CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT,
Reverse::False,
SpanContext(),
Optional<UID>(),
UseProvisionalProxies::False,
0)
.get();
// Same solution as NativeAPI::splitStorageMetrics: wait for some merges to finish
if (locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) {
wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
}
Optional<Standalone<VectorRef<KeyRef>>> results =
wait(splitStorageMetricsWithLocations(locations, keys, limit, estimated, minSplitBytes));
if (results.present()) {
return results.get();
}
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
}
};
class MockStorageServerImpl {
public:
ACTOR static Future<Void> waitMetricsTenantAware(MockStorageServer* self, WaitMetricsRequest req) {
if (req.tenantInfo.present() && req.tenantInfo.get().tenantId != TenantInfo::INVALID_TENANT) {
// TODO(xwang): add support for tenant tests; search for the tenant entry
Optional<TenantMapEntry> entry;
Optional<Key> tenantPrefix = entry.map<Key>([](TenantMapEntry e) { return e.prefix; });
if (tenantPrefix.present()) {
UNREACHABLE();
// req.keys = req.keys.withPrefix(tenantPrefix.get(), req.arena);
}
}
if (!self->isReadable(req.keys)) {
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
wait(self->metrics.waitMetrics(req, delayJittered(SERVER_KNOBS->STORAGE_METRIC_TIMEOUT)));
}
return Void();
}
};
bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) {
auto ranges = serverKeys.intersectingRanges(range);
ASSERT(!ranges.empty()); // at least the range is allKeys
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
if (it->cvalue().status != status)
return false;
}
return true;
}
void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status, bool restrictSize) {
auto ranges = serverKeys.intersectingRanges(range);
ASSERT(!ranges.empty());
if (ranges.begin().range().contains(range)) {
CODE_PROBE(true, "Implicitly split single shard to 3 pieces");
threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize);
return;
}
if (ranges.begin().begin() < range.begin) {
CODE_PROBE(true, "Implicitly split begin range to 2 pieces");
twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize);
}
if (ranges.end().end() > range.end) {
CODE_PROBE(true, "Implicitly split end range to 2 pieces");
twoWayShardSplitting(ranges.end().range(), range.end, ranges.end().cvalue().shardSize, restrictSize);
}
ranges = serverKeys.containedRanges(range);
// now the boundaries must be aligned
ASSERT(ranges.begin().begin() == range.begin);
ASSERT(ranges.end().end() == range.end);
uint64_t newSize = 0;
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
newSize += it->cvalue().shardSize;
}
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
auto oldStatus = it.value().status;
if (isStatusTransitionValid(oldStatus, status)) {
it.value() = ShardInfo{ status, newSize };
} else if (oldStatus == MockShardStatus::COMPLETED && status == MockShardStatus::INFLIGHT) {
CODE_PROBE(true, "Shard already on server");
} else {
TraceEvent(SevError, "MockShardStatusTransitionError")
.detail("From", oldStatus)
.detail("To", status)
.detail("ID", id)
.detail("KeyBegin", range.begin.toHexString())
.detail("KeyEnd", range.begin.toHexString());
}
}
serverKeys.coalesce(range);
}
// split the outer range [a, d) based on the inner range's boundary [b, c). The result would be [a,b), [b,c),
// [c,d). The sizes of the new shards are randomly split from the old size of [a, d)
void MockStorageServer::threeWayShardSplitting(KeyRangeRef outerRange,
KeyRangeRef innerRange,
uint64_t outerRangeSize,
bool restrictSize) {
ASSERT(outerRange.contains(innerRange));
Key left = outerRange.begin;
// randomly generate 3 shard sizes; the caller guarantees that the min and max parameters are always valid.
int leftSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - 2 * SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
int midSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - leftSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
int rightSize =
restrictSize ? outerRangeSize - leftSize - midSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
serverKeys.insert(innerRange, { serverKeys[left].status, (uint64_t)midSize });
serverKeys[left].shardSize = leftSize;
serverKeys[innerRange.end].shardSize = rightSize;
}
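// A worked example of the arithmetic above (hypothetical sizes, restrictSize == true):
// with outerRangeSize = 100 and MIN_SHARD_BYTES = 10,
//   leftSize  is drawn from [10, 81)  -> say 30  (leaves room for two more minimum shards)
//   midSize   is drawn from [10, 61)  -> say 25  (100 - 30 - 10 + 1 = 61)
//   rightSize = 100 - 30 - 25 = 45               (the remainder, so the sizes sum to 100)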
// split the range [a,c) at split point b. The result would be [a, b), [b, c). The sizes of the new
// shards are randomly split from the old size of [a, c)
void MockStorageServer::twoWayShardSplitting(KeyRangeRef range,
KeyRef splitPoint,
uint64_t rangeSize,
bool restrictSize) {
Key left = range.begin;
// randomly generate 2 shard sizes; the caller guarantees that the min and max parameters are always valid.
int leftSize = deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? rangeSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1
: SERVER_KNOBS->MAX_SHARD_BYTES);
int rightSize =
restrictSize ? rangeSize - leftSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
serverKeys.rawInsert(splitPoint, { serverKeys[left].status, (uint64_t)rightSize });
serverKeys[left].shardSize = leftSize;
}
void MockStorageServer::removeShard(KeyRangeRef range) {
auto ranges = serverKeys.containedRanges(range);
ASSERT(ranges.begin().range() == range);
serverKeys.rawErase(range);
}
uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const {
auto ranges = serverKeys.intersectingRanges(range);
uint64_t totalSize = 0;
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
totalSize += it->cvalue().shardSize;
}
return totalSize;
}
void MockStorageServer::addActor(Future<Void> future) {
actors.add(future);
}
void MockStorageServer::getSplitPoints(const SplitRangeRequest& req) {}
Future<Void> MockStorageServer::waitMetricsTenantAware(const WaitMetricsRequest& req) {
return MockStorageServerImpl::waitMetricsTenantAware(this, req);
}
void MockStorageServer::getStorageMetrics(const GetStorageMetricsRequest& req) {}
Future<Void> MockStorageServer::run() {
ssi.locality = LocalityData(Optional<Standalone<StringRef>>(),
Standalone<StringRef>(deterministicRandom()->randomUniqueID().toString()),
Standalone<StringRef>(deterministicRandom()->randomUniqueID().toString()),
Optional<Standalone<StringRef>>());
ssi.initEndpoints();
ssi.startAcceptingRequests();
TraceEvent("MockStorageServerStart").detail("Address", ssi.address());
return serveStorageMetricsRequests(this, ssi);
}
void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) {
ASSERT(conf.storageTeamSize > 0);
configuration = conf;
std::vector<UID> serverIds;
for (int i = 1; i <= conf.storageTeamSize; ++i) {
UID id = indexToUID(i);
serverIds.push_back(id);
allServers[id] = MockStorageServer(id, defaultDiskSpace);
allServers[id].serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 });
}
shardMapping->assignRangeToTeams(allKeys, { Team(serverIds, true) });
}
void MockGlobalState::addStorageServer(StorageServerInterface server, uint64_t diskSpace) {
allServers[server.id()] = MockStorageServer(server, diskSpace);
}
bool MockGlobalState::serverIsSourceForShard(const UID& serverId, KeyRangeRef shard, bool inFlightShard) {
if (!allServers.count(serverId))
return false;
// check serverKeys
auto& mss = allServers.at(serverId);
if (!mss.allShardStatusEqual(shard, MockShardStatus::COMPLETED)) {
return false;
}
// check keyServers
auto teams = shardMapping->getTeamsForFirstShard(shard);
if (inFlightShard) {
return std::any_of(teams.second.begin(), teams.second.end(), [&serverId](const Team& team) {
return team.hasServer(serverId);
});
}
return std::any_of(
teams.first.begin(), teams.first.end(), [&serverId](const Team& team) { return team.hasServer(serverId); });
}
bool MockGlobalState::serverIsDestForShard(const UID& serverId, KeyRangeRef shard) {
if (!allServers.count(serverId))
return false;
// check serverKeys
auto& mss = allServers.at(serverId);
if (!mss.allShardStatusEqual(shard, MockShardStatus::INFLIGHT)) {
return false;
}
// check keyServers
auto teams = shardMapping->getTeamsForFirstShard(shard);
return !teams.second.empty() && std::any_of(teams.first.begin(), teams.first.end(), [&serverId](const Team& team) {
return team.hasServer(serverId);
});
}
bool MockGlobalState::allShardsRemovedFromServer(const UID& serverId) {
return allServers.count(serverId) && shardMapping->getNumberOfShards(serverId) == 0;
}
Future<std::pair<Optional<StorageMetrics>, int>> MockGlobalState::waitStorageMetrics(
const KeyRange& keys,
const StorageMetrics& min,
const StorageMetrics& max,
const StorageMetrics& permittedError,
int shardLimit,
int expectedShardCount) {
return MockGlobalStateImpl::waitStorageMetrics(
this, keys, min, max, permittedError, shardLimit, expectedShardCount);
}
Reference<LocationInfo> buildLocationInfo(const std::vector<StorageServerInterface>& interfaces) {
// construct the location info with the servers
std::vector<Reference<ReferencedInterface<StorageServerInterface>>> serverRefs;
serverRefs.reserve(interfaces.size());
for (const auto& interf : interfaces) {
serverRefs.push_back(makeReference<ReferencedInterface<StorageServerInterface>>(interf));
}
return makeReference<LocationInfo>(serverRefs);
}
Future<KeyRangeLocationInfo> MockGlobalState::getKeyLocation(TenantInfo tenant,
Key key,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Reverse isBackward,
Version version) {
if (isBackward) {
// DD never asks for a backward range.
UNREACHABLE();
}
ASSERT(key < allKeys.end);
GetKeyServerLocationsReply rep;
KeyRange single = singleKeyRange(key);
auto teamPair = shardMapping->getTeamsForFirstShard(single);
auto& srcTeam = teamPair.second.empty() ? teamPair.first : teamPair.second;
ASSERT_EQ(srcTeam.size(), 1);
rep.results.emplace_back(single, extractStorageServerInterfaces(srcTeam.front().servers));
return KeyRangeLocationInfo(
rep.tenantEntry,
KeyRange(toPrefixRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena),
buildLocationInfo(rep.results[0].second));
}
Future<std::vector<KeyRangeLocationInfo>> MockGlobalState::getKeyRangeLocations(
TenantInfo tenant,
KeyRange keys,
int limit,
Reverse reverse,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Version version) {
if (reverse) {
// DD never asks for a backward range.
ASSERT(false);
}
ASSERT(keys.begin < keys.end);
GetKeyServerLocationsReply rep;
auto ranges = shardMapping->intersectingRanges(keys);
auto it = ranges.begin();
for (int count = 0; it != ranges.end() && count < limit; ++it, ++count) {
auto teamPair = shardMapping->getTeamsFor(it->begin());
auto& srcTeam = teamPair.second.empty() ? teamPair.first : teamPair.second;
ASSERT_EQ(srcTeam.size(), 1);
rep.results.emplace_back(it->range(), extractStorageServerInterfaces(srcTeam.front().servers));
}
CODE_PROBE(it != ranges.end(), "getKeyRangeLocations is limited", probe::decoration::rare);
std::vector<KeyRangeLocationInfo> results;
for (int shard = 0; shard < rep.results.size(); shard++) {
results.emplace_back(rep.tenantEntry,
(toPrefixRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys),
buildLocationInfo(rep.results[shard].second));
}
return results;
}
std::vector<StorageServerInterface> MockGlobalState::extractStorageServerInterfaces(const std::vector<UID>& ids) const {
std::vector<StorageServerInterface> interfaces;
for (auto& id : ids) {
interfaces.emplace_back(allServers.at(id).ssi);
}
return interfaces;
}
Future<Standalone<VectorRef<KeyRef>>> MockGlobalState::splitStorageMetrics(const KeyRange& keys,
const StorageMetrics& limit,
const StorageMetrics& estimated,
const Optional<int>& minSplitBytes) {
return MockGlobalStateImpl::splitStorageMetrics(this, keys, limit, estimated, minSplitBytes);
}
TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 3;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
for (int i = 1; i <= dbConfig.storageTeamSize; ++i) {
auto id = MockGlobalState::indexToUID(i);
std::cout << "Check server " << i << "\n";
ASSERT(mgs->serverIsSourceForShard(id, allKeys));
ASSERT(mgs->allServers.at(id).sumRangeSize(allKeys) == 0);
}
return Void();
}
struct MockGlobalStateTester {
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, x2), [x2, r0.end)
void testThreeWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.ranges().begin();
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
MockShardStatus oldStatus = it.cvalue().status;
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
Key x1 = keyAfter(it->range().begin);
Key x2 = keyAfter(x1);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.threeWayShardSplitting(outerRange, KeyRangeRef(x1, x2), oldSize, false);
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, x2));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x2, outerRange.end));
ranges.pop_front();
ASSERT(ranges.empty());
}
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, r0.end)
void testTwoWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.nthRange(0);
MockShardStatus oldStatus = it.cvalue().status;
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
Key x1 = keyAfter(it->range().begin);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.twoWayShardSplitting(it->range(), x1, oldSize, false);
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, outerRange.end));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.empty());
}
KeyRangeLocationInfo getKeyLocationInfo(KeyRef key, std::shared_ptr<MockGlobalState> mgs) {
return mgs
->getKeyLocation(
TenantInfo(), key, SpanContext(), Optional<UID>(), UseProvisionalProxies::False, Reverse::False, 0)
.get();
}
std::vector<KeyRangeLocationInfo> getKeyRangeLocations(KeyRangeRef keys,
int limit,
std::shared_ptr<MockGlobalState> mgs) {
return mgs
->getKeyRangeLocations(TenantInfo(),
keys,
limit,
Reverse::False,
SpanContext(),
Optional<UID>(),
UseProvisionalProxies::False,
0)
.get();
}
};
TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 1;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
MockGlobalStateTester tester;
auto& mss = mgs->allServers.at(MockGlobalState::indexToUID(1));
std::cout << "Test 3-way splitting...\n";
tester.testThreeWaySplitFirstRange(mss);
std::cout << "Test 2-way splitting...\n";
mss.serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 }); // reset to empty
tester.testTwoWaySplitFirstRange(mss);
return Void();
}
namespace {
inline bool locationInfoEqualsToTeam(Reference<LocationInfo> loc, const std::vector<UID>& ids) {
return loc->locations()->size() == ids.size() &&
std::all_of(ids.begin(), ids.end(), [loc](const UID& id) { return loc->locations()->hasInterface(id); });
}
} // namespace
TEST_CASE("/MockGlobalState/MockStorageServer/GetKeyLocations") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 1;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
// add one empty server
mgs->addStorageServer(StorageServerInterface(mgs->indexToUID(mgs->allServers.size() + 1)));
// define 3 ranges:
// team 1 (UID 1,2,...,n-1):[begin, 1.0), [2.0, end)
// team 2 (UID 2,3,...n-1, n): [1.0, 2.0)
ShardsAffectedByTeamFailure::Team team1, team2;
for (int i = 0; i < mgs->allServers.size() - 1; ++i) {
UID id = mgs->indexToUID(i + 1);
team1.servers.emplace_back(id);
id = mgs->indexToUID(i + 2);
team2.servers.emplace_back(id);
}
Key one = doubleToTestKey(1.0), two = doubleToTestKey(2.0);
std::vector<KeyRangeRef> ranges{ KeyRangeRef(allKeys.begin, one),
KeyRangeRef(one, two),
KeyRangeRef(two, allKeys.end) };
mgs->shardMapping->assignRangeToTeams(ranges[0], { team1 });
mgs->shardMapping->assignRangeToTeams(ranges[1], { team2 });
mgs->shardMapping->assignRangeToTeams(ranges[2], { team1 });
// query key location
MockGlobalStateTester tester;
// -- team 1
Key testKey = doubleToTestKey(0.5);
auto locInfo = tester.getKeyLocationInfo(testKey, mgs);
ASSERT(locationInfoEqualsToTeam(locInfo.locations, team1.servers));
// -- team 2
testKey = doubleToTestKey(1.3);
locInfo = tester.getKeyLocationInfo(testKey, mgs);
ASSERT(locationInfoEqualsToTeam(locInfo.locations, team2.servers));
// query range location
testKey = doubleToTestKey(3.0);
// team 1,2,1
auto locInfos = tester.getKeyRangeLocations(KeyRangeRef(allKeys.begin, testKey), 100, mgs);
ASSERT(locInfos.size() == 3);
ASSERT(locInfos[0].range == ranges[0]);
ASSERT(locationInfoEqualsToTeam(locInfos[0].locations, team1.servers));
ASSERT(locInfos[1].range == ranges[1]);
ASSERT(locationInfoEqualsToTeam(locInfos[1].locations, team2.servers));
ASSERT(locInfos[2].range == KeyRangeRef(ranges[2].begin, testKey));
ASSERT(locationInfoEqualsToTeam(locInfos[2].locations, team1.servers));
// team 1,2
locInfos = tester.getKeyRangeLocations(KeyRangeRef(allKeys.begin, testKey), 2, mgs);
ASSERT(locInfos.size() == 2);
ASSERT(locInfos[0].range == ranges[0]);
ASSERT(locationInfoEqualsToTeam(locInfos[0].locations, team1.servers));
ASSERT(locInfos[1].range == ranges[1]);
ASSERT(locationInfoEqualsToTeam(locInfos[1].locations, team2.servers));
return Void();
}
TEST_CASE("/MockGlobalState/MockStorageServer/WaitStorageMetricsRequest") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 1;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("WaitStorageMetricsRequestUnitTestConfig").detail("Config", dbConfig.toString());
state std::shared_ptr<MockGlobalState> mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
state ActorCollection actors;
ActorCollection* ptr = &actors; // get around ACTOR syntax restriction
std::for_each(mgs->allServers.begin(), mgs->allServers.end(), [ptr](auto& server) {
ptr->add(server.second.run());
IFailureMonitor::failureMonitor().setStatus(server.second.ssi.address(), FailureStatus(false));
server.second.metrics.byteSample.sample.insert("something"_sr, 500000);
});
KeyRange testRange = allKeys;
ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack();
std::pair<Optional<StorageMetrics>, int> res =
wait(mgs->waitStorageMetrics(testRange, bounds.min, bounds.max, bounds.permittedError, 1, 1));
// std::cout << "get result " << res.second << "\n";
// std::cout << "get byte "<< res.first.get().bytes << "\n";
ASSERT_EQ(res.second, -1); // a valid result always returns -1; a strange contract, though.
ASSERT_EQ(res.first.get().bytes, 500000);
return Void();
}

View File

@ -1,281 +0,0 @@
/*
* MockGlobalState.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/MockGlobalState.h"
bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) {
auto ranges = serverKeys.intersectingRanges(range);
ASSERT(!ranges.empty()); // at least the range is allKeys
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
if (it->cvalue().status != status)
return false;
}
return true;
}
void MockStorageServer::setShardStatus(KeyRangeRef range, MockShardStatus status, bool restrictSize) {
auto ranges = serverKeys.intersectingRanges(range);
ASSERT(!ranges.empty());
if (ranges.begin().range().contains(range)) {
CODE_PROBE(true, "Implicitly split single shard to 3 pieces");
threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize);
return;
}
if (ranges.begin().begin() < range.begin) {
CODE_PROBE(true, "Implicitly split begin range to 2 pieces");
twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize);
}
if (ranges.end().end() > range.end) {
CODE_PROBE(true, "Implicitly split end range to 2 pieces");
twoWayShardSplitting(ranges.end().range(), range.end, ranges.end().cvalue().shardSize, restrictSize);
}
ranges = serverKeys.containedRanges(range);
// now the boundary must be aligned
ASSERT(ranges.begin().begin() == range.begin);
ASSERT(ranges.end().end() == range.end);
uint64_t newSize = 0;
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
newSize += it->cvalue().shardSize;
}
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
auto oldStatus = it.value().status;
if (isStatusTransitionValid(oldStatus, status)) {
it.value() = ShardInfo{ status, newSize };
} else if (oldStatus == MockShardStatus::COMPLETED && status == MockShardStatus::INFLIGHT) {
CODE_PROBE(true, "Shard already on server");
} else {
TraceEvent(SevError, "MockShardStatusTransitionError")
.detail("From", oldStatus)
.detail("To", status)
.detail("ID", id)
.detail("KeyBegin", range.begin.toHexString())
.detail("KeyEnd", range.begin.toHexString());
}
}
serverKeys.coalesce(range);
}
// split the out range [a, d) based on the inner range's boundary [b, c). The result would be [a,b), [b,c), [c,d). The
// size of the new shards are randomly split from old size of [a, d)
void MockStorageServer::threeWayShardSplitting(KeyRangeRef outerRange,
KeyRangeRef innerRange,
uint64_t outerRangeSize,
bool restrictSize) {
ASSERT(outerRange.contains(innerRange));
Key left = outerRange.begin;
// random generate 3 shard sizes, the caller guarantee that the min, max parameters are always valid.
int leftSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - 2 * SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
int midSize = deterministicRandom()->randomInt(
SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? outerRangeSize - leftSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1 : SERVER_KNOBS->MAX_SHARD_BYTES);
int rightSize =
restrictSize ? outerRangeSize - leftSize - midSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
serverKeys.insert(innerRange, { serverKeys[left].status, (uint64_t)midSize });
serverKeys[left].shardSize = leftSize;
serverKeys[innerRange.end].shardSize = rightSize;
}
// split the range [a,c) with split point b. The result would be [a, b), [b, c). The
// size of the new shards are randomly split from old size of [a, c)
void MockStorageServer::twoWayShardSplitting(KeyRangeRef range,
KeyRef splitPoint,
uint64_t rangeSize,
bool restrictSize) {
Key left = range.begin;
// random generate 3 shard sizes, the caller guarantee that the min, max parameters are always valid.
int leftSize = deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES,
restrictSize ? rangeSize - SERVER_KNOBS->MIN_SHARD_BYTES + 1
: SERVER_KNOBS->MAX_SHARD_BYTES);
int rightSize =
restrictSize ? rangeSize - leftSize
: deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, SERVER_KNOBS->MAX_SHARD_BYTES);
serverKeys.rawInsert(splitPoint, { serverKeys[left].status, (uint64_t)rightSize });
serverKeys[left].shardSize = leftSize;
}
void MockStorageServer::removeShard(KeyRangeRef range) {
auto ranges = serverKeys.containedRanges(range);
ASSERT(ranges.begin().range() == range);
serverKeys.rawErase(range);
}
uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const {
auto ranges = serverKeys.intersectingRanges(range);
uint64_t totalSize = 0;
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
totalSize += it->cvalue().shardSize;
}
return totalSize;
}
void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) {
ASSERT(conf.storageTeamSize > 0);
configuration = conf;
std::vector<UID> serverIds;
for (int i = 1; i <= conf.storageTeamSize; ++i) {
UID id = indexToUID(i);
serverIds.push_back(id);
allServers[id] = MockStorageServer(id, defaultDiskSpace);
allServers[id].serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 });
}
shardMapping->assignRangeToTeams(allKeys, { Team(serverIds, true) });
}
void MockGlobalState::addStorageServer(StorageServerInterface server, uint64_t diskSpace) {
allServers[server.id()] = MockStorageServer(server, diskSpace);
}
bool MockGlobalState::serverIsSourceForShard(const UID& serverId, KeyRangeRef shard, bool inFlightShard) {
if (!allServers.count(serverId))
return false;
// check serverKeys
auto& mss = allServers.at(serverId);
if (!mss.allShardStatusEqual(shard, MockShardStatus::COMPLETED)) {
return false;
}
// check keyServers
auto teams = shardMapping->getTeamsFor(shard);
if (inFlightShard) {
return std::any_of(teams.second.begin(), teams.second.end(), [&serverId](const Team& team) {
return team.hasServer(serverId);
});
}
return std::any_of(
teams.first.begin(), teams.first.end(), [&serverId](const Team& team) { return team.hasServer(serverId); });
}
bool MockGlobalState::serverIsDestForShard(const UID& serverId, KeyRangeRef shard) {
if (!allServers.count(serverId))
return false;
// check serverKeys
auto& mss = allServers.at(serverId);
if (!mss.allShardStatusEqual(shard, MockShardStatus::INFLIGHT)) {
return false;
}
// check keyServers
auto teams = shardMapping->getTeamsFor(shard);
return !teams.second.empty() && std::any_of(teams.first.begin(), teams.first.end(), [&serverId](const Team& team) {
return team.hasServer(serverId);
});
}
bool MockGlobalState::allShardsRemovedFromServer(const UID& serverId) {
return allServers.count(serverId) && shardMapping->getNumberOfShards(serverId) == 0;
}
TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 3;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
for (int i = 1; i <= dbConfig.storageTeamSize; ++i) {
auto id = MockGlobalState::indexToUID(i);
std::cout << "Check server " << i << "\n";
ASSERT(mgs->serverIsSourceForShard(id, allKeys));
ASSERT(mgs->allServers.at(id).sumRangeSize(allKeys) == 0);
}
return Void();
}
struct MockGlobalStateTester {
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, x2), [x2, r0.end)
void testThreeWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.ranges().begin();
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
MockShardStatus oldStatus = it.cvalue().status;
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
Key x1 = keyAfter(it->range().begin);
Key x2 = keyAfter(x1);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.threeWayShardSplitting(outerRange, KeyRangeRef(x1, x2), oldSize, false);
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, x2));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x2, outerRange.end));
ranges.pop_front();
ASSERT(ranges.empty());
}
// expectation [r0.begin, r0.end) => [r0.begin, x1), [x1, r0.end)
void testTwoWaySplitFirstRange(MockStorageServer& mss) {
auto it = mss.serverKeys.nthRange(0);
MockShardStatus oldStatus = it.cvalue().status;
uint64_t oldSize =
deterministicRandom()->randomInt(SERVER_KNOBS->MIN_SHARD_BYTES, std::numeric_limits<int>::max());
it->value().shardSize = oldSize;
KeyRangeRef outerRange = it->range();
Key x1 = keyAfter(it->range().begin);
std::cout << "it->range.begin: " << it->range().begin.toHexString() << " size: " << oldSize << "\n";
mss.twoWayShardSplitting(it->range(), x1, oldSize, false);
auto ranges = mss.serverKeys.containedRanges(outerRange);
ASSERT(ranges.begin().range() == KeyRangeRef(outerRange.begin, x1));
ranges.pop_front();
ASSERT(ranges.begin().range() == KeyRangeRef(x1, outerRange.end));
ASSERT(ranges.begin().cvalue().status == oldStatus);
ranges.pop_front();
ASSERT(ranges.empty());
}
};
TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") {
BasicTestConfig testConfig;
testConfig.simpleConfig = true;
testConfig.minimumReplication = 1;
testConfig.logAntiQuorum = 0;
DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
auto mgs = std::make_shared<MockGlobalState>();
mgs->initializeAsEmptyDatabaseMGS(dbConfig);
MockGlobalStateTester tester;
auto& mss = mgs->allServers.at(MockGlobalState::indexToUID(1));
std::cout << "Test 3-way splitting...\n";
tester.testThreeWaySplitFirstRange(mss);
std::cout << "Test 2-way splitting...\n";
mss.serverKeys.insert(allKeys, { MockShardStatus::COMPLETED, 0 }); // reset to empty
tester.testTwoWaySplitFirstRange(mss);
return Void();
}

View File

@ -447,10 +447,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
"Restored");
addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
version.initMetric("TLog.Version"_sr, cc.id);
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
version.initMetric("TLog.Version"_sr, cc.getId());
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "SharedBytesInput", [tLogData]() { return tLogData->bytesInput; });
@ -1399,26 +1399,26 @@ ACTOR Future<Void> tLogCore(TLogData* self, Reference<LogData> logData) {
logData->addActor.send(waitFailureServer(logData->tli.waitFailure.getFuture()));
logData->addActor.send(logData->removed);
// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
logData->addActor.send(traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
StorageBytes sbQueue =
self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, logData->tli, logData, warningCollectorInput));
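This is the first of several identical call-site rewrites in this commit (the other TLog versions, Resolver, ConfigConsumer, StorageCache, ClusterRecovery, master, the proxies, and the restore roles all follow): traceCounters moves from a free function that takes the CounterCollection by pointer to a member of CounterCollection itself. Shown compactly, using the master call site that appears later in this diff:

// Before: free function, collection passed explicitly
logger = traceCounters("MasterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "MasterMetrics");
// After: member function, collection implicit
logger = cc.traceCounters("MasterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "MasterMetrics");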

View File

@ -533,10 +533,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
context);
addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
version.initMetric("TLog.Version"_sr, cc.id);
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
version.initMetric("TLog.Version"_sr, cc.getId());
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
@ -2212,26 +2212,26 @@ ACTOR Future<Void> tLogCore(TLogData* self,
logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture()));
logData->addActor.send(logData->removed);
// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
logData->addActor.send(traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
StorageBytes sbQueue =
self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));

View File

@ -616,10 +616,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
context);
addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
version.initMetric("TLog.Version"_sr, cc.id);
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
version.initMetric("TLog.Version"_sr, cc.getId());
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
@ -2671,26 +2671,26 @@ ACTOR Future<Void> tLogCore(TLogData* self,
logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture()));
logData->addActor.send(logData->removed);
// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
logData->addActor.send(traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
StorageBytes sbQueue =
self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));

View File

@ -35,7 +35,7 @@
#include "fdbserver/ResolverInterface.h"
#include "fdbserver/RestoreUtil.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/ActorCollection.h"
@ -188,7 +188,7 @@ struct Resolver : ReferenceCounted<Resolver> {
specialCounter(cc, "NeededVersion", [this]() { return this->neededVersion.get(); });
specialCounter(cc, "TotalStateBytes", [this]() { return this->totalStateBytes.get(); });
logger = traceCounters("ResolverMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ResolverMetrics");
logger = cc.traceCounters("ResolverMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ResolverMetrics");
}
~Resolver() { destroyConflictSet(conflictSet); }
};

View File

@ -30,7 +30,7 @@
#include "fdbserver/RestoreLoader.actor.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
#include "fdbserver/MutationTracking.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
@ -405,10 +405,6 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
.detail("Offset", asset.offset)
.detail("Length", asset.len);
// Ensure data blocks in the same file are processed in order
wait(processedFileOffset->whenAtLeast(asset.offset));
ASSERT(processedFileOffset->get() == asset.offset);
state Arena tempArena;
state StringRefReader reader(buf, restore_corrupted_data());
try {
@ -430,8 +426,9 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
const uint8_t* message = reader.consume(msgSize);
// Skip mutations out of the version range
if (!asset.isInVersionRange(msgVersion.version))
if (!asset.isInVersionRange(msgVersion.version)) {
continue;
}
state VersionedMutationsMap::iterator it;
bool inserted;
@ -452,6 +449,7 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
// Skip mutation whose commitVesion < range kv's version
if (logMutationTooOld(pRangeVersions, mutation, msgVersion.version)) {
cc->oldLogMutations += 1;
wait(yield()); // avoid potential stack overflows
continue;
}
@ -459,6 +457,7 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
if (mutation.param1 >= asset.range.end ||
(isRangeMutation(mutation) && mutation.param2 < asset.range.begin) ||
(!isRangeMutation(mutation) && mutation.param1 < asset.range.begin)) {
wait(yield()); // avoid potential stack overflows
continue;
}
@ -509,7 +508,6 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
.detail("BlockLen", asset.len);
throw;
}
processedFileOffset->set(asset.offset + asset.len);
return Void();
}
@ -526,8 +524,19 @@ ACTOR static Future<Void> parsePartitionedLogFileOnLoader(
state int readFileRetries = 0;
loop {
try {
// Ensure data blocks in the same file are processed in order
wait(processedFileOffset->whenAtLeast(asset.offset));
ASSERT(processedFileOffset->get() == asset.offset);
wait(_parsePartitionedLogFileOnLoader(
pRangeVersions, processedFileOffset, kvOpsIter, samplesIter, cc, bc, asset, cx));
processedFileOffset->set(asset.offset + asset.len);
TraceEvent("FastRestoreLoaderDecodingLogFileDone")
.detail("BatchIndex", asset.batchIndex)
.detail("Filename", asset.filename)
.detail("Offset", asset.offset)
.detail("Length", asset.len);
break;
} catch (Error& e) {
if (e.code() == error_code_restore_bad_read || e.code() == error_code_restore_unsupported_file_version ||

View File

@ -40,10 +40,16 @@ int ShardsAffectedByTeamFailure::getNumberOfShards(UID ssID) const {
}
std::pair<std::vector<ShardsAffectedByTeamFailure::Team>, std::vector<ShardsAffectedByTeamFailure::Team>>
ShardsAffectedByTeamFailure::getTeamsFor(KeyRangeRef keys) {
ShardsAffectedByTeamFailure::getTeamsForFirstShard(KeyRangeRef keys) {
return shard_teams[keys.begin];
}
std::pair<std::vector<ShardsAffectedByTeamFailure::Team>, std::vector<ShardsAffectedByTeamFailure::Team>>
ShardsAffectedByTeamFailure::getTeamsFor(KeyRef key) {
return shard_teams[key];
}
void ShardsAffectedByTeamFailure::erase(Team team, KeyRange const& range) {
DisabledTraceEvent(SevDebug, "ShardsAffectedByTeamFailureErase")
.detail("Range", range)
@ -236,3 +242,7 @@ void ShardsAffectedByTeamFailure::removeFailedServerForRange(KeyRangeRef keys, c
}
check();
}
auto ShardsAffectedByTeamFailure::intersectingRanges(KeyRangeRef keyRange) const -> decltype(shard_teams)::ConstRanges {
return shard_teams.intersectingRanges(keyRange);
}
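The trailing return type lets this public member name a type derived from the private shard_teams map. A standalone illustration of the idiom (stand-in types, not the FDB classes):

#include <map>

class Mapping {
	std::map<int, int> data; // private member whose nested types we still want to expose
public:
	// decltype(data) can name the private member's type in a trailing return,
	// just as intersectingRanges does with decltype(shard_teams)::ConstRanges.
	auto begin() const -> decltype(data)::const_iterator { return data.begin(); }
};

int main() {
	Mapping m;
	(void)m.begin();
	return 0;
}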

View File

@ -166,8 +166,8 @@ public:
successfulChangeRequest("SuccessfulChangeRequest", cc), failedChangeRequest("FailedChangeRequest", cc),
snapshotRequest("SnapshotRequest", cc) {
cfi = getConfigFollowerInterface(configSource);
logger = traceCounters(
"ConfigConsumerMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigConsumerMetrics");
logger = cc.traceCounters(
"ConfigConsumerMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigConsumerMetrics");
}
Future<Void> consume(ConfigBroadcaster& broadcaster) {

View File

@ -248,9 +248,9 @@ public:
lastTLogVersion(0), lastVersionWithData(0), peekVersion(0), compactionInProgress(Void()),
fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), debug_inApplyUpdate(false),
debug_lastValidateTime(0), versionLag(0), behind(false), counters(this) {
version.initMetric("StorageCacheData.Version"_sr, counters.cc.id);
desiredOldestVersion.initMetric("StorageCacheData.DesriedOldestVersion"_sr, counters.cc.id);
oldestVersion.initMetric("StorageCacheData.OldestVersion"_sr, counters.cc.id);
version.initMetric("StorageCacheData.Version"_sr, counters.cc.getId());
desiredOldestVersion.initMetric("StorageCacheData.DesriedOldestVersion"_sr, counters.cc.getId());
oldestVersion.initMetric("StorageCacheData.OldestVersion"_sr, counters.cc.getId());
newestAvailableVersion.insert(allKeys, invalidVersion);
newestDirtyVersion.insert(allKeys, invalidVersion);
@ -2224,11 +2224,10 @@ ACTOR Future<Void> storageCacheServer(StorageServerInterface ssi,
self.ck = cacheKeysPrefixFor(id).withPrefix(systemKeys.begin); // FFFF/02cacheKeys/[this server]/
actors.add(waitFailureServer(ssi.waitFailure.getFuture()));
actors.add(traceCounters("CacheMetrics",
self.thisServerID,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&self.counters.cc,
self.thisServerID.toString() + "/CacheMetrics"));
actors.add(self.counters.cc.traceCounters("CacheMetrics",
self.thisServerID,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
self.thisServerID.toString() + "/CacheMetrics"));
// fetch already cached ranges from the database and apply them before proceeding
wait(storageCacheStartUpWarmup(&self));

View File

@ -19,7 +19,7 @@
*/
#include "flow/UnitTest.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
int64_t StorageMetricSample::getEstimate(KeyRangeRef keys) const {

View File

@ -652,10 +652,10 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {
context);
addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id()));
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id);
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id);
version.initMetric("TLog.Version"_sr, cc.id);
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id);
persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId());
persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId());
version.initMetric("TLog.Version"_sr, cc.getId());
queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId());
specialCounter(cc, "Version", [this]() { return this->version.get(); });
specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); });
@ -2930,26 +2930,26 @@ ACTOR Future<Void> tLogCore(TLogData* self,
logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture()));
logData->addActor.send(logData->removed);
// FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
logData->addActor.send(traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&logData->cc,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
logData->addActor.send(logData->cc.traceCounters("TLogMetrics",
logData->logId,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
logData->logId.toString() + "/TLogMetrics",
[self = self](TraceEvent& te) {
StorageBytes sbTlog = self->persistentData->getStorageBytes();
te.detail("KvstoreBytesUsed", sbTlog.used);
te.detail("KvstoreBytesFree", sbTlog.free);
te.detail("KvstoreBytesAvailable", sbTlog.available);
te.detail("KvstoreBytesTotal", sbTlog.total);
te.detail("KvstoreBytesTemp", sbTlog.temp);
StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
StorageBytes sbQueue =
self->rawPersistentQueue->getStorageBytes();
te.detail("QueueDiskBytesUsed", sbQueue.used);
te.detail("QueueDiskBytesFree", sbQueue.free);
te.detail("QueueDiskBytesAvailable", sbQueue.available);
te.detail("QueueDiskBytesTotal", sbQueue.total);
te.detail("QueueDiskBytesTemp", sbQueue.temp);
}));
logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput));
logData->addActor.send(cleanupPeekTrackers(logData.getPtr()));

View File

@ -122,19 +122,20 @@ public:
ACTOR static Future<Void> monitorStorageUsage(TenantCache* tenantCache) {
TraceEvent(SevInfo, "StartingTenantCacheStorageUsageMonitor", tenantCache->id()).log();
state int refreshInterval = SERVER_KNOBS->TENANT_CACHE_STORAGE_REFRESH_INTERVAL;
state int refreshInterval = SERVER_KNOBS->TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL;
state double lastTenantListFetchTime = now();
loop {
state double fetchStartTime = now();
state std::vector<std::pair<KeyRef, TenantName>> tenantList = tenantCache->getTenantList();
state std::vector<TenantName> tenants = tenantCache->getTenantList();
state int i;
for (i = 0; i < tenantList.size(); i++) {
state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenantList[i].second);
for (i = 0; i < tenants.size(); i++) {
state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenants[i]);
loop {
try {
state int64_t size = wait(tr.getEstimatedRangeSizeBytes(normalKeys));
tenantCache->updateStorageUsage(tenantList[i].first, size);
tenantCache->tenantStorageMap[tenants[i]].usage = size;
break;
} catch (Error& e) {
TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e);
wait(tr.onError(e));
@ -149,6 +150,31 @@ public:
wait(delay(refreshInterval));
}
}
ACTOR static Future<Void> monitorStorageQuota(TenantCache* tenantCache) {
TraceEvent(SevInfo, "StartingTenantCacheStorageQuotaMonitor", tenantCache->id()).log();
state Transaction tr(tenantCache->dbcx());
loop {
loop {
try {
state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
for (auto const& kv : currentQuotas) {
TenantName const tenant = kv.key.removePrefix(storageQuotaPrefix);
int64_t const quota = BinaryReader::fromStringRef<int64_t>(kv.value, Unversioned());
tenantCache->tenantStorageMap[tenant].quota = quota;
}
tr.reset();
break;
} catch (Error& e) {
TraceEvent("TenantCacheGetStorageQuotaError", tenantCache->id()).error(e);
wait(tr.onError(e));
}
}
wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL));
}
}
};
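The monitor above only reads quotas; something else must write them under storageQuotaPrefix. A hedged sketch of the writing side (assumes a storageQuotaKey(tenant) helper that prepends storageQuotaPrefix; the int64_t encoding mirrors the BinaryReader call in monitorStorageQuota):

// Sketch only: store a tenant's quota so monitorStorageQuota picks it up on its
// next refresh interval. setStorageQuotaExample is a hypothetical name.
ACTOR Future<Void> setStorageQuotaExample(Database cx, TenantName tenant, int64_t quota) {
	state Transaction tr(cx);
	loop {
		try {
			tr.set(storageQuotaKey(tenant), BinaryWriter::toValue<int64_t>(quota, Unversioned()));
			wait(tr.commit());
			return Void();
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}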
void TenantCache::insert(TenantName& tenantName, TenantMapEntry& tenant) {
@ -203,21 +229,14 @@ int TenantCache::cleanup() {
return tenantsRemoved;
}
std::vector<std::pair<KeyRef, TenantName>> TenantCache::getTenantList() const {
std::vector<std::pair<KeyRef, TenantName>> tenants;
std::vector<TenantName> TenantCache::getTenantList() const {
std::vector<TenantName> tenants;
for (const auto& [prefix, entry] : tenantCache) {
tenants.push_back({ prefix, entry->name() });
tenants.push_back(entry->name());
}
return tenants;
}
void TenantCache::updateStorageUsage(KeyRef prefix, int64_t size) {
auto it = tenantCache.find(prefix);
if (it != tenantCache.end()) {
it->value->updateStorageUsage(size);
}
}
std::string TenantCache::desc() const {
std::string s("@Generation: ");
s += std::to_string(generation) + " ";
@ -264,6 +283,16 @@ Optional<Reference<TCTenantInfo>> TenantCache::tenantOwning(KeyRef key) const {
return it->value;
}
std::vector<TenantName> TenantCache::getTenantsOverQuota() const {
std::vector<TenantName> tenants;
for (const auto& [tenant, storage] : tenantStorageMap) {
if (storage.usage > storage.quota) {
tenants.push_back(tenant);
}
}
return tenants;
}
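A sketch of a hypothetical consumer of this list (the data distribution side is wired up elsewhere):

// Hypothetical caller: log each tenant whose usage exceeds its quota.
for (const TenantName& tenant : tenantCache.getTenantsOverQuota()) {
	TraceEvent(SevWarn, "TenantOverStorageQuota").detail("Tenant", tenant);
}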
Future<Void> TenantCache::monitorTenantMap() {
return TenantCacheImpl::monitorTenantMap(this);
}
@ -272,6 +301,10 @@ Future<Void> TenantCache::monitorStorageUsage() {
return TenantCacheImpl::monitorStorageUsage(this);
}
Future<Void> TenantCache::monitorStorageQuota() {
return TenantCacheImpl::monitorStorageQuota(this);
}
class TenantCacheUnitTest {
public:
ACTOR static Future<Void> InsertAndTestPresence() {

View File

@ -140,9 +140,27 @@ private:
Future<Void> collection;
};
// Defines granule info that interests full restore
struct BlobGranuleRestoreVersion {
// Two constructors required by VectorRef
BlobGranuleRestoreVersion() {}
BlobGranuleRestoreVersion(Arena& a, const BlobGranuleRestoreVersion& copyFrom)
: granuleID(copyFrom.granuleID), keyRange(a, copyFrom.keyRange), version(copyFrom.version),
sizeInBytes(copyFrom.sizeInBytes) {}
UID granuleID;
KeyRangeRef keyRange;
Version version;
int64_t sizeInBytes;
};
// Defines a vector of BlobGranuleRestoreVersion
typedef Standalone<VectorRef<BlobGranuleRestoreVersion>> BlobGranuleRestoreVersionVector;
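The two constructors exist so Standalone<VectorRef<...>> can deep-copy elements, pulling each keyRange into the destination arena. A usage sketch (hypothetical values):

// Sketch: push_back_deep invokes the (Arena&, const T&) constructor above so
// the element's keyRange memory is owned by the vector's arena.
BlobGranuleRestoreVersionVector granules;
BlobGranuleRestoreVersion v;
v.granuleID = deterministicRandom()->randomUniqueID();
v.keyRange = KeyRangeRef("a"_sr, "b"_sr);
v.version = 100;
v.sizeInBytes = 1024;
granules.push_back_deep(granules.arena(), v);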
ACTOR Future<Void> dumpManifest(Database db, Reference<BlobConnectionProvider> blobConn, int64_t epoch, int64_t seqNo);
ACTOR Future<Void> loadManifest(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db, Reference<BlobConnectionProvider> blobConn);
inline bool isFullRestoreMode() {
return SERVER_KNOBS->BLOB_FULL_RESTORE_MODE;
};

View File

@ -30,23 +30,25 @@
struct BlobMigratorInterface {
constexpr static FileIdentifier file_identifier = 869199;
RequestStream<struct HaltBlobMigratorRequest> haltBlobMigrator;
RequestStream<ReplyPromise<Void>> waitFailure;
LocalityData locality;
UID uniqueID;
StorageServerInterface ssi;
BlobMigratorInterface() {}
BlobMigratorInterface(const struct LocalityData& l, UID id) : uniqueID(id), locality(l) {}
BlobMigratorInterface(const struct LocalityData& l, UID id) : uniqueID(id), locality(l) {
ssi.locality = l;
ssi.uniqueID = id;
}
void initEndpoints() {}
void initEndpoints() { ssi.initEndpoints(); }
UID id() const { return uniqueID; }
NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); }
NetworkAddress address() const { return haltBlobMigrator.getEndpoint().getPrimaryAddress(); }
bool operator==(const BlobMigratorInterface& r) const { return id() == r.id(); }
bool operator!=(const BlobMigratorInterface& r) const { return !(*this == r); }
template <class Archive>
void serialize(Archive& ar) {
// StorageServerInterface::serialize(ar);
serializer(ar, waitFailure, haltBlobMigrator, locality, uniqueID);
serializer(ar, locality, uniqueID, haltBlobMigrator);
}
};

View File

@ -289,11 +289,10 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted<ClusterRecoveryData>
getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_DURATION_EVENT_NAME));
clusterRecoveryAvailableEventHolder = makeReference<EventCacheHolder>(
getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_AVAILABLE_EVENT_NAME));
logger = traceCounters(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME),
dbgid,
SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
&cc,
getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME));
logger = cc.traceCounters(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME),
dbgid,
SERVER_KNOBS->WORKER_LOGGING_INTERVAL,
getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME));
if (forceRecovery && !controllerData->clusterControllerDcId.present()) {
TraceEvent(SevError, "ForcedRecoveryRequiresDcID").log();
forceRecovery = false;

View File

@ -117,6 +117,7 @@ public:
virtual Future<Void> moveKeys(const MoveKeysParams& params) = 0;
// metrics.second is the number of key-ranges (i.e., shards) in the 'keys' key-range
virtual Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(KeyRange const& keys,
StorageMetrics const& min,
StorageMetrics const& max,

View File

@ -476,6 +476,8 @@ struct ShardSizeBounds {
bool operator==(ShardSizeBounds const& rhs) const {
return max == rhs.max && min == rhs.min && permittedError == rhs.permittedError;
}
static ShardSizeBounds shardSizeBoundsBeforeTrack();
};
// Gets the permitted size and IO bounds for a shard
@ -484,10 +486,6 @@ ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize);
// Determines the maximum shard size based on the size of the database
int64_t getMaxShardSize(double dbSizeEstimate);
struct StorageQuotaInfo {
std::map<Key, uint64_t> quotaMap;
};
#ifndef __INTEL_COMPILER
#pragma endregion
#endif

View File

@ -46,6 +46,7 @@ class GrvProxyTransactionTagThrottler {
: req(req), startTime(now()), sequenceNumber(++lastSequenceNumber) {}
void updateProxyTagThrottledDuration();
bool isMaxThrottled() const;
};
struct TagQueue {
@ -56,6 +57,8 @@ class GrvProxyTransactionTagThrottler {
explicit TagQueue(double rate) : rateInfo(rate) {}
void setRate(double rate);
bool isMaxThrottled() const;
void rejectRequests();
};
// Track the budgets for each tag
@ -69,8 +72,8 @@ public:
// If a request is ready to be executed, it is sent to the deque
// corresponding to its priority. If not, the request remains queued.
void releaseTransactions(double elapsed,
SpannedDeque<GetReadVersionRequest>& outBatchPriority,
SpannedDeque<GetReadVersionRequest>& outDefaultPriority);
Deque<GetReadVersionRequest>& outBatchPriority,
Deque<GetReadVersionRequest>& outDefaultPriority);
void addRequest(GetReadVersionRequest const&);

View File

@ -29,7 +29,7 @@
#include "fdbserver/IClosable.h"
#include "fdbserver/IPageEncryptionKeyProvider.actor.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
struct CheckpointRequest {
const Version version; // The FDB version at which the checkpoint is created.

View File

@ -21,10 +21,11 @@
#ifndef FOUNDATIONDB_MOCKGLOBALSTATE_H
#define FOUNDATIONDB_MOCKGLOBALSTATE_H
#include "StorageMetrics.h"
#include "StorageMetrics.actor.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbclient/KeyLocationService.h"
#include "SimulatedCluster.h"
#include "ShardsAffectedByTeamFailure.h"
@ -51,9 +52,11 @@ inline bool isStatusTransitionValid(MockShardStatus from, MockShardStatus to) {
return false;
}
class MockStorageServer {
class MockStorageServer : public IStorageMetricsService {
friend struct MockGlobalStateTester;
ActorCollection actors;
public:
struct ShardInfo {
MockShardStatus status;
@ -73,8 +76,6 @@ public:
// size() and nthRange() would use the metrics as index instead
KeyRangeMap<ShardInfo> serverKeys;
// sampled metrics
StorageServerMetrics metrics;
CoalescedKeyRangeMap<bool, int64_t, KeyBytesMetric<int64_t>> byteSampleClears;
StorageServerInterface ssi; // serve RPC requests
@ -103,6 +104,35 @@ public:
uint64_t sumRangeSize(KeyRangeRef range) const;
void addActor(Future<Void> future) override;
void getSplitPoints(SplitRangeRequest const& req) override;
Future<Void> waitMetricsTenantAware(const WaitMetricsRequest& req) override;
void getStorageMetrics(const GetStorageMetricsRequest& req) override;
template <class Reply>
static constexpr bool isLoadBalancedReply = std::is_base_of_v<LoadBalancedReply, Reply>;
template <class Reply>
typename std::enable_if_t<isLoadBalancedReply<Reply>, void> sendErrorWithPenalty(const ReplyPromise<Reply>& promise,
const Error& err,
double penalty) {
Reply reply;
reply.error = err;
reply.penalty = penalty;
promise.send(reply);
}
template <class Reply>
typename std::enable_if_t<!isLoadBalancedReply<Reply>, void>
sendErrorWithPenalty(const ReplyPromise<Reply>& promise, const Error& err, double) {
promise.sendError(err);
}
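The pair of overloads above selects at compile time on whether Reply derives from LoadBalancedReply. A self-contained illustration of the same enable_if dispatch (stand-in types, not the FDB reply classes):

#include <iostream>
#include <type_traits>

struct LoadBalancedReply { double penalty = 0; };
struct BalancedReply : LoadBalancedReply {};
struct PlainReply {};

template <class Reply>
constexpr bool isLoadBalanced = std::is_base_of_v<LoadBalancedReply, Reply>;

// Chosen when Reply carries a penalty field for loadBalance() feedback
template <class Reply>
std::enable_if_t<isLoadBalanced<Reply>> sendError(Reply& r, double penalty) {
	r.penalty = penalty;
	std::cout << "error with penalty " << r.penalty << "\n";
}

// Chosen otherwise: there is no penalty field to fill in
template <class Reply>
std::enable_if_t<!isLoadBalanced<Reply>> sendError(Reply&, double) {
	std::cout << "error without penalty\n";
}

int main() {
	BalancedReply a;
	PlainReply b;
	sendError(a, 1.5); // picks the load-balanced overload
	sendError(b, 1.5); // picks the plain overload
	return 0;
}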
Future<Void> run();
protected:
void threeWayShardSplitting(KeyRangeRef outerRange,
KeyRangeRef innerRange,
@ -112,8 +142,13 @@ protected:
void twoWayShardSplitting(KeyRangeRef range, KeyRef splitPoint, uint64_t rangeSize, bool restrictSize);
};
class MockGlobalState {
class MockGlobalStateImpl;
class MockGlobalState : public IKeyLocationService {
friend struct MockGlobalStateTester;
friend class MockGlobalStateImpl;
std::vector<StorageServerInterface> extractStorageServerInterfaces(const std::vector<UID>& ids) const;
public:
typedef ShardsAffectedByTeamFailure::Team Team;
@ -162,7 +197,37 @@ public:
* * mgs.shardMapping doesn't have any information about X
* * mgs.allServers[X] exists
*/
bool allShardRemovedFromServer(const UID& serverId);
bool allShardsRemovedFromServer(const UID& serverId);
// SOMEDAY: NativeAPI::waitStorageMetrics should share the code in the future, this is a simpler version of it
Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(KeyRange const& keys,
StorageMetrics const& min,
StorageMetrics const& max,
StorageMetrics const& permittedError,
int shardLimit,
int expectedShardCount);
Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(const KeyRange& keys,
const StorageMetrics& limit,
const StorageMetrics& estimated,
const Optional<int>& minSplitBytes);
Future<KeyRangeLocationInfo> getKeyLocation(TenantInfo tenant,
Key key,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Reverse isBackward,
Version version) override;
Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations(TenantInfo tenant,
KeyRange keys,
int limit,
Reverse reverse,
SpanContext spanContext,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies,
Version version) override;
};
#endif // FOUNDATIONDB_MOCKGLOBALSTATE_H

View File

@ -156,7 +156,7 @@ struct ProxyStats {
specialCounter(cc, "NumTenants", [pTenantMap]() { return pTenantMap ? pTenantMap->size() : 0; });
specialCounter(cc, "MaxCompute", [this]() { return this->getAndResetMaxCompute(); });
specialCounter(cc, "MinCompute", [this]() { return this->getAndResetMinCompute(); });
logger = traceCounters("ProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ProxyMetrics");
logger = cc.traceCounters("ProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ProxyMetrics");
}
};

View File

@ -284,11 +284,11 @@ struct ApplierBatchData : public ReferenceCounted<ApplierBatchData> {
: vbState(ApplierVersionBatchState::NOT_INIT), receiveMutationReqs(0), receivedBytes(0), appliedBytes(0),
targetWriteRateMB(SERVER_KNOBS->FASTRESTORE_WRITE_BW_MB / SERVER_KNOBS->FASTRESTORE_NUM_APPLIERS),
totalBytesToWrite(-1), applyingDataBytes(0), counters(this, nodeID, batchIndex) {
pollMetrics = traceCounters(format("FastRestoreApplierMetrics%d", batchIndex),
nodeID,
SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY,
&counters.cc,
nodeID.toString() + "/RestoreApplierMetrics/" + std::to_string(batchIndex));
pollMetrics =
counters.cc.traceCounters(format("FastRestoreApplierMetrics%d", batchIndex),
nodeID,
SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY,
nodeID.toString() + "/RestoreApplierMetrics/" + std::to_string(batchIndex));
TraceEvent("FastRestoreApplierMetricsCreated").detail("Node", nodeID);
}
~ApplierBatchData() {

View File

@ -93,11 +93,11 @@ struct LoaderBatchData : public ReferenceCounted<LoaderBatchData> {
explicit LoaderBatchData(UID nodeID, int batchIndex)
: vbState(LoaderVersionBatchState::NOT_INIT), loadFileReqs(0), counters(this, nodeID, batchIndex) {
pollMetrics = traceCounters(format("FastRestoreLoaderMetrics%d", batchIndex),
nodeID,
SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY,
&counters.cc,
nodeID.toString() + "/RestoreLoaderMetrics/" + std::to_string(batchIndex));
pollMetrics =
counters.cc.traceCounters(format("FastRestoreLoaderMetrics%d", batchIndex),
nodeID,
SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY,
nodeID.toString() + "/RestoreLoaderMetrics/" + std::to_string(batchIndex));
TraceEvent("FastRestoreLoaderMetricsCreated").detail("Node", nodeID);
}

View File

@ -80,8 +80,12 @@ public:
bool hasShards(Team team) const;
// The first element of the pair is either the source for non-moving shards or the destination team for in-flight
// shards The second element of the pair is all previous sources for in-flight shards
std::pair<std::vector<Team>, std::vector<Team>> getTeamsFor(KeyRangeRef keys);
// shards. The second element of the pair is all previous sources for in-flight shards. This function only returns
// the teams for the first shard in [keys.begin, keys.end)
std::pair<std::vector<Team>, std::vector<Team>> getTeamsForFirstShard(KeyRangeRef keys);
std::pair<std::vector<Team>, std::vector<Team>> getTeamsFor(KeyRef key);
// Shard boundaries are modified in defineShard and the content of what servers correspond to each shard is a copy
// or union of the shards already there
void defineShard(KeyRangeRef keys);
@ -124,6 +128,7 @@ private:
public:
// return the iterator that traversing all ranges
auto getAllRanges() const -> decltype(shard_teams)::ConstRanges;
auto intersectingRanges(KeyRangeRef keyRange) const -> decltype(shard_teams)::ConstRanges;
// get total shards count
size_t getNumberOfShards() const;
void removeFailedServerForRange(KeyRangeRef keys, const UID& serverID);

View File

@ -19,13 +19,18 @@
*/
#pragma once
#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_STORAGEMETRICS_G_H)
#define FDBSERVER_STORAGEMETRICS_G_H
#include "fdbserver/StorageMetrics.actor.g.h"
#elif !defined(FDBSERVER_STORAGEMETRICS_H)
#define FDBSERVER_STORAGEMETRICS_H
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/simulator.h"
#include "flow/UnitTest.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbserver/Knobs.h"
#include "flow/actorcompiler.h"
const StringRef STORAGESERVER_HISTOGRAM_GROUP = "StorageServer"_sr;
const StringRef FETCH_KEYS_LATENCY_HISTOGRAM = "FetchKeysLatency"_sr;
@ -152,3 +157,76 @@ struct ByteSampleInfo {
// Determines whether a key-value pair should be included in a byte sample
// Also returns size information about the sample
ByteSampleInfo isKeyValueInSample(KeyValueRef keyValue);
class IStorageMetricsService {
public:
StorageServerMetrics metrics;
// penalty used by loadBalance() to balance requests among service instances
virtual double getPenalty() const { return 1; }
virtual bool isReadable(KeyRangeRef const& keys) const { return true; }
virtual void addActor(Future<Void> future) = 0;
virtual void getSplitPoints(SplitRangeRequest const& req) = 0;
virtual Future<Void> waitMetricsTenantAware(const WaitMetricsRequest& req) = 0;
virtual void getStorageMetrics(const GetStorageMetricsRequest& req) = 0;
// NOTE: implementations must also provide this function, but a member template can't be virtual,
// so it can't be declared here:
// template <class Reply>
// void sendErrorWithPenalty(const ReplyPromise<Reply>& promise, const Error& err, double penalty);
};
ACTOR template <class ServiceType>
Future<Void> serveStorageMetricsRequests(ServiceType* self, StorageServerInterface ssi) {
state Future<Void> doPollMetrics = Void();
loop {
choose {
when(state WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) {
if (!req.tenantInfo.present() && !self->isReadable(req.keys)) {
CODE_PROBE(true, "waitMetrics immediate wrong_shard_server()");
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->addActor(self->waitMetricsTenantAware(req));
}
}
when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) {
if (!self->isReadable(req.keys)) {
CODE_PROBE(true, "splitMetrics immediate wrong_shard_server()");
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->metrics.splitMetrics(req);
}
}
when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
self->getStorageMetrics(req);
}
when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) {
if (!self->isReadable(req.keys)) {
CODE_PROBE(true, "readHotSubRanges immediate wrong_shard_server()", probe::decoration::rare);
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->metrics.getReadHotRanges(req);
}
}
when(SplitRangeRequest req = waitNext(ssi.getRangeSplitPoints.getFuture())) {
if (!self->isReadable(req.keys)) {
CODE_PROBE(true, "getSplitPoints immediate wrong_shard_server()");
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->getSplitPoints(req);
}
}
when(wait(doPollMetrics)) {
self->metrics.poll();
doPollMetrics = delay(SERVER_KNOBS->STORAGE_SERVER_POLL_METRICS_DELAY);
}
}
}
}
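Both StorageServer and MockStorageServer implement IStorageMetricsService later in this commit, so this single actor replaces the per-server choose loops. A hedged wiring sketch (assumes self and ssi are already initialized):

// Sketch: drive the shared metrics loop from a service's main actor.
// serveStorageMetricsRequests loops forever; it ends only by cancellation or error.
ACTOR Future<Void> runMetricsExample(MockStorageServer* self, StorageServerInterface ssi) {
	wait(serveStorageMetricsRequests(self, ssi));
	return Void();
}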
#include "flow/unactorcompiler.h"
#endif // FDBSERVER_STORAGEMETRICS_H

View File

@ -268,5 +268,4 @@ public:
void removeTeam(TCTeamInfo team);
void updateCacheGeneration(int64_t generation) { m_cacheGeneration = generation; }
int64_t cacheGeneration() const { return m_cacheGeneration; }
void updateStorageUsage(int64_t size) { m_tenantInfo.storageUsage = size; }
};

View File

@ -32,6 +32,12 @@
typedef Map<KeyRef, Reference<TCTenantInfo>> TenantMapByPrefix;
struct Storage {
int64_t quota = std::numeric_limits<int64_t>::max();
int64_t usage = 0;
};
typedef std::unordered_map<TenantName, Storage> TenantStorageMap;
struct TenantCacheTenantCreated {
KeyRange keys;
Promise<bool> reply;
@ -50,6 +56,9 @@ private:
uint64_t generation;
TenantMapByPrefix tenantCache;
// Map from tenant names to storage quota and usage
TenantStorageMap tenantStorageMap;
// mark the start of a new sweep of the tenant cache
void startRefresh();
@ -62,11 +71,8 @@ private:
// return count of tenants that were found to be stale and removed from the cache
int cleanup();
// return the mapping from prefix -> tenant name for all tenants stored in the cache
std::vector<std::pair<KeyRef, TenantName>> getTenantList() const;
// update the size for a tenant; do nothing if the tenant doesn't exist in the map
void updateStorageUsage(KeyRef prefix, int64_t size);
// return all the TenantName for all tenants stored in the cache
std::vector<TenantName> getTenantList() const;
UID id() const { return distributorID; }
@ -85,9 +91,14 @@ public:
Future<Void> monitorStorageUsage();
Future<Void> monitorStorageQuota();
std::string desc() const;
bool isTenantKey(KeyRef key) const;
Optional<Reference<TCTenantInfo>> tenantOwning(KeyRef key) const;
// Get the list of tenants where the storage bytes currently used is greater than the quota allocated
std::vector<TenantName> getTenantsOverQuota() const;
};

View File

@ -114,7 +114,7 @@ struct MasterData : NonCopyable, ReferenceCounted<MasterData> {
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
addActor(addActor) {
logger = traceCounters("MasterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "MasterMetrics");
logger = cc.traceCounters("MasterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "MasterMetrics");
if (forceRecovery && !myInterface.locality.dcId().present()) {
TraceEvent(SevError, "ForcedRecoveryRequiresDcID").log();
forceRecovery = false;

View File

@ -81,11 +81,12 @@
#include "fdbserver/ServerCheckpoint.actor.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/SpanContextMessage.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "fdbserver/TLogInterface.h"
#include "fdbserver/TransactionTagCounter.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "flow/ActorCollection.h"
#include "flow/Arena.h"
#include "flow/Error.h"
@ -641,7 +642,7 @@ struct BusiestWriteTagContext {
busiestWriteTagEventHolder(makeReference<EventCacheHolder>(busiestWriteTagTrackingKey)), lastUpdateTime(-1) {}
};
struct StorageServer {
struct StorageServer : public IStorageMetricsService {
typedef VersionedMap<KeyRef, ValueOrClearToRef> VersionedData;
private:
@ -807,8 +808,8 @@ public:
VersionedData const& data() const { return versionedData; }
VersionedData& mutableData() { return versionedData; }
double old_rate = 1.0;
double currentRate() {
mutable double old_rate = 1.0;
double currentRate() const {
auto versionLag = version.get() - durableVersion.get();
double res;
if (versionLag >= SERVER_KNOBS->STORAGE_DURABILITY_LAG_HARD_MAX) {
@ -988,7 +989,6 @@ public:
Database cx;
ActorCollection actors;
StorageServerMetrics metrics;
CoalescedKeyRangeMap<bool, int64_t, KeyBytesMetric<int64_t>> byteSampleClears;
AsyncVar<bool> byteSampleClearsTooLarge;
Future<Void> byteSampleRecovery;
@ -1308,10 +1308,10 @@ public:
storageServerSourceTLogIDEventHolder(
makeReference<EventCacheHolder>(ssi.id().toString() + "/StorageServerSourceTLogID")) {
version.initMetric("StorageServer.Version"_sr, counters.cc.id);
oldestVersion.initMetric("StorageServer.OldestVersion"_sr, counters.cc.id);
durableVersion.initMetric("StorageServer.DurableVersion"_sr, counters.cc.id);
desiredOldestVersion.initMetric("StorageServer.DesiredOldestVersion"_sr, counters.cc.id);
version.initMetric("StorageServer.Version"_sr, counters.cc.getId());
oldestVersion.initMetric("StorageServer.OldestVersion"_sr, counters.cc.getId());
durableVersion.initMetric("StorageServer.DurableVersion"_sr, counters.cc.getId());
desiredOldestVersion.initMetric("StorageServer.DesiredOldestVersion"_sr, counters.cc.getId());
newestAvailableVersion.insert(allKeys, invalidVersion);
newestDirtyVersion.insert(allKeys, invalidVersion);
@ -1380,7 +1380,7 @@ public:
// This is the maximum version that might be read from storage (the minimum version is durableVersion)
Version storageVersion() const { return oldestVersion.get(); }
bool isReadable(KeyRangeRef const& keys) {
bool isReadable(KeyRangeRef const& keys) const override {
auto sh = shards.intersectingRanges(keys);
for (auto i = sh.begin(); i != sh.end(); ++i)
if (!i->value()->isReadable())
@ -1406,10 +1406,10 @@ public:
}
}
Counter::Value queueSize() { return counters.bytesInput.getValue() - counters.bytesDurable.getValue(); }
Counter::Value queueSize() const { return counters.bytesInput.getValue() - counters.bytesDurable.getValue(); }
// penalty used by loadBalance() to balance requests among SSes. We prefer SS with less write queue size.
double getPenalty() {
double getPenalty() const override {
return std::max(std::max(1.0,
(queueSize() - (SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER -
2.0 * SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER)) /
@ -1503,7 +1503,7 @@ public:
}
}
void getSplitPoints(SplitRangeRequest const& req) {
void getSplitPoints(SplitRangeRequest const& req) override {
try {
Optional<TenantMapEntry> entry = getTenantEntry(version.get(), req.tenantInfo);
metrics.getSplitPoints(req, entry.map<Key>([](TenantMapEntry e) { return e.prefix; }));
@ -1533,6 +1533,15 @@ public:
}
return false;
}
Future<Void> waitMetricsTenantAware(const WaitMetricsRequest& req) override;
void addActor(Future<Void> future) override { actors.add(future); }
void getStorageMetrics(const GetStorageMetricsRequest& req) override {
StorageBytes sb = storage.getStorageBytes();
metrics.getStorageMetrics(req, sb, counters.bytesInput.getRate(), versionLag, lastUpdate);
}
};
const StringRef StorageServer::CurrentRunningFetchKeys::emptyString = ""_sr;
@ -5976,27 +5985,26 @@ ACTOR Future<Void> tryGetRangeFromBlob(PromiseStream<RangeResult> results,
Reference<BlobConnectionProvider> blobConn) {
ASSERT(blobConn.isValid());
try {
state Standalone<VectorRef<BlobGranuleChunkRef>> chunks = wait(tryReadBlobGranules(tr, keys, fetchVersion));
if (chunks.size() == 0) {
throw blob_granule_transaction_too_old(); // no data on blob
}
if (!isRangeFullyCovered(keys, chunks)) {
throw blob_granule_transaction_too_old();
}
for (const BlobGranuleChunkRef& chunk : chunks) {
state KeyRangeRef chunkRange = chunk.keyRange;
state RangeResult rows = wait(readBlobGranule(chunk, keys, 0, fetchVersion, blobConn));
state int i;
for (i = 0; i < chunks.size(); ++i) {
state KeyRangeRef chunkRange = chunks[i].keyRange;
state RangeResult rows = wait(readBlobGranule(chunks[i], keys, 0, fetchVersion, blobConn));
TraceEvent("ReadBlobData")
.detail("Rows", rows.size())
.detail("ChunkRange", chunkRange.toString())
.detail("Keys", keys.toString());
if (rows.size() == 0) {
rows.readThrough = KeyRef(rows.arena(), chunkRange.end);
rows.readThrough = KeyRef(rows.arena(), std::min(chunkRange.end, keys.end));
}
if (i == chunks.size() - 1) {
rows.readThrough = KeyRef(rows.arena(), keys.end);
}
results.send(rows);
}
@ -6010,7 +6018,7 @@ ACTOR Future<Void> tryGetRangeFromBlob(PromiseStream<RangeResult> results,
tr->reset();
tr->setVersion(fetchVersion);
tr->trState->taskID = TaskPriority::FetchKeys;
wait(tryGetRange(results, tr, keys)); // fail back to storage server
throw;
}
return Void();
}
@ -6798,8 +6806,10 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
// We must also ensure we have fetched all change feed metadata BEFORE changing the phase to fetching to ensure
// change feed mutations get applied correctly
state std::vector<Key> changeFeedsToFetch;
std::vector<Key> _cfToFetch = wait(fetchCFMetadata);
changeFeedsToFetch = _cfToFetch;
if (!isFullRestoreMode()) {
std::vector<Key> _cfToFetch = wait(fetchCFMetadata);
changeFeedsToFetch = _cfToFetch;
}
wait(data->durableVersionLock.take());
shard->phase = AddingShard::Fetching;
@ -10166,7 +10176,7 @@ Future<Void> StorageServerMetrics::waitMetrics(WaitMetricsRequest req, Future<Vo
#pragma region Core
#endif
ACTOR Future<Void> waitMetricsTenantAware(StorageServer* self, WaitMetricsRequest req) {
ACTOR Future<Void> waitMetricsTenantAware_internal(StorageServer* self, WaitMetricsRequest req) {
if (req.tenantInfo.present() && req.tenantInfo.get().tenantId != TenantInfo::INVALID_TENANT) {
wait(success(waitForVersionNoTooOld(self, latestVersion)));
Optional<TenantMapEntry> entry = self->getTenantEntry(latestVersion, req.tenantInfo.get());
@ -10184,85 +10194,45 @@ ACTOR Future<Void> waitMetricsTenantAware(StorageServer* self, WaitMetricsReques
return Void();
}
Future<Void> StorageServer::waitMetricsTenantAware(const WaitMetricsRequest& req) {
return waitMetricsTenantAware_internal(this, req);
}
ACTOR Future<Void> metricsCore(StorageServer* self, StorageServerInterface ssi) {
state Future<Void> doPollMetrics = Void();
wait(self->byteSampleRecovery);
TraceEvent("StorageServerRestoreDurableState", self->thisServerID).detail("RestoredBytes", self->bytesRestored);
// Logs all counters in `counters.cc` and reset the interval.
self->actors.add(traceCounters("StorageMetrics",
self->thisServerID,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
&self->counters.cc,
self->thisServerID.toString() + "/StorageMetrics",
[self = self](TraceEvent& te) {
te.detail("StorageEngine", self->storage.getKeyValueStoreType().toString());
te.detail("Tag", self->tag.toString());
StorageBytes sb = self->storage.getStorageBytes();
te.detail("KvstoreBytesUsed", sb.used);
te.detail("KvstoreBytesFree", sb.free);
te.detail("KvstoreBytesAvailable", sb.available);
te.detail("KvstoreBytesTotal", sb.total);
te.detail("KvstoreBytesTemp", sb.temp);
if (self->isTss()) {
te.detail("TSSPairID", self->tssPairID);
te.detail("TSSJointID",
UID(self->thisServerID.first() ^ self->tssPairID.get().first(),
self->thisServerID.second() ^ self->tssPairID.get().second()));
} else if (self->isSSWithTSSPair()) {
te.detail("SSPairID", self->ssPairID);
te.detail("TSSJointID",
UID(self->thisServerID.first() ^ self->ssPairID.get().first(),
self->thisServerID.second() ^ self->ssPairID.get().second()));
}
}));
self->actors.add(self->counters.cc.traceCounters(
"StorageMetrics",
self->thisServerID,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
self->thisServerID.toString() + "/StorageMetrics",
[self = self](TraceEvent& te) {
te.detail("StorageEngine", self->storage.getKeyValueStoreType().toString());
te.detail("Tag", self->tag.toString());
StorageBytes sb = self->storage.getStorageBytes();
te.detail("KvstoreBytesUsed", sb.used);
te.detail("KvstoreBytesFree", sb.free);
te.detail("KvstoreBytesAvailable", sb.available);
te.detail("KvstoreBytesTotal", sb.total);
te.detail("KvstoreBytesTemp", sb.temp);
if (self->isTss()) {
te.detail("TSSPairID", self->tssPairID);
te.detail("TSSJointID",
UID(self->thisServerID.first() ^ self->tssPairID.get().first(),
self->thisServerID.second() ^ self->tssPairID.get().second()));
} else if (self->isSSWithTSSPair()) {
te.detail("SSPairID", self->ssPairID);
te.detail("TSSJointID",
UID(self->thisServerID.first() ^ self->ssPairID.get().first(),
self->thisServerID.second() ^ self->ssPairID.get().second()));
}
}));
loop {
choose {
when(state WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) {
if (!req.tenantInfo.present() && !self->isReadable(req.keys)) {
CODE_PROBE(true, "waitMetrics immediate wrong_shard_server()");
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->actors.add(waitMetricsTenantAware(self, req));
}
}
when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) {
if (!self->isReadable(req.keys)) {
CODE_PROBE(true, "splitMetrics immediate wrong_shard_server()");
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->metrics.splitMetrics(req);
}
}
when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
StorageBytes sb = self->storage.getStorageBytes();
self->metrics.getStorageMetrics(
req, sb, self->counters.bytesInput.getRate(), self->versionLag, self->lastUpdate);
}
when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) {
if (!self->isReadable(req.keys)) {
CODE_PROBE(true, "readHotSubRanges immediate wrong_shard_server()", probe::decoration::rare);
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->metrics.getReadHotRanges(req);
}
}
when(SplitRangeRequest req = waitNext(ssi.getRangeSplitPoints.getFuture())) {
if (!self->isReadable(req.keys)) {
CODE_PROBE(true, "getSplitPoints immediate wrong_shard_server()");
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->getSplitPoints(req);
}
}
when(wait(doPollMetrics)) {
self->metrics.poll();
doPollMetrics = delay(SERVER_KNOBS->STORAGE_SERVER_POLL_METRICS_DELAY);
}
}
}
wait(serveStorageMetricsRequests(self, ssi));
return Void();
}
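Two related cleanups land in metricsCore here: traceCounters becomes a method on the counter collection itself (self->counters.cc.traceCounters(...), dropping the &self->counters.cc argument), and the request-dispatch loop is hoisted into serveStorageMetricsRequests(self, ssi). A rough sketch of the extracted helper, reconstructed from the loop deleted above (the actual body is not shown in this excerpt):

ACTOR Future<Void> serveStorageMetricsRequests(StorageServer* self, StorageServerInterface ssi) {
    state Future<Void> doPollMetrics = Void();
    loop {
        choose {
            when(state WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) {
                // same wrong_shard_server() check as before, then dispatch
                // through the new member-function wrapper:
                self->actors.add(self->waitMetricsTenantAware(req));
            }
            when(wait(doPollMetrics)) {
                self->metrics.poll();
                doPollMetrics = delay(SERVER_KNOBS->STORAGE_SERVER_POLL_METRICS_DELAY);
            }
            // splitMetrics, getStorageMetrics, getReadHotRanges and
            // getRangeSplitPoints are handled as in the deleted loop above.
        }
    }
}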
ACTOR Future<Void> logLongByteSampleRecovery(Future<Void> recovery) {

View File

@ -2267,7 +2267,25 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
CODE_PROBE(true, "Recruited while already a blob migrator.");
} else {
startRole(Role::BLOB_MIGRATOR, recruited.id(), interf.id());
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.haltBlobMigrator);
DUMPTOKEN(recruited.ssi.getValue);
DUMPTOKEN(recruited.ssi.getKey);
DUMPTOKEN(recruited.ssi.getKeyValues);
DUMPTOKEN(recruited.ssi.getMappedKeyValues);
DUMPTOKEN(recruited.ssi.getShardState);
DUMPTOKEN(recruited.ssi.waitMetrics);
DUMPTOKEN(recruited.ssi.splitMetrics);
DUMPTOKEN(recruited.ssi.getReadHotRanges);
DUMPTOKEN(recruited.ssi.getRangeSplitPoints);
DUMPTOKEN(recruited.ssi.getStorageMetrics);
DUMPTOKEN(recruited.ssi.waitFailure);
DUMPTOKEN(recruited.ssi.getQueuingMetrics);
DUMPTOKEN(recruited.ssi.getKeyValueStoreType);
DUMPTOKEN(recruited.ssi.watchValue);
DUMPTOKEN(recruited.ssi.getKeyValuesStream);
DUMPTOKEN(recruited.ssi.changeFeedStream);
DUMPTOKEN(recruited.ssi.changeFeedPop);
DUMPTOKEN(recruited.ssi.changeFeedVersionUpdate);
Future<Void> blobMigratorProcess = blobMigrator(recruited, dbInfo);
errorForwarders.add(forwardError(errors,

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbrpc/simulator.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbserver/Knobs.h"
@ -95,6 +96,7 @@ struct AtomicRestoreWorkload : TestWorkload {
TraceEvent("AtomicRestore_Start").detail("UsePartitionedLog", self->usePartitionedLogs);
state std::string backupContainer = "file://simfdb/backups/";
state DatabaseConfiguration conf = wait(getDatabaseConfiguration(cx));
try {
wait(backupAgent.submitBackup(cx,
StringRef(backupContainer),
@ -103,7 +105,8 @@ struct AtomicRestoreWorkload : TestWorkload {
deterministicRandom()->randomInt(0, 100),
BackupAgentBase::getDefaultTagName(),
self->backupRanges,
SERVER_KNOBS->ENABLE_ENCRYPTION,
SERVER_KNOBS->ENABLE_ENCRYPTION &&
conf.tenantMode != TenantMode::OPTIONAL_TENANT,
StopWhenDone::False,
self->usePartitionedLogs));
} catch (Error& e) {
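The same two-line change recurs across the backup workloads in this diff: an encrypted backup is requested only when encryption is enabled and the cluster is not in optional-tenant mode, presumably because tenant-based encryption domains are not well defined while raw, tenantless keyspace access is allowed. The recurring predicate, written out as a hypothetical helper for clarity (the workloads inline it instead):

// Hypothetical helper naming the predicate used by these workloads; not part
// of the diff itself.
bool shouldEncryptBackup(DatabaseConfiguration const& conf) {
    return SERVER_KNOBS->ENABLE_ENCRYPTION && conf.tenantMode != TenantMode::OPTIONAL_TENANT;
}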

View File

@ -215,7 +215,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
state std::string backupContainer = "file://simfdb/backups/";
state Future<Void> status = statusLoop(cx, tag.toString());
state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
try {
wait(backupAgent->submitBackup(cx,
StringRef(backupContainer),
@ -224,7 +224,8 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
deterministicRandom()->randomInt(0, 100),
tag.toString(),
backupRanges,
SERVER_KNOBS->ENABLE_ENCRYPTION,
SERVER_KNOBS->ENABLE_ENCRYPTION &&
configuration.tenantMode != TenantMode::OPTIONAL_TENANT,
StopWhenDone{ !stopDifferentialDelay },
self->usePartitionedLogs));
} catch (Error& e) {
@ -474,6 +475,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
// Occasionally start yet another backup that might still be running when we restore
if (!self->locked && BUGGIFY) {
TraceEvent("BARW_SubmitBackup2", randomID).detail("Tag", printable(self->backupTag));
state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
try {
// Note the "partitionedLog" must be false, because we change
// the configuration to disable backup workers before restore.
@ -484,7 +486,8 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
deterministicRandom()->randomInt(0, 100),
self->backupTag.toString(),
self->backupRanges,
SERVER_KNOBS->ENABLE_ENCRYPTION,
SERVER_KNOBS->ENABLE_ENCRYPTION &&
configuration.tenantMode != TenantMode::OPTIONAL_TENANT,
StopWhenDone::True,
UsePartitionedLog::False);
} catch (Error& e) {

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbrpc/simulator.h"
#include "fdbclient/BackupAgent.actor.h"
@ -331,7 +332,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
state std::string backupContainer = "file://simfdb/backups/";
state Future<Void> status = statusLoop(cx, tag.toString());
state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
try {
wait(backupAgent->submitBackup(cx,
StringRef(backupContainer),
@ -340,7 +341,8 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
deterministicRandom()->randomInt(0, 2000),
tag.toString(),
backupRanges,
SERVER_KNOBS->ENABLE_ENCRYPTION,
SERVER_KNOBS->ENABLE_ENCRYPTION &&
configuration.tenantMode != TenantMode::OPTIONAL_TENANT,
StopWhenDone{ !stopDifferentialDelay },
UsePartitionedLog::False,
IncrementalBackupOnly::False,
@ -515,6 +517,42 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
return Void();
}
ACTOR static Future<Void> clearAndRestoreSystemKeys(Database cx,
BackupAndRestoreCorrectnessWorkload* self,
FileBackupAgent* backupAgent,
Version targetVersion,
Reference<IBackupContainer> lastBackupContainer,
Standalone<VectorRef<KeyRangeRef>> systemRestoreRanges) {
// restore system keys before restoring any other ranges
wait(runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
for (auto& range : systemRestoreRanges)
tr->clear(range);
return Void();
}));
state Standalone<StringRef> restoreTag(self->backupTag.toString() + "_system");
printf("BackupCorrectness, backupAgent.restore is called for tag:%s\n", restoreTag.toString().c_str());
wait(success(backupAgent->restore(cx,
cx,
restoreTag,
KeyRef(lastBackupContainer->getURL()),
lastBackupContainer->getProxy(),
systemRestoreRanges,
WaitForComplete::True,
targetVersion,
Verbose::True,
Key(),
Key(),
self->locked,
UnlockDB::True,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
self->encryptionKeyFileName)));
printf("BackupCorrectness, backupAgent.restore finished for tag:%s\n", restoreTag.toString().c_str());
return Void();
}
ACTOR static Future<Void> _start(Database cx, BackupAndRestoreCorrectnessWorkload* self) {
state FileBackupAgent backupAgent;
state Future<Void> extraBackup;
@ -593,6 +631,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
// Occasionally start yet another backup that might still be running when we restore
if (!self->locked && BUGGIFY) {
TraceEvent("BARW_SubmitBackup2", randomID).detail("Tag", printable(self->backupTag));
state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
try {
extraBackup = backupAgent.submitBackup(cx,
"file://simfdb/backups/"_sr,
@ -601,7 +640,8 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
deterministicRandom()->randomInt(0, 100),
self->backupTag.toString(),
self->backupRanges,
SERVER_KNOBS->ENABLE_ENCRYPTION,
SERVER_KNOBS->ENABLE_ENCRYPTION &&
configuration.tenantMode != TenantMode::OPTIONAL_TENANT,
StopWhenDone::True);
} catch (Error& e) {
TraceEvent("BARW_SubmitBackup2Exception", randomID)
@ -638,7 +678,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
lastBackupContainer->getEncryptionKeyFileName());
BackupDescription desc = wait(container->describeBackup());
Version targetVersion = -1;
state Version targetVersion = -1;
if (desc.maxRestorableVersion.present()) {
if (deterministicRandom()->random01() < 0.1) {
targetVersion = desc.minRestorableVersion.get();
@ -656,6 +696,32 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
state std::vector<Standalone<StringRef>> restoreTags;
state bool multipleRangesInOneTag = false;
state int restoreIndex = 0;
// make sure system keys are not present in the restoreRanges, as they will be restored first, separately
// from the rest
Standalone<VectorRef<KeyRangeRef>> modifiedRestoreRanges;
Standalone<VectorRef<KeyRangeRef>> systemRestoreRanges;
for (int i = 0; i < self->restoreRanges.size(); ++i) {
if (!SERVER_KNOBS->ENABLE_ENCRYPTION ||
!self->restoreRanges[i].intersects(getSystemBackupRanges())) {
modifiedRestoreRanges.push_back_deep(modifiedRestoreRanges.arena(), self->restoreRanges[i]);
} else {
KeyRangeRef normalKeyRange = self->restoreRanges[i] & normalKeys;
KeyRangeRef systemKeyRange = self->restoreRanges[i] & systemKeys;
if (!normalKeyRange.empty()) {
modifiedRestoreRanges.push_back_deep(modifiedRestoreRanges.arena(), normalKeyRange);
}
if (!systemKeyRange.empty()) {
systemRestoreRanges.push_back_deep(systemRestoreRanges.arena(), systemKeyRange);
}
}
}
self->restoreRanges = modifiedRestoreRanges;
if (!systemRestoreRanges.empty()) {
// We are able to restore system keys first since we restore an entire cluster at once rather than
// partial key ranges.
wait(clearAndRestoreSystemKeys(
cx, self, &backupAgent, targetVersion, lastBackupContainer, systemRestoreRanges));
}
if (deterministicRandom()->random01() < 0.5) {
for (restoreIndex = 0; restoreIndex < self->restoreRanges.size(); restoreIndex++) {
auto range = self->restoreRanges[restoreIndex];
@ -703,6 +769,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
Key(),
Key(),
self->locked,
UnlockDB::True,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
@ -735,6 +802,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
Key(),
Key(),
self->locked,
UnlockDB::True,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
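When encryption is enabled, each restoring workload in this diff first peels system-key subranges off the requested ranges so they can be restored ahead of, and separately from, user data. The idiom relies on the KeyRangeRef intersections seen above; a condensed sketch (requestedRanges stands in for the workload's range list):

// Condensed form of the splitting loop used in the workloads above.
Standalone<VectorRef<KeyRangeRef>> userRanges, systemRanges;
for (auto const& r : requestedRanges) {
    if (!SERVER_KNOBS->ENABLE_ENCRYPTION || !r.intersects(getSystemBackupRanges())) {
        userRanges.push_back_deep(userRanges.arena(), r);
        continue;
    }
    KeyRangeRef normalPart = r & normalKeys; // user-key portion
    KeyRangeRef systemPart = r & systemKeys; // \xff-prefixed portion
    if (!normalPart.empty())
        userRanges.push_back_deep(userRanges.arena(), normalPart);
    if (!systemPart.empty())
        systemRanges.push_back_deep(systemRanges.arena(), systemPart);
}
// systemRanges are then restored first (clearAndRestoreSystemKeys above),
// followed by userRanges.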

View File

@ -21,6 +21,7 @@
#include "fdbrpc/simulator.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/workloads/BlobStoreWorkload.h"
#include "fdbserver/workloads/workloads.actor.h"
@ -57,6 +58,7 @@ struct BackupToBlobWorkload : TestWorkload {
addDefaultBackupRanges(backupRanges);
wait(delay(self->backupAfter));
state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
wait(backupAgent.submitBackup(cx,
self->backupURL,
{},
@ -64,7 +66,8 @@ struct BackupToBlobWorkload : TestWorkload {
self->snapshotInterval,
self->backupTag.toString(),
backupRanges,
SERVER_KNOBS->ENABLE_ENCRYPTION));
SERVER_KNOBS->ENABLE_ENCRYPTION &&
configuration.tenantMode != TenantMode::OPTIONAL_TENANT));
EBackupState backupStatus = wait(backupAgent.waitBackup(cx, self->backupTag.toString(), StopWhenDone::True));
TraceEvent("BackupToBlob_BackupStatus").detail("Status", BackupAgentBase::getStateText(backupStatus));
return Void();

View File

@ -22,6 +22,7 @@
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/ClusterConnectionMemoryRecord.h"
#include "fdbclient/TenantManagement.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
#include "flow/ApiVersion.h"
@ -667,10 +668,47 @@ struct BackupToDBCorrectnessWorkload : TestWorkload {
// wait(diffRanges(self->backupRanges, self->backupPrefix, cx, self->extraDB));
state Standalone<VectorRef<KeyRangeRef>> restoreRange;
state Standalone<VectorRef<KeyRangeRef>> systemRestoreRange;
for (auto r : self->backupRanges) {
restoreRange.push_back_deep(
restoreRange.arena(),
KeyRangeRef(r.begin.withPrefix(self->backupPrefix), r.end.withPrefix(self->backupPrefix)));
if (!SERVER_KNOBS->ENABLE_ENCRYPTION || !r.intersects(getSystemBackupRanges())) {
restoreRange.push_back_deep(
restoreRange.arena(),
KeyRangeRef(r.begin.withPrefix(self->backupPrefix), r.end.withPrefix(self->backupPrefix)));
} else {
KeyRangeRef normalKeyRange = r & normalKeys;
KeyRangeRef systemKeyRange = r & systemKeys;
if (!normalKeyRange.empty()) {
restoreRange.push_back_deep(restoreRange.arena(),
KeyRangeRef(normalKeyRange.begin.withPrefix(self->backupPrefix),
normalKeyRange.end.withPrefix(self->backupPrefix)));
}
if (!systemKeyRange.empty()) {
systemRestoreRange.push_back_deep(systemRestoreRange.arena(), systemKeyRange);
}
}
}
// restore system keys first before restoring user data
if (!systemRestoreRange.empty()) {
state Key systemRestoreTag = "restore_system"_sr;
try {
wait(restoreTool.submitBackup(cx,
systemRestoreTag,
systemRestoreRange,
StopWhenDone::True,
StringRef(),
self->backupPrefix,
self->locked,
DatabaseBackupAgent::PreBackupAction::CLEAR));
} catch (Error& e) {
TraceEvent("BARW_DoBackupSubmitBackupException", randomID)
.error(e)
.detail("Tag", printable(systemRestoreTag));
if (e.code() != error_code_backup_unneeded && e.code() != error_code_backup_duplicate)
throw;
}
wait(success(restoreTool.waitBackup(cx, systemRestoreTag)));
wait(restoreTool.unlockBackup(cx, systemRestoreTag));
}
try {

View File

@ -105,16 +105,6 @@ struct BlobGranuleRangesWorkload : TestWorkload {
}
}
ACTOR Future<bool> setRange(Database cx, KeyRange range, bool active, Optional<TenantName> tenantName) {
if (active) {
bool success = wait(cx->blobbifyRange(range, tenantName));
return success;
} else {
bool success = wait(cx->unblobbifyRange(range, tenantName));
return success;
}
}
ACTOR Future<Void> registerNewRange(Database cx, BlobGranuleRangesWorkload* self, Optional<TenantName> tenantName) {
std::string nextRangeKey = "R_" + self->newKey();
state KeyRange range(KeyRangeRef(StringRef(nextRangeKey), strinc(StringRef(nextRangeKey))));
@ -124,8 +114,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
// don't put in active ranges until AFTER the set range command succeeds, to avoid checking a range that may
// not have been initialized
bool success =
wait(self->setRange(cx, range, true, tenantName.present() ? tenantName.get() : self->tenantName));
bool success = wait(cx->blobbifyRange(range, tenantName.present() ? tenantName.get() : self->tenantName));
ASSERT(success);
if (BGRW_DEBUG) {
@ -163,7 +152,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
Key purgeKey = wait(self->versionedForcePurge(cx, range, self->tenantName));
wait(cx->waitPurgeGranulesComplete(purgeKey));
}
bool success = wait(self->setRange(cx, range, false, self->tenantName));
bool success = wait(cx->unblobbifyRange(range, self->tenantName));
ASSERT(success);
if (BGRW_DEBUG) {
@ -356,7 +345,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
// tear down range at end
Key purgeKey = wait(self->versionedForcePurge(cx, range, self->tenantName));
wait(cx->waitPurgeGranulesComplete(purgeKey));
bool success = wait(self->setRange(cx, range, false, self->tenantName));
bool success = wait(cx->unblobbifyRange(range, self->tenantName));
ASSERT(success);
if (BGRW_DEBUG) {
@ -373,7 +362,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
if (BGRW_DEBUG) {
fmt::print("VerifyRangeUnit: [{0} - {1})\n", range.begin.printable(), range.end.printable());
}
bool setSuccess = wait(self->setRange(cx, activeRange, true, self->tenantName));
bool setSuccess = wait(cx->blobbifyRange(activeRange, self->tenantName));
ASSERT(setSuccess);
wait(self->checkRange(cx, self, activeRange, true));
@ -426,7 +415,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
for (i = 0; i < rangeCount; i++) {
state KeyRange subRange(KeyRangeRef(boundaries[i], boundaries[i + 1]));
if (i != rangeToNotBlobbify) {
bool setSuccess = wait(self->setRange(cx, subRange, true, self->tenantName));
bool setSuccess = wait(cx->blobbifyRange(subRange, self->tenantName));
ASSERT(setSuccess);
wait(self->checkRange(cx, self, subRange, true));
} else {
@ -473,7 +462,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
}
ACTOR Future<Void> rangesMisalignedUnit(Database cx, BlobGranuleRangesWorkload* self, KeyRange range) {
bool setSuccess = wait(self->setRange(cx, range, true, self->tenantName));
bool setSuccess = wait(cx->blobbifyRange(range, self->tenantName));
ASSERT(setSuccess);
state KeyRange subRange(KeyRangeRef(range.begin.withSuffix("A"_sr), range.begin.withSuffix("B"_sr)));
@ -526,42 +515,42 @@ struct BlobGranuleRangesWorkload : TestWorkload {
// unblobbifying a range that doesn't already exist should be a no-op
if (deterministicRandom()->coinflip()) {
bool unblobbifyStartSuccess = wait(self->setRange(cx, activeRange, false, self->tenantName));
bool unblobbifyStartSuccess = wait(cx->blobbifyRange(activeRange, self->tenantName));
ASSERT(unblobbifyStartSuccess);
}
bool success = wait(self->setRange(cx, activeRange, true, self->tenantName));
bool success = wait(cx->blobbifyRange(activeRange, self->tenantName));
ASSERT(success);
wait(self->checkRange(cx, self, activeRange, true));
// check that re-blobbifying same range is successful
bool retrySuccess = wait(self->setRange(cx, activeRange, true, self->tenantName));
bool retrySuccess = wait(cx->blobbifyRange(activeRange, self->tenantName));
ASSERT(retrySuccess);
wait(self->checkRange(cx, self, activeRange, true));
// check that blobbifying range that overlaps but does not match existing blob range fails
bool fail1 = wait(self->setRange(cx, range, true, self->tenantName));
bool fail1 = wait(cx->blobbifyRange(range, self->tenantName));
ASSERT(!fail1);
bool fail2 = wait(self->setRange(cx, KeyRangeRef(range.begin, activeRange.end), true, self->tenantName));
bool fail2 = wait(cx->blobbifyRange(KeyRangeRef(range.begin, activeRange.end), self->tenantName));
ASSERT(!fail2);
bool fail3 = wait(self->setRange(cx, KeyRangeRef(activeRange.begin, range.end), true, self->tenantName));
bool fail3 = wait(cx->blobbifyRange(KeyRangeRef(activeRange.begin, range.end), self->tenantName));
ASSERT(!fail3);
bool fail4 = wait(self->setRange(cx, KeyRangeRef(range.begin, middleKey), true, self->tenantName));
bool fail4 = wait(cx->blobbifyRange(KeyRangeRef(range.begin, middleKey), self->tenantName));
ASSERT(!fail4);
bool fail5 = wait(self->setRange(cx, KeyRangeRef(middleKey, range.end), true, self->tenantName));
bool fail5 = wait(cx->blobbifyRange(KeyRangeRef(middleKey, range.end), self->tenantName));
ASSERT(!fail5);
bool fail6 = wait(self->setRange(cx, KeyRangeRef(activeRange.begin, middleKey), true, self->tenantName));
bool fail6 = wait(cx->blobbifyRange(KeyRangeRef(activeRange.begin, middleKey), self->tenantName));
ASSERT(!fail6);
bool fail7 = wait(self->setRange(cx, KeyRangeRef(middleKey, activeRange.end), true, self->tenantName));
bool fail7 = wait(cx->blobbifyRange(KeyRangeRef(middleKey, activeRange.end), self->tenantName));
ASSERT(!fail7);
bool fail8 = wait(self->setRange(cx, KeyRangeRef(middleKey, middleKey2), true, self->tenantName));
bool fail8 = wait(cx->blobbifyRange(KeyRangeRef(middleKey, middleKey2), self->tenantName));
ASSERT(!fail8);
{
@ -582,13 +571,14 @@ struct BlobGranuleRangesWorkload : TestWorkload {
}
}
// tear down + check that un-blobbifying at a non-aligned range also doesn't work
Key purgeKey = wait(self->versionedForcePurge(cx, activeRange, self->tenantName));
state Version purgeVersion = deterministicRandom()->coinflip() ? latestVersion : 1;
state KeyRangeRef purgeRange = deterministicRandom()->coinflip() ? activeRange : range;
Key purgeKey = wait(cx->purgeBlobGranules(purgeRange, purgeVersion, self->tenantName, true));
wait(cx->waitPurgeGranulesComplete(purgeKey));
if (deterministicRandom()->coinflip()) {
// force purge again and ensure it is idempotent
Key purgeKeyAgain = wait(cx->purgeBlobGranules(activeRange, 1, self->tenantName, true));
Key purgeKeyAgain = wait(cx->purgeBlobGranules(purgeRange, purgeVersion, self->tenantName, true));
wait(cx->waitPurgeGranulesComplete(purgeKeyAgain));
}
}
@ -600,41 +590,38 @@ struct BlobGranuleRangesWorkload : TestWorkload {
ASSERT(blobRanges.size() == 1);
ASSERT(blobRanges[0] == activeRange);
bool unblobbifyFail1 = wait(self->setRange(cx, range, false, self->tenantName));
bool unblobbifyFail1 = wait(cx->unblobbifyRange(range, self->tenantName));
ASSERT(!unblobbifyFail1);
bool unblobbifyFail2 =
wait(self->setRange(cx, KeyRangeRef(range.begin, activeRange.end), false, self->tenantName));
wait(cx->unblobbifyRange(KeyRangeRef(range.begin, activeRange.end), self->tenantName));
ASSERT(!unblobbifyFail2);
bool unblobbifyFail3 =
wait(self->setRange(cx, KeyRangeRef(activeRange.begin, range.end), false, self->tenantName));
wait(cx->unblobbifyRange(KeyRangeRef(activeRange.begin, range.end), self->tenantName));
ASSERT(!unblobbifyFail3);
bool unblobbifyFail4 =
wait(self->setRange(cx, KeyRangeRef(activeRange.begin, middleKey), false, self->tenantName));
wait(cx->unblobbifyRange(KeyRangeRef(activeRange.begin, middleKey), self->tenantName));
ASSERT(!unblobbifyFail4);
bool unblobbifyFail5 =
wait(self->setRange(cx, KeyRangeRef(middleKey, activeRange.end), false, self->tenantName));
bool unblobbifyFail5 = wait(cx->unblobbifyRange(KeyRangeRef(middleKey, activeRange.end), self->tenantName));
ASSERT(!unblobbifyFail5);
bool unblobbifyFail6 =
wait(self->setRange(cx, KeyRangeRef(activeRange.begin, middleKey), false, self->tenantName));
wait(cx->unblobbifyRange(KeyRangeRef(activeRange.begin, middleKey), self->tenantName));
ASSERT(!unblobbifyFail6);
bool unblobbifyFail7 =
wait(self->setRange(cx, KeyRangeRef(middleKey, activeRange.end), false, self->tenantName));
bool unblobbifyFail7 = wait(cx->unblobbifyRange(KeyRangeRef(middleKey, activeRange.end), self->tenantName));
ASSERT(!unblobbifyFail7);
bool unblobbifyFail8 =
wait(self->setRange(cx, KeyRangeRef(middleKey, middleKey2), false, self->tenantName));
bool unblobbifyFail8 = wait(cx->unblobbifyRange(KeyRangeRef(middleKey, middleKey2), self->tenantName));
ASSERT(!unblobbifyFail8);
bool unblobbifySuccess = wait(self->setRange(cx, activeRange, true, self->tenantName));
bool unblobbifySuccess = wait(cx->unblobbifyRange(activeRange, self->tenantName));
ASSERT(unblobbifySuccess);
bool unblobbifySuccessAgain = wait(self->setRange(cx, activeRange, true, self->tenantName));
bool unblobbifySuccessAgain = wait(cx->unblobbifyRange(activeRange, self->tenantName));
ASSERT(unblobbifySuccessAgain);
}
@ -642,7 +629,7 @@ struct BlobGranuleRangesWorkload : TestWorkload {
}
ACTOR Future<Void> reBlobbifyUnit(Database cx, BlobGranuleRangesWorkload* self, KeyRange range) {
bool setSuccess = wait(self->setRange(cx, range, true, self->tenantName));
bool setSuccess = wait(cx->blobbifyRange(range, self->tenantName));
ASSERT(setSuccess);
wait(self->checkRange(cx, self, range, true));
@ -651,11 +638,11 @@ struct BlobGranuleRangesWorkload : TestWorkload {
wait(cx->waitPurgeGranulesComplete(purgeKey));
wait(self->checkRange(cx, self, range, false));
bool unsetSuccess = wait(self->setRange(cx, range, false, self->tenantName));
bool unsetSuccess = wait(cx->unblobbifyRange(range, self->tenantName));
ASSERT(unsetSuccess);
wait(self->checkRange(cx, self, range, false));
bool reSetSuccess = wait(self->setRange(cx, range, true, self->tenantName));
bool reSetSuccess = wait(cx->blobbifyRange(range, self->tenantName));
ASSERT(reSetSuccess);
wait(self->checkRange(cx, self, range, true));
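Taken together, the asserts in this workload pin down the alignment contract of the blob range API: blobbifyRange and unblobbifyRange succeed (and are idempotent) on a range that exactly matches existing blob range boundaries, and return false for any request that overlaps a blob range without matching it. A compact restatement of the asserts above (an interpretation, not new API):

// Given an active blob range activeRange inside a larger key space `range`:
//   blobbifyRange(activeRange)                  -> true  (idempotent re-blobbify)
//   blobbifyRange(range or any partial overlap) -> false (misaligned)
//   unblobbifyRange(activeRange)                -> true  (exact match)
//   unblobbifyRange(any misaligned overlap)     -> false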

View File

@ -305,6 +305,8 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
state Version prevPurgeVersion = -1;
state UID dbgId = debugRandom()->randomUniqueID();
state Version newPurgeVersion = 0;
// usually we want randomness to verify the maximum amount of data, but sometimes hotspotting a subset is good too
state bool pickGranuleUniform = deterministicRandom()->random01() < 0.1;
TraceEvent("BlobGranuleVerifierStart");
if (BGV_DEBUG) {
@ -458,7 +460,13 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
}
// pick a random range
int rIndex = deterministicRandom()->randomInt(0, self->granuleRanges.get().size());
size_t granuleCount = self->granuleRanges.get().size();
size_t rIndex;
if (pickGranuleUniform) {
rIndex = deterministicRandom()->randomInt(0, granuleCount);
} else {
rIndex = deterministicRandom()->randomSkewedUInt32(0, granuleCount);
}
state KeyRange range = self->granuleRanges.get()[rIndex];
state std::pair<RangeResult, Version> fdb = wait(readFromFDB(cx, range));

View File

@ -30,7 +30,7 @@
#include "flow/IRateControl.h"
#include "fdbrpc/simulator.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/StorageMetrics.actor.h"
#include "fdbserver/DataDistribution.actor.h"
#include "fdbserver/QuietDatabase.h"
#include "fdbserver/TSSMappingUtil.actor.h"
@ -394,6 +394,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
state Standalone<VectorRef<KeyValueRef>>
serverList; // "\xff/serverList/[[serverID]]" := "[[StorageServerInterface]]"
state Standalone<VectorRef<KeyValueRef>> serverTag; // "\xff/serverTag/[[serverID]]" = "[[Tag]]"
state bool testResult = true;
std::vector<Future<bool>> cacheResultsPromise;
cacheResultsPromise.push_back(self->fetchKeyValuesFromSS(cx, self, storageCacheKeys, cacheKeyPromise, true));
@ -581,7 +582,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
for (j = 0; j < keyValueFutures.size(); j++) {
ErrorOr<GetKeyValuesReply> rangeResult = keyValueFutures[j].get();
// if (rangeResult.isError()) {
// throw rangeResult.getError();
// throw rangeResult.getError();
// }
// Compare the results with other storage servers
@ -709,7 +710,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
.detail("MatchingKVPairs", matchingKVPairs);
self->testFailure("Data inconsistent", true);
return false;
testResult = false;
}
}
}
@ -755,7 +756,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
.detail("BytesRead", bytesReadInRange);
}
}
return true;
return testResult;
}
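Previously the checker returned false at the first inconsistent range, so one failure could mask others in the same pass. The change records the failure in testResult and keeps scanning, returning the accumulated verdict only at the end. The pattern in isolation (a generic illustration, not the workload's code):

#include <vector>

// Accumulate-and-continue: report every failure instead of stopping at the first.
bool checkAll(const std::vector<bool>& rangeConsistent) {
    bool testResult = true;
    for (bool ok : rangeConsistent) {
        if (!ok) {
            // report the failure, then keep checking the remaining ranges
            testResult = false; // previously: return false;
        }
    }
    return testResult;
}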
// Directly fetch key/values from storage servers through GetKeyValuesRequest

View File

@ -20,6 +20,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbrpc/simulator.h"
@ -150,6 +151,7 @@ struct IncrementalBackupWorkload : TestWorkload {
if (self->submitOnly) {
TraceEvent("IBackupSubmitAttempt").log();
state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
try {
wait(self->backupAgent.submitBackup(cx,
self->backupDir,
@ -158,7 +160,8 @@ struct IncrementalBackupWorkload : TestWorkload {
1e8,
self->tag.toString(),
backupRanges,
SERVER_KNOBS->ENABLE_ENCRYPTION,
SERVER_KNOBS->ENABLE_ENCRYPTION &&
configuration.tenantMode != TenantMode::OPTIONAL_TENANT,
StopWhenDone::False,
UsePartitionedLog::False,
IncrementalBackupOnly::True));
@ -227,19 +230,56 @@ struct IncrementalBackupWorkload : TestWorkload {
.detail("Size", containers.size())
.detail("First", containers.front());
state Key backupURL = Key(containers.front());
state Standalone<VectorRef<KeyRangeRef>> restoreRange;
state Standalone<VectorRef<KeyRangeRef>> systemRestoreRange;
for (auto r : backupRanges) {
if (!SERVER_KNOBS->ENABLE_ENCRYPTION || !r.intersects(getSystemBackupRanges())) {
restoreRange.push_back_deep(restoreRange.arena(), r);
} else {
KeyRangeRef normalKeyRange = r & normalKeys;
KeyRangeRef systemKeyRange = r & systemKeys;
if (!normalKeyRange.empty()) {
restoreRange.push_back_deep(restoreRange.arena(), normalKeyRange);
}
if (!systemKeyRange.empty()) {
systemRestoreRange.push_back_deep(systemRestoreRange.arena(), systemKeyRange);
}
}
}
if (!systemRestoreRange.empty()) {
TraceEvent("IBackupSystemRestoreAttempt").detail("BeginVersion", beginVersion);
wait(success(self->backupAgent.restore(cx,
cx,
"system_restore"_sr,
backupURL,
{},
systemRestoreRange,
WaitForComplete::True,
invalidVersion,
Verbose::True,
Key(),
Key(),
LockDB::True,
UnlockDB::True,
OnlyApplyMutationLogs::True,
InconsistentSnapshotOnly::False,
beginVersion)));
}
TraceEvent("IBackupRestoreAttempt").detail("BeginVersion", beginVersion);
wait(success(self->backupAgent.restore(cx,
cx,
Key(self->tag.toString()),
backupURL,
{},
backupRanges,
restoreRange,
WaitForComplete::True,
invalidVersion,
Verbose::True,
Key(),
Key(),
LockDB::True,
UnlockDB::True,
OnlyApplyMutationLogs::True,
InconsistentSnapshotOnly::False,
beginVersion)));

View File

@ -24,6 +24,7 @@
#include "fdbrpc/simulator.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
@ -113,14 +114,43 @@ struct RestoreBackupWorkload : TestWorkload {
wait(delay(self->delayFor));
wait(waitOnBackup(self, cx));
wait(clearDatabase(cx));
wait(success(self->backupAgent.restore(cx,
cx,
self->tag,
Key(self->backupContainer->getURL()),
self->backupContainer->getProxy(),
WaitForComplete::True,
::invalidVersion,
Verbose::True)));
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
// restore system keys
VectorRef<KeyRangeRef> systemBackupRanges = getSystemBackupRanges();
state std::vector<Future<Version>> restores;
for (int i = 0; i < systemBackupRanges.size(); i++) {
restores.push_back((self->backupAgent.restore(cx,
cx,
"system_restore"_sr,
Key(self->backupContainer->getURL()),
self->backupContainer->getProxy(),
WaitForComplete::True,
::invalidVersion,
Verbose::True,
systemBackupRanges[i])));
}
wait(waitForAll(restores));
// restore non-system keys
wait(success(self->backupAgent.restore(cx,
cx,
self->tag,
Key(self->backupContainer->getURL()),
self->backupContainer->getProxy(),
WaitForComplete::True,
::invalidVersion,
Verbose::True,
normalKeys)));
} else {
wait(success(self->backupAgent.restore(cx,
cx,
self->tag,
Key(self->backupContainer->getURL()),
self->backupContainer->getProxy(),
WaitForComplete::True,
::invalidVersion,
Verbose::True)));
}
return Void();
}

View File

@ -18,9 +18,11 @@
* limitations under the License.
*/
#include "fdbclient/SystemData.h"
#include "fdbrpc/simulator.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/workloads/BlobStoreWorkload.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
@ -52,13 +54,22 @@ struct RestoreFromBlobWorkload : TestWorkload {
ACTOR static Future<Void> _start(Database cx, RestoreFromBlobWorkload* self) {
state FileBackupAgent backupAgent;
state Standalone<VectorRef<KeyRangeRef>> restoreRanges;
addDefaultBackupRanges(restoreRanges);
wait(delay(self->restoreAfter));
Version v = wait(
backupAgent.restore(cx, {}, self->backupTag, self->backupURL, {}, restoreRanges, self->waitForComplete));
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
// restore system keys followed by user keys
wait(success(backupAgent.restore(
cx, {}, self->backupTag, self->backupURL, {}, getSystemBackupRanges(), self->waitForComplete)));
Standalone<VectorRef<KeyRangeRef>> restoreRanges;
restoreRanges.push_back_deep(restoreRanges.arena(), normalKeys);
wait(success(backupAgent.restore(
cx, {}, self->backupTag, self->backupURL, {}, restoreRanges, self->waitForComplete)));
} else {
Standalone<VectorRef<KeyRangeRef>> restoreRanges;
addDefaultBackupRanges(restoreRanges);
wait(success(backupAgent.restore(
cx, {}, self->backupTag, self->backupURL, {}, restoreRanges, self->waitForComplete)));
}
return Void();
}

View File

@ -38,17 +38,17 @@ struct StorageQuotaWorkload : TestWorkload {
wait(setStorageQuotaHelper(cx, "name2"_sr, 200));
wait(setStorageQuotaHelper(cx, "name1"_sr, 300));
state Optional<uint64_t> quota1 = wait(getStorageQuotaHelper(cx, "name1"_sr));
state Optional<int64_t> quota1 = wait(getStorageQuotaHelper(cx, "name1"_sr));
ASSERT(quota1.present() && quota1.get() == 300);
state Optional<uint64_t> quota2 = wait(getStorageQuotaHelper(cx, "name2"_sr));
state Optional<int64_t> quota2 = wait(getStorageQuotaHelper(cx, "name2"_sr));
ASSERT(quota2.present() && quota2.get() == 200);
state Optional<uint64_t> quota3 = wait(getStorageQuotaHelper(cx, "name3"_sr));
state Optional<int64_t> quota3 = wait(getStorageQuotaHelper(cx, "name3"_sr));
ASSERT(!quota3.present());
return Void();
}
ACTOR static Future<Void> setStorageQuotaHelper(Database cx, StringRef tenantName, uint64_t quota) {
ACTOR static Future<Void> setStorageQuotaHelper(Database cx, StringRef tenantName, int64_t quota) {
state Transaction tr(cx);
loop {
try {
@ -61,11 +61,11 @@ struct StorageQuotaWorkload : TestWorkload {
}
}
ACTOR static Future<Optional<uint64_t>> getStorageQuotaHelper(Database cx, StringRef tenantName) {
ACTOR static Future<Optional<int64_t>> getStorageQuotaHelper(Database cx, StringRef tenantName) {
state Transaction tr(cx);
loop {
try {
state Optional<uint64_t> quota = wait(getStorageQuota(&tr, tenantName));
state Optional<int64_t> quota = wait(getStorageQuota(&tr, tenantName));
wait(tr.commit());
return quota;
} catch (Error& e) {

View File

@ -19,6 +19,7 @@
*/
#include "fdbclient/FDBTypes.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbrpc/simulator.h"
#include "fdbclient/BackupAgent.actor.h"
@ -52,8 +53,9 @@ struct SubmitBackupWorkload : TestWorkload {
ACTOR static Future<Void> _start(SubmitBackupWorkload* self, Database cx) {
wait(delay(self->delayFor));
Standalone<VectorRef<KeyRangeRef>> backupRanges;
state Standalone<VectorRef<KeyRangeRef>> backupRanges;
addDefaultBackupRanges(backupRanges);
state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
try {
wait(self->backupAgent.submitBackup(cx,
self->backupDir,
@ -62,7 +64,8 @@ struct SubmitBackupWorkload : TestWorkload {
self->snapshotInterval,
self->tag.toString(),
backupRanges,
SERVER_KNOBS->ENABLE_ENCRYPTION,
SERVER_KNOBS->ENABLE_ENCRYPTION &&
configuration.tenantMode != TenantMode::OPTIONAL_TENANT,
self->stopWhenDone,
UsePartitionedLog::False,
self->incremental));

View File

@ -43,7 +43,7 @@ struct TagThrottleApiWorkload : TestWorkload {
}
Future<Void> start(Database const& cx) override {
if (this->clientId != 0)
if (SERVER_KNOBS->GLOBAL_TAG_THROTTLING || this->clientId != 0)
return Void();
return timeout(runThrottleApi(this, cx), testDuration, Void());
}

View File

@ -1,5 +1,5 @@
/*
* GlobalTagThrottling.actor.cpp
* ThroughputQuota.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
@ -23,42 +23,46 @@
#include "flow/actorcompiler.h" // This must be the last #include.
class GlobalTagThrottlingWorkload : public TestWorkload {
// This workload sets the throughput quota of a tag during the setup phase
class ThroughputQuotaWorkload : public TestWorkload {
TransactionTag transactionTag;
double reservedQuota{ 0.0 };
double totalQuota{ 0.0 };
ACTOR static Future<Void> setup(GlobalTagThrottlingWorkload* self, Database cx) {
ACTOR static Future<Void> setup(ThroughputQuotaWorkload* self, Database cx) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
TraceEvent("GlobalTagThrottlingWorkload_SettingTagQuota")
.detail("Tag", self->transactionTag)
TraceEvent("ThroughputQuotaWorkload_SettingTagQuota")
.detail("Tag", printable(self->transactionTag))
.detail("ReservedQuota", self->reservedQuota)
.detail("TotalQuota", self->totalQuota);
ThrottleApi::setTagQuota(tr, self->transactionTag, self->reservedQuota, self->totalQuota);
wait(tr->commit());
return Void();
} catch (Error& e) {
TraceEvent("GlobalTagThrottlingWorkload_SetupError").error(e);
TraceEvent("ThroughputQuotaWorkload_SetupError").error(e);
wait(tr->onError(e));
}
};
}
public:
static constexpr auto NAME = "GlobalTagThrottling";
explicit GlobalTagThrottlingWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
static constexpr auto NAME = "ThroughputQuota";
explicit ThroughputQuotaWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
transactionTag = getOption(options, "transactionTag"_sr, "sampleTag"_sr);
reservedQuota = getOption(options, "reservedQuota"_sr, 0.0);
totalQuota = getOption(options, "totalQuota"_sr, 0.0);
}
Future<Void> setup(Database const& cx) override { return clientId ? Void() : setup(this, cx); }
Future<Void> setup(Database const& cx) override {
DatabaseContext::debugUseTags = true;
return clientId ? Void() : setup(this, cx);
}
Future<Void> start(Database const& cx) override { return Void(); }
Future<bool> check(Database const& cx) override { return true; }
void getMetrics(std::vector<PerfMetric>& m) override {}
};
WorkloadFactory<GlobalTagThrottlingWorkload> GlobalTagThrottlingWorkloadFactory;
WorkloadFactory<ThroughputQuotaWorkload> ThroughputQuotaWorkloadFactory;

View File

@ -131,6 +131,7 @@ ERROR( please_reboot_kv_store, 1219, "Need to reboot the storage engine")
ERROR( incompatible_software_version, 1220, "Current software does not support database format" )
ERROR( audit_storage_failed, 1221, "Validate storage consistency operation failed" )
ERROR( audit_storage_exceeded_request_limit, 1222, "Exceeded the max number of allowed concurrent audit storage requests" )
ERROR( proxy_tag_throttled, 1223, "Exceeded maximum proxy tag throttling duration" )
// 15xx Platform errors
ERROR( platform_error, 1500, "Platform error" )

View File

@ -178,13 +178,13 @@ RUN yum -y install \
rm -rf /var/cache/yum
WORKDIR /tmp
RUN curl -Ls https://amazon-eks.s3.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/kubectl -o kubectl && \
echo "08ff68159bbcb844455167abb1d0de75bbfe5ae1b051f81ab060a1988027868a kubectl" > kubectl.txt && \
RUN curl -Ls https://s3.us-west-2.amazonaws.com/amazon-eks/1.22.6/2022-03-09/bin/linux/amd64/kubectl -o kubectl && \
echo "860c3d37a5979491895767e7332404d28dc0d7797c7673c33df30ca80e215a07 kubectl" > kubectl.txt && \
sha256sum --quiet -c kubectl.txt && \
mv kubectl /usr/local/bin/kubectl && \
chmod 755 /usr/local/bin/kubectl && \
curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.2.43.zip -o "awscliv2.zip" && \
echo "9a8b3c4e7f72bbcc55e341dce3af42479f2730c225d6d265ee6f9162cfdebdfd awscliv2.zip" > awscliv2.txt && \
curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.7.34.zip -o "awscliv2.zip" && \
echo "daf9253f0071b5cfee9532bc5220bedd7a5d29d4e0f92b42b9e3e4c496341e88 awscliv2.zip" > awscliv2.txt && \
sha256sum --quiet -c awscliv2.txt && \
unzip -qq awscliv2.zip && \
./aws/install && \

View File

@ -53,13 +53,13 @@ RUN curl -Ls https://github.com/krallin/tini/releases/download/v0.19.0/tini-amd6
mv tini /usr/bin/ && \
rm -rf /tmp/*
RUN curl -Ls https://amazon-eks.s3.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/kubectl -o kubectl && \
echo "08ff68159bbcb844455167abb1d0de75bbfe5ae1b051f81ab060a1988027868a kubectl" > kubectl.txt && \
RUN curl -Ls https://s3.us-west-2.amazonaws.com/amazon-eks/1.22.6/2022-03-09/bin/linux/amd64/kubectl -o kubectl && \
echo "860c3d37a5979491895767e7332404d28dc0d7797c7673c33df30ca80e215a07 kubectl" > kubectl.txt && \
sha256sum --quiet -c kubectl.txt && \
mv kubectl /usr/local/bin/kubectl && \
chmod 755 /usr/local/bin/kubectl && \
curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.2.43.zip -o "awscliv2.zip" && \
echo "9a8b3c4e7f72bbcc55e341dce3af42479f2730c225d6d265ee6f9162cfdebdfd awscliv2.zip" > awscliv2.txt && \
curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.7.34.zip -o "awscliv2.zip" && \
echo "daf9253f0071b5cfee9532bc5220bedd7a5d29d4e0f92b42b9e3e4c496341e88 awscliv2.zip" > awscliv2.txt && \
sha256sum --quiet -c awscliv2.txt && \
unzip -qq awscliv2.zip && \
./aws/install && \

View File

@ -1,22 +1,44 @@
#!/usr/bin/env bash
set -Eeuxo pipefail
set -Eeuo pipefail
function logg () {
printf "##### $(date +'%Y-%m-%dT%H:%M:%SZ') # %-56.55s #####\n" "${1}"
}
function error_exit () {
echo "################################################################################"
logg "${0} FAILED"
logg "RUN_ID: ${RUN_ID}"
logg "WORKLOAD: ${WORKLOAD}"
logg "ENVIRONMENT IS:"
env
echo "################################################################################"
}
trap error_exit ERR
namespace=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace)
POD_NUM=$(echo $POD_NAME | cut -d - -f3)
KEY="ycsb_load_${POD_NUM}_of_${NUM_PODS}_complete"
CLI=$(ls /var/dynamic-conf/bin/*/fdbcli | head -n1)
echo "WAITING FOR ALL PODS TO COME UP"
while [[ $(kubectl get pods -n ${namespace} -l name=ycsb,run=${RUN_ID} --field-selector=status.phase=Running | grep -cv NAME) -lt ${NUM_PODS} ]]; do
logg "WAITING FOR ${NUM_PODS} PODS TO COME UP IN ${namespace}"
while [[ $(kubectl get pods -n "${namespace}" -l name=ycsb,run="${RUN_ID}" --field-selector=status.phase=Running | grep -cv NAME) -lt ${NUM_PODS} ]]; do
sleep 1
done
echo "ALL PODS ARE UP"
logg "${NUM_PODS} PODS ARE UP IN ${namespace}"
echo "RUNNING YCSB"
./bin/ycsb.sh ${MODE} foundationdb -s -P workloads/${WORKLOAD} ${YCSB_ARGS}
echo "YCSB FINISHED"
logg "RUNNING YCSB ${WORKLOAD}"
set -x
./bin/ycsb.sh "${MODE}" foundationdb -s -P "workloads/${WORKLOAD}" "${YCSB_ARGS}"
set +x
logg "YCSB ${WORKLOAD} FINISHED"
echo "COPYING HISTOGRAMS TO S3"
aws s3 sync --sse aws:kms --exclude "*" --include "histogram.*" /tmp s3://${BUCKET}/ycsb_histograms/${namespace}/${POD_NAME}
echo "COPYING HISTOGRAMS TO S3 FINISHED"
logg "COPYING HISTOGRAMS TO S3"
set -x
aws s3 sync --sse aws:kms --exclude "*" --include "histogram.*" /tmp "s3://${BUCKET}/ycsb_histograms/${namespace}/${POD_NAME}"
set +x
logg "COPYING HISTOGRAMS TO S3 FINISHED"
echo "################################################################################"
logg "COMPLETED ${0}"
logg "RUN_ID: ${RUN_ID}"
logg "WORKLOAD: ${WORKLOAD}"
echo "################################################################################"

View File

@ -226,7 +226,6 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES rare/CycleWithDeadHall.toml)
add_fdb_test(TEST_FILES rare/DataDistributionMetrics.toml)
add_fdb_test(TEST_FILES rare/FuzzTest.toml)
add_fdb_test(TEST_FILES rare/GlobalTagThrottling.toml IGNORE)
add_fdb_test(TEST_FILES rare/HighContentionPrefixAllocator.toml)
add_fdb_test(TEST_FILES rare/InventoryTestHeavyWrites.toml)
add_fdb_test(TEST_FILES rare/LargeApiCorrectness.toml)
@ -240,6 +239,7 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES rare/RedwoodCorrectnessBTree.toml)
add_fdb_test(TEST_FILES rare/RedwoodDeltaTree.toml)
add_fdb_test(TEST_FILES rare/Throttling.toml)
add_fdb_test(TEST_FILES rare/ThroughputQuota.toml)
add_fdb_test(TEST_FILES rare/TransactionTagApiCorrectness.toml)
add_fdb_test(TEST_FILES rare/TransactionTagSwizzledApiCorrectness.toml)
add_fdb_test(TEST_FILES rare/WriteTagThrottling.toml)
