Merge branch 'main' of https://github.com/apple/foundationdb into readaware

Xiaoxi Wang 2022-03-03 11:15:18 -08:00
commit a28b0f4361
147 changed files with 3979 additions and 2520 deletions

View File

@ -22,7 +22,7 @@ Contributing to FoundationDB can be in contributions to the code base, sharing y
### Binary downloads
Developers interested in using FoundationDB can get started by downloading and installing a binary package. Please see the [downloads page](https://www.foundationdb.org/download/) for a list of available packages.
Developers interested in using FoundationDB can get started by downloading and installing a binary package. Please see the [downloads page](https://github.com/apple/foundationdb/releases) for a list of available packages.
### Compiling from source
@ -181,4 +181,4 @@ Under Windows, only Visual Studio with ClangCl is supported
1. `mkdir build && cd build`
1. `cmake -G "Visual Studio 16 2019" -A x64 -T ClangCl <PATH_TO_FOUNDATIONDB_SOURCE>`
1. `msbuild /p:Configuration=Release foundationdb.sln`
1. To increase build performance, use `/p:UseMultiToolTask=true` and `/p:CL_MPCount=<NUMBER_OF_PARALLEL_JOBS>`
1. To increase build performance, use `/p:UseMultiToolTask=true` and `/p:CL_MPCount=<NUMBER_OF_PARALLEL_JOBS>`

View File

@ -18,6 +18,8 @@ endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
set(cpu "aarch64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le|powerpc64le)")
set(cpu "ppc64le")
endif()
set(IS_ARM_MAC NO)
@ -49,7 +51,7 @@ endif()
add_dependencies(fdb_c fdb_c_generated fdb_c_options)
add_dependencies(fdbclient fdb_c_options)
add_dependencies(fdbclient_sampling fdb_c_options)
target_link_libraries(fdb_c PUBLIC $<BUILD_INTERFACE:fdbclient>)
target_link_libraries(fdb_c PRIVATE $<BUILD_INTERFACE:fdbclient>)
if(APPLE)
set(symbols ${CMAKE_CURRENT_BINARY_DIR}/fdb_c.symbols)
add_custom_command(OUTPUT ${symbols}
@ -121,9 +123,9 @@ if(NOT WIN32 AND NOT IS_ARM_MAC)
strip_debug_symbols(fdb_c_ryw_benchmark)
strip_debug_symbols(fdb_c_txn_size_test)
endif()
target_link_libraries(fdb_c_performance_test PRIVATE fdb_c)
target_link_libraries(fdb_c_ryw_benchmark PRIVATE fdb_c)
target_link_libraries(fdb_c_txn_size_test PRIVATE fdb_c)
target_link_libraries(fdb_c_performance_test PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_ryw_benchmark PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_txn_size_test PRIVATE fdb_c Threads::Threads)
add_dependencies(fdb_c_setup_tests doctest)
add_dependencies(fdb_c_unit_tests doctest)
@ -134,14 +136,14 @@ if(NOT WIN32 AND NOT IS_ARM_MAC)
target_include_directories(fdb_c_unit_tests_version_510 PUBLIC ${DOCTEST_INCLUDE_DIR})
target_include_directories(disconnected_timeout_unit_tests PUBLIC ${DOCTEST_INCLUDE_DIR})
target_link_libraries(fdb_c_setup_tests PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_unit_tests PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_unit_tests PRIVATE fdb_c Threads::Threads fdbclient)
target_link_libraries(fdb_c_unit_tests_version_510 PRIVATE fdb_c Threads::Threads)
target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads)
target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads flow)
target_link_libraries(disconnected_timeout_unit_tests PRIVATE fdb_c Threads::Threads)
# do not set RPATH for mako
set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE)
target_link_libraries(mako PRIVATE fdb_c)
target_link_libraries(mako PRIVATE fdb_c fdbclient)
if(NOT OPEN_FOR_IDE)
# Make sure that fdb_c.h is compatible with c90

View File

@ -59,9 +59,10 @@ def write_windows_asm(asmfile, functions):
def write_unix_asm(asmfile, functions, prefix):
if cpu != "aarch64":
if cpu != "aarch64" and cpu!= "ppc64le":
asmfile.write(".intel_syntax noprefix\n")
i = 0
if os == 'linux' or os == 'freebsd':
asmfile.write("\n.data\n")
for f in functions:
@ -70,8 +71,13 @@ def write_unix_asm(asmfile, functions, prefix):
if os == 'linux' or os == 'freebsd':
asmfile.write("\n.text\n")
for f in functions:
if cpu == "ppc64le":
asmfile.write("\n.LC%d:\n" % (i))
asmfile.write("\t.quad \tfdb_api_ptr_%s\n" % (f))
asmfile.write("\t.align 2\n")
i = i + 1
asmfile.write("\t.global %s\n\t.type %s, @function\n" % (f, f))
i = 0
for f in functions:
asmfile.write("\n.globl %s%s\n" % (prefix, f))
if cpu == 'aarch64' and os == 'osx':
@ -118,6 +124,46 @@ def write_unix_asm(asmfile, functions, prefix):
assert False, '{} not supported for Arm yet'.format(os)
asmfile.write("\tldr x8, [x8]\n")
asmfile.write("\tbr x8\n")
elif cpu == "ppc64le":
asmfile.write("\n.LCF%d:\n" % (i))
asmfile.write("\taddis 2,12,.TOC.-.LCF%d@ha\n" % (i))
asmfile.write("\taddi 2,2,.TOC.-.LCF%d@l\n" % (i))
asmfile.write("\tmflr 0\n")
asmfile.write("\tstd 31, -8(1)\n")
asmfile.write("\tstd 0,16(1)\n")
asmfile.write("\tstdu 1,-192(1)\n")
#asmfile.write("\tstd 2,24(1)\n")
asmfile.write("\taddis 11,2,.LC%d@toc@ha\n" % (i))
asmfile.write("\tld 11,.LC%d@toc@l(11)\n" % (i))
asmfile.write("\tld 12,0(11)\n")
asmfile.write("\tstd 2,24(1)\n")
asmfile.write("\tlwa 11,344(1)\n")
asmfile.write("\tmtctr 12\n")
asmfile.write("\tstd 11,152(1)\n")
asmfile.write("\tlwa 11,352(1)\n")
asmfile.write("\tstd 11,160(1)\n")
asmfile.write("\tlwa 11,336(1)\n")
asmfile.write("\tstd 11,144(1)\n")
asmfile.write("\tlwa 11,328(1)\n")
asmfile.write("\tstd 11,136(1)\n")
asmfile.write("\tlwa 11,320(1)\n")
asmfile.write("\tstd 11,128(1)\n")
asmfile.write("\tlwa 11,312(1)\n")
asmfile.write("\tstd 11,120(1)\n")
asmfile.write("\tlwa 11,304(1)\n")
asmfile.write("\tstd 11,112(1)\n")
asmfile.write("\tld 11,296(1)\n")
asmfile.write("\tstd 11,104(1)\n")
asmfile.write("\tlwa 11,288(1)\n")
asmfile.write("\tstd 11,96(1)\n")
asmfile.write("\tbctrl\n")
asmfile.write("\tld 2,24(1)\n")
asmfile.write("\taddi 1,1,192\n")
asmfile.write("\tld 0,16(1)\n")
asmfile.write("\tld 31, -8(1)\n")
asmfile.write("\tmtlr 0\n")
asmfile.write("\tblr\n")
i = i + 1
else:
asmfile.write(
"\tmov r11, qword ptr [%sfdb_api_ptr_%s@GOTPCREL+rip]\n" % (prefix, f))

View File

@ -1,17 +1,18 @@
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#if defined(__linux__)
#include <linux/limits.h>
@ -615,7 +616,7 @@ int64_t granule_start_load(const char* filename,
// don't seek if offset == 0
if (offset && fseek(fp, offset, SEEK_SET)) {
// if fseek was non-zero, it failed
fprintf(stderr, "ERROR: BG could not seek to %ld in file %s\n", offset, full_fname);
fprintf(stderr, "ERROR: BG could not seek to %" PRId64 " in file %s\n", offset, full_fname);
fclose(fp);
return -1;
}
@ -625,7 +626,7 @@ int64_t granule_start_load(const char* filename,
fclose(fp);
if (readSize != length) {
fprintf(stderr, "ERROR: BG could not read %ld bytes from file: %s\n", length, full_fname);
fprintf(stderr, "ERROR: BG could not read %" PRId64 " bytes from file: %s\n", length, full_fname);
return -1;
}
@ -636,7 +637,7 @@ int64_t granule_start_load(const char* filename,
uint8_t* granule_get_load(int64_t loadId, void* userContext) {
BGLocalFileContext* context = (BGLocalFileContext*)userContext;
if (context->data_by_id[loadId] == 0) {
fprintf(stderr, "ERROR: BG loadId invalid for get_load: %ld\n", loadId);
fprintf(stderr, "ERROR: BG loadId invalid for get_load: %" PRId64 "\n", loadId);
return 0;
}
return context->data_by_id[loadId];
@ -645,7 +646,7 @@ uint8_t* granule_get_load(int64_t loadId, void* userContext) {
void granule_free_load(int64_t loadId, void* userContext) {
BGLocalFileContext* context = (BGLocalFileContext*)userContext;
if (context->data_by_id[loadId] == 0) {
fprintf(stderr, "ERROR: BG loadId invalid for free_load: %ld\n", loadId);
fprintf(stderr, "ERROR: BG loadId invalid for free_load: %" PRId64 "\n", loadId);
}
free(context->data_by_id[loadId]);
context->data_by_id[loadId] = 0;
@ -1119,7 +1120,7 @@ int run_workload(FDBTransaction* transaction,
if (tracetimer == dotrace) {
fdb_error_t err;
tracetimer = 0;
snprintf(traceid, 32, "makotrace%019ld", total_xacts);
snprintf(traceid, 32, "makotrace%019" PRId64, total_xacts);
fprintf(debugme, "DEBUG: txn tracing %s\n", traceid);
err = fdb_transaction_set_option(transaction,
FDB_TR_OPTION_DEBUG_TRANSACTION_IDENTIFIER,
@ -1283,7 +1284,7 @@ void* worker_thread(void* thread_args) {
}
fprintf(debugme,
"DEBUG: worker_id:%d (%d) thread_id:%d (%d) database_index:%lu (tid:%lu)\n",
"DEBUG: worker_id:%d (%d) thread_id:%d (%d) database_index:%lu (tid:%" PRIu64 ")\n",
worker_id,
args->num_processes,
thread_id,
@ -1350,6 +1351,11 @@ void* worker_thread(void* thread_args) {
char str2[1000];
sprintf(str2, "%s%d", TEMP_DATA_STORE, *parent_id);
rc = mkdir(str2, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
if (rc < 0) {
int ec = errno;
fprintf(stderr, "Failed to make directory: %s because %s\n", str2, strerror(ec));
goto failExit;
}
for (op = 0; op < MAX_OP; op++) {
if (args->txnspec.ops[op][OP_COUNT] > 0 || op == OP_COMMIT || op == OP_TRANSACTION) {
FILE* fp;
@ -1357,6 +1363,11 @@ void* worker_thread(void* thread_args) {
strcat(file_name, str2);
get_stats_file_name(file_name, worker_id, thread_id, op);
fp = fopen(file_name, "w");
if (!fp) {
int ec = errno;
fprintf(stderr, "Failed to open file: %s because %s\n", file_name, strerror(ec));
goto failExit;
}
lat_block_t* temp_block = ((thread_args_t*)thread_args)->block[op];
if (is_memory_allocated[op]) {
size = stats->latency_samples[op] / LAT_BLOCK_SIZE;
@ -1376,11 +1387,11 @@ void* worker_thread(void* thread_args) {
fclose(fp);
}
}
__sync_fetch_and_add(stopcount, 1);
}
/* fall through */
failExit:
__sync_fetch_and_add(stopcount, 1);
for (op = 0; op < MAX_OP; op++) {
lat_block_t* curr = ((thread_args_t*)thread_args)->block[op];
lat_block_t* prev = NULL;
@ -2240,9 +2251,9 @@ void print_stats(mako_args_t* args, mako_stats_t* stats, struct timespec* now, s
for (op = 0; op < MAX_OP; op++) {
if (args->txnspec.ops[op][OP_COUNT] > 0) {
uint64_t ops_total_diff = ops_total[op] - ops_total_prev[op];
printf("%" STR(STATS_FIELD_WIDTH) "lu ", ops_total_diff);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", ops_total_diff);
if (fp) {
fprintf(fp, "\"%s\": %lu,", get_ops_name(op), ops_total_diff);
fprintf(fp, "\"%s\": %" PRIu64 ",", get_ops_name(op), ops_total_diff);
}
errors_diff[op] = errors_total[op] - errors_total_prev[op];
print_err = (errors_diff[op] > 0);
@ -2270,7 +2281,7 @@ void print_stats(mako_args_t* args, mako_stats_t* stats, struct timespec* now, s
printf("%" STR(STATS_TITLE_WIDTH) "s ", "Errors");
for (op = 0; op < MAX_OP; op++) {
if (args->txnspec.ops[op][OP_COUNT] > 0) {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", errors_diff[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", errors_diff[op]);
if (fp) {
fprintf(fp, ",\"errors\": %.2f", conflicts_diff);
}
@ -2419,10 +2430,10 @@ void print_report(mako_args_t* args,
break;
}
}
printf("Total Xacts: %8lu\n", totalxacts);
printf("Total Conflicts: %8lu\n", conflicts);
printf("Total Errors: %8lu\n", totalerrors);
printf("Overall TPS: %8lu\n\n", totalxacts * 1000000000 / duration_nsec);
printf("Total Xacts: %8" PRIu64 "\n", totalxacts);
printf("Total Conflicts: %8" PRIu64 "\n", conflicts);
printf("Total Errors: %8" PRIu64 "\n", totalerrors);
printf("Overall TPS: %8" PRIu64 "\n\n", totalxacts * 1000000000 / duration_nsec);
if (fp) {
fprintf(fp, "\"results\": {");
@ -2430,10 +2441,10 @@ void print_report(mako_args_t* args,
fprintf(fp, "\"totalProcesses\": %d,", args->num_processes);
fprintf(fp, "\"totalThreads\": %d,", args->num_threads);
fprintf(fp, "\"targetTPS\": %d,", args->tpsmax);
fprintf(fp, "\"totalXacts\": %lu,", totalxacts);
fprintf(fp, "\"totalConflicts\": %lu,", conflicts);
fprintf(fp, "\"totalErrors\": %lu,", totalerrors);
fprintf(fp, "\"overallTPS\": %lu,", totalxacts * 1000000000 / duration_nsec);
fprintf(fp, "\"totalXacts\": %" PRIu64 ",", totalxacts);
fprintf(fp, "\"totalConflicts\": %" PRIu64 ",", conflicts);
fprintf(fp, "\"totalErrors\": %" PRIu64 ",", totalerrors);
fprintf(fp, "\"overallTPS\": %" PRIu64 ",", totalxacts * 1000000000 / duration_nsec);
}
/* per-op stats */
@ -2446,14 +2457,14 @@ void print_report(mako_args_t* args,
}
for (op = 0; op < MAX_OP; op++) {
if ((args->txnspec.ops[op][OP_COUNT] > 0 && op != OP_TRANSACTION) || op == OP_COMMIT) {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", ops_total[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", ops_total[op]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), ops_total[op]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), ops_total[op]);
}
}
}
@ -2475,14 +2486,14 @@ void print_report(mako_args_t* args,
first_op = 1;
for (op = 0; op < MAX_OP; op++) {
if (args->txnspec.ops[op][OP_COUNT] > 0 && op != OP_TRANSACTION) {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", errors_total[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", errors_total[op]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), errors_total[op]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), errors_total[op]);
}
}
}
@ -2500,7 +2511,7 @@ void print_report(mako_args_t* args,
for (op = 0; op < MAX_OP; op++) {
if (args->txnspec.ops[op][OP_COUNT] > 0 || op == OP_TRANSACTION || op == OP_COMMIT) {
if (lat_total[op]) {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", lat_samples[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", lat_samples[op]);
} else {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
}
@ -2510,7 +2521,7 @@ void print_report(mako_args_t* args,
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), lat_samples[op]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), lat_samples[op]);
}
}
}
@ -2527,14 +2538,14 @@ void print_report(mako_args_t* args,
if (lat_min[op] == -1) {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
} else {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", lat_min[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", lat_min[op]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), lat_min[op]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), lat_min[op]);
}
}
}
@ -2550,14 +2561,14 @@ void print_report(mako_args_t* args,
for (op = 0; op < MAX_OP; op++) {
if (args->txnspec.ops[op][OP_COUNT] > 0 || op == OP_TRANSACTION || op == OP_COMMIT) {
if (lat_total[op]) {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", lat_total[op] / lat_samples[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", lat_total[op] / lat_samples[op]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), lat_total[op] / lat_samples[op]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), lat_total[op] / lat_samples[op]);
}
} else {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
@ -2577,14 +2588,14 @@ void print_report(mako_args_t* args,
if (lat_max[op] == 0) {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
} else {
printf("%" STR(STATS_FIELD_WIDTH) "lu ", lat_max[op]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", lat_max[op]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), lat_max[op]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), lat_max[op]);
}
}
}
@ -2635,14 +2646,14 @@ void print_report(mako_args_t* args,
} else {
median = (dataPoints[op][num_points[op] / 2] + dataPoints[op][num_points[op] / 2 - 1]) >> 1;
}
printf("%" STR(STATS_FIELD_WIDTH) "lu ", median);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", median);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), median);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), median);
}
} else {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
@ -2665,14 +2676,14 @@ void print_report(mako_args_t* args,
}
if (lat_total[op]) {
point_95pct = ((float)(num_points[op]) * 0.95) - 1;
printf("%" STR(STATS_FIELD_WIDTH) "lu ", dataPoints[op][point_95pct]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", dataPoints[op][point_95pct]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), dataPoints[op][point_95pct]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), dataPoints[op][point_95pct]);
}
} else {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
@ -2695,14 +2706,14 @@ void print_report(mako_args_t* args,
}
if (lat_total[op]) {
point_99pct = ((float)(num_points[op]) * 0.99) - 1;
printf("%" STR(STATS_FIELD_WIDTH) "lu ", dataPoints[op][point_99pct]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", dataPoints[op][point_99pct]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), dataPoints[op][point_99pct]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), dataPoints[op][point_99pct]);
}
} else {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");
@ -2725,14 +2736,14 @@ void print_report(mako_args_t* args,
}
if (lat_total[op]) {
point_99_9pct = ((float)(num_points[op]) * 0.999) - 1;
printf("%" STR(STATS_FIELD_WIDTH) "lu ", dataPoints[op][point_99_9pct]);
printf("%" STR(STATS_FIELD_WIDTH) PRIu64 " ", dataPoints[op][point_99_9pct]);
if (fp) {
if (first_op) {
first_op = 0;
} else {
fprintf(fp, ",");
}
fprintf(fp, "\"%s\": %lu", get_ops_name(op), dataPoints[op][point_99_9pct]);
fprintf(fp, "\"%s\": %" PRIu64, get_ops_name(op), dataPoints[op][point_99_9pct]);
}
} else {
printf("%" STR(STATS_FIELD_WIDTH) "s ", "N/A");

View File

@ -67,25 +67,25 @@ void runTests(struct ResultSet* rs) {
fdb_transaction_set(tr, keys[i], KEY_SIZE, valueStr, VALUE_SIZE);
e = getSize(rs, tr, sizes + i);
checkError(e, "transaction get size", rs);
printf("size %d: %ld\n", i, sizes[i]);
printf("size %d: %" PRId64 "\n", i, sizes[i]);
i++;
fdb_transaction_set(tr, keys[i], KEY_SIZE, valueStr, VALUE_SIZE);
e = getSize(rs, tr, sizes + i);
checkError(e, "transaction get size", rs);
printf("size %d: %ld\n", i, sizes[i]);
printf("size %d: %" PRId64 "\n", i, sizes[i]);
i++;
fdb_transaction_clear(tr, keys[i], KEY_SIZE);
e = getSize(rs, tr, sizes + i);
checkError(e, "transaction get size", rs);
printf("size %d: %ld\n", i, sizes[i]);
printf("size %d: %" PRId64 "\n", i, sizes[i]);
i++;
fdb_transaction_clear_range(tr, keys[i], KEY_SIZE, keys[i + 1], KEY_SIZE);
e = getSize(rs, tr, sizes + i);
checkError(e, "transaction get size", rs);
printf("size %d: %ld\n", i, sizes[i]);
printf("size %d: %" PRId64 "\n", i, sizes[i]);
i++;
for (j = 0; j + 1 < i; j++) {

View File

@ -18,6 +18,7 @@ set(SRCS
add_flow_target(STATIC_LIBRARY NAME fdb_flow SRCS ${SRCS})
target_link_libraries(fdb_flow PUBLIC fdb_c)
target_link_libraries(fdb_flow PUBLIC fdbclient)
target_include_directories(fdb_flow PUBLIC
"${CMAKE_CURRENT_BINARY_DIR}"
"${CMAKE_CURRENT_SOURCE_DIR}"

View File

@ -154,6 +154,7 @@ endif()
set_target_properties(java_workloads PROPERTIES
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/share/foundationdb")
target_link_libraries(java_workloads PUBLIC fdb_c ${JNI_LIBRARIES})
target_link_libraries(java_workloads PRIVATE flow) # mostly for boost
target_include_directories(java_workloads PUBLIC ${JNI_INCLUDE_DIRS})
set(CMAKE_JAVA_COMPILE_FLAGS "-source" "1.8" "-target" "1.8" "-XDignore.symbol.file")
@ -228,6 +229,8 @@ if(NOT OPEN_FOR_IDE)
else()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
set(lib_destination "linux/aarch64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
set(lib_destination "linux/ppc64le")
else()
set(lib_destination "linux/amd64")
endif()

View File

@ -182,7 +182,7 @@ public class JNIUtil {
private static OS getRunningOS() {
String osname = System.getProperty("os.name").toLowerCase();
String arch = System.getProperty("os.arch");
if (!arch.equals("amd64") && !arch.equals("x86_64") && !arch.equals("aarch64")) {
if (!arch.equals("amd64") && !arch.equals("x86_64") && !arch.equals("aarch64") && !arch.equals("ppc64le")) {
throw new IllegalStateException("Unknown or unsupported arch: " + arch);
}
if (osname.startsWith("windows")) {

View File

@ -219,7 +219,7 @@ else()
endif()
if(STATIC_LINK_LIBCXX)
if (NOT USE_LIBCXX AND NOT APPLE)
add_link_options(-static-libstdc++ -static-libgcc)
add_link_options(-static-libstdc++ -static-libgcc)
endif()
endif()
# # Instruction sets we require to be supported by the CPU
@ -309,7 +309,7 @@ else()
if (PROFILE_INSTR_GENERATE)
message(FATAL_ERROR "Can't set both PROFILE_INSTR_GENERATE and PROFILE_INSTR_USE")
endif()
add_compile_options(-Wno-error=profile-instr-out-of-date)
add_compile_options(-Wno-error=profile-instr-out-of-date -Wno-error=profile-instr-unprofiled)
add_compile_options(-fprofile-instr-use=${PROFILE_INSTR_USE})
add_link_options(-fprofile-instr-use=${PROFILE_INSTR_USE})
endif()
@ -349,6 +349,9 @@ else()
add_compile_options(-march=armv8.2-a+crc+simd)
endif()
if (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
add_compile_options(-m64 -mcpu=power9 -mtune=power9 -DNO_WARN_X86_INTRINSICS)
endif()
# Check whether we can use dtrace probes
include(CheckSymbolExists)
check_symbol_exists(DTRACE_PROBE sys/sdt.h SUPPORT_DTRACE)

View File

@ -303,7 +303,9 @@ set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
"/etc/rc.d/init.d"
"/usr/lib/pkgconfig"
"/usr/lib/foundationdb"
"/usr/lib/cmake")
"/usr/lib/cmake"
"/usr/lib/foundationdb-${FDB_VERSION}/etc/foundationdb"
)
set(CPACK_RPM_DEBUGINFO_PACKAGE ${GENERATE_DEBUG_PACKAGES})
#set(CPACK_RPM_BUILD_SOURCE_FDB_INSTALL_DIRS_PREFIX /usr/src)
set(CPACK_RPM_COMPONENT_INSTALL ON)

View File

@ -1,5 +1,6 @@
add_subdirectory(fmt-8.0.1)
if(NOT WIN32)
add_subdirectory(debug_determinism)
add_subdirectory(monitoring)
add_subdirectory(TraceLogHelper)
add_subdirectory(TestHarness)

View File

@ -0,0 +1,5 @@
add_library(debug_determinism STATIC debug_determinism.cpp)
# So that we can link to libfdb_c.so. Not strictly necessary but convenient for use with our
# TRACE_PC_GUARD_INSTRUMENTATION_LIB cmake option
target_compile_options(debug_determinism PRIVATE -fPIC)

View File

@ -0,0 +1,45 @@
Utilities for debugging unseed mismatches for foundationdb simulation tests.
99/100 times the source of the nondeterminism is use of uninitialized memory and
what you want to do is build with `-DUSE_VALGRIND=ON` and run simulations under
valgrind.
Common sources of nondeterminism, and specialized tools to find them:
1. Use of uninitialized memory (use valgrind!)
1. Memory errors (use valgrind and/or asan)
1. Undefined behavior (use ubsan. You can also try _GLIBCXX_DEBUG)
If it's none of these, then it's time to try this technique. Look for:
1. Call to some kind of "get current time" function that's not in `INetwork`
1. Depending on the relative ordering of allocated memory. E.g. Using heap-allocated pointers as keys in a `std::map`.
1. Inspecting something about the current state of the system (e.g. free disk space)
1. Depending on iteration order of an unordered map
# Quickstart
Set these cmake flags
```
-DTRACE_PC_GUARD_INSTRUMENTATION_LIB=$BUILDDIR/lib/libdebug_determinism.a
```
and change `#define DEBUG_DETERMINISM 0` to `#define DEBUG_DETERMINISM 1` in
flow/Platform.h. This disables several known sources of nondeterminism that
don't affect unseeds.
For reasons I don't fully understand, it appears that sqlite exhibits some
nondeterminism if you don't add `#define SQLITE_OMIT_LOOKASIDE` to the top of
fdbserver/sqlite/sqlite3.amalgamation.c, so you probably want to do that too.
Now when you run an fdbserver simulation, it will write a file `out.bin` in the
current directory which contains the sequence of edges in the control flow graph
that were encountered during the simulation. If you rename `out.bin` to `in.bin`
and then re-run, the simulation will validate that the sequence of edges is the
same as the last run. If it's not, then the simulation will enter an infinite
loop at the first difference and print a message. Then you probably want to
attach gdb to the process and investigate from there.
You'll need to make sure you delete the `simfdb` folder before each run, because
otherwise you'll take a different codepath for deleting the `simfdb` folder at
the beginning of simulation.
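Purely as a hedged editorial sketch (this helper is hypothetical and not part of the commit above), the two guard traces could also be compared offline instead of waiting for the in-process check to hit its infinite loop. The sketch assumes each record in `in.bin`/`out.bin` is one raw native-endian `uint32_t` guard id, exactly as `__sanitizer_cov_trace_pc_guard` writes it with `fwrite`, and it reports the index of the first diverging edge.

```
#!/usr/bin/env python3
# Hypothetical offline helper (not part of this commit): compare two
# guard-edge traces and report the first point of divergence.
import struct
import sys

def read_guards(path):
    # Each record is a single 4-byte native-endian uint32_t guard id,
    # as written by __sanitizer_cov_trace_pc_guard in debug_determinism.cpp.
    with open(path, "rb") as f:
        data = f.read()
    count = len(data) // 4
    return struct.unpack("%dI" % count, data[: count * 4])

def first_divergence(ref, cur):
    for i, (a, b) in enumerate(zip(ref, cur)):
        if a != b:
            return i, a, b
    if len(ref) != len(cur):
        # One trace is a strict prefix of the other.
        return min(len(ref), len(cur)), None, None
    return None

if __name__ == "__main__":
    ref = read_guards(sys.argv[1] if len(sys.argv) > 1 else "in.bin")
    cur = read_guards(sys.argv[2] if len(sys.argv) > 2 else "out.bin")
    d = first_divergence(ref, cur)
    if d is None:
        print("traces are identical (%d edges)" % len(ref))
    else:
        print("first divergence at edge index %d: %s vs %s" % d)
```

Usage would be along the lines of `python3 compare_traces.py in.bin out.bin`; the reported index can then guide where to break when attaching gdb to the looping process.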

View File

@ -0,0 +1,52 @@
#include <stdint.h>
#include <stdio.h>
namespace {
FILE* out = nullptr;
FILE* in = nullptr;
void loop_forever() {
// Try to convince the optimizer not to optimize away this loop
static volatile uint64_t x = 0;
for (;;) {
++x;
}
}
} // namespace
// This callback is inserted by the compiler as a module constructor
// into every DSO. 'start' and 'stop' correspond to the
// beginning and end of the section with the guards for the entire
// binary (executable or DSO). The callback will be called at least
// once per DSO and may be called multiple times with the same parameters.
extern "C" void __sanitizer_cov_trace_pc_guard_init(uint32_t* start, uint32_t* stop) {
in = fopen("in.bin", "r");
out = fopen("out.bin", "w");
static uint64_t N; // Counter for the guards.
if (start == stop || *start)
return; // Initialize only once.
for (uint32_t* x = start; x < stop; x++) {
*x = ++N; // Guards should start from 1.
}
}
// This callback is inserted by the compiler on every edge in the
// control flow (some optimizations apply).
// Typically, the compiler will emit the code like this:
// if(*guard)
// __sanitizer_cov_trace_pc_guard(guard);
// But for large functions it will emit a simple call:
// __sanitizer_cov_trace_pc_guard(guard);
extern "C" void __sanitizer_cov_trace_pc_guard(uint32_t* guard) {
if (!guard) {
return;
}
fwrite(guard, 1, sizeof(*guard), out);
if (in) {
uint32_t theirs;
fread(&theirs, 1, sizeof(theirs), in);
if (*guard != theirs) {
printf("Non-determinism detected\n");
loop_forever();
}
}
}

contrib/generate_profile.sh — new executable file (37 lines)
View File

@ -0,0 +1,37 @@
#!/bin/bash
if [ $# -eq 0 ] || [ $# -gt 2 ]
then
echo "Usage: generate_profile.sh Path_Of_Foundation_Build_Directory Storage_Engine"
exit 1
fi
fdbdir=$1
storage_engine='ssd'
if [ $# -eq 2 ]
then
storage_engine=$2
fi
export LD_LIBRARY_PATH=$fdbdir/lib:$LD_LIBRARY_PATH
export FDB_CLUSTER_FILE=$fdbdir/fdb.cluster
export LLVM_PROFILE_FILE=$fdbdir/sandbox/fdb-%p.profraw
$fdbdir/bin/fdbmonitor --conffile $fdbdir/sandbox/foundationdb.conf --lockfile $fdbdir/sandbox/fdbmonitor.pid &
# This profile will be ignored
export LLVM_PROFILE_FILE=$fdbdir/sandbox/cli-%m.profraw
$fdbdir/bin/fdbcli -C $fdbdir/fdb.cluster --exec "configure new $storage_engine single"
export LLVM_PROFILE_FILE=$fdbdir/sandbox/mako-build-%m.profraw
$fdbdir/bin/mako -p 64 -t 1 --keylen 32 --vallen 16 --mode build --rows 10000 --trace --trace_format json
export LLVM_PROFILE_FILE=$fdbdir/sandbox/mako-run-%m.profraw
$fdbdir/bin/mako -p 1 -t 2 --keylen 32 --vallen 16 --mode run --rows 10000 --transaction grvg7i2gr1:48cr1:48 --seconds 60 --trace $fdbdir/sandbox/logs --trace_format json
# Shutdown fdbserver to trigger profile dumping
fdbmonitor_pid=$(cat $fdbdir/sandbox/fdbmonitor.pid)
fdbserver_pid=$(cat /proc/$fdbmonitor_pid/task/$fdbmonitor_pid/children)
gdb --batch --eval-command 'call (void)exit(0)' --pid $fdbserver_pid
# Clean up
kill -9 $fdbmonitor_pid
# Profile for server
llvm-profdata merge -output=$fdbdir/fdb.profdata $fdbdir/sandbox/fdb-*.profraw
# Profile for client
llvm-profdata merge -output=$fdbdir/mako.profdata $fdbdir/sandbox/mako-*.profraw

View File

@ -147,7 +147,7 @@ def centos_image_with_fdb_helper(versioned: bool) -> Iterator[Optional[Image]]:
container = None
image = None
try:
container = Container("centos", initd=True)
container = Container("centos:7", initd=True)
for rpm in rpms:
container.copy_to(rpm, "/opt")
container.run(["bash", "-c", "yum update -y"])
@ -237,10 +237,6 @@ def test_write(linux_container: Container, snapshot):
assert snapshot == linux_container.run(["fdbcli", "--exec", "get x"])
def test_fdbcli_help_text(linux_container: Container, snapshot):
assert snapshot == linux_container.run(["fdbcli", "--help"])
def test_execstack_permissions_libfdb_c(linux_container: Container, snapshot):
linux_container.run(["ldconfig"])
assert snapshot == linux_container.run(

View File

@ -2,8 +2,16 @@
Release Notes
#############
6.3.24
======
* Fixed a bug where get key location can overload proxies. `(PR #6453) <https://github.com/apple/foundationdb/pull/6453>`_
* Added a mechanism that can reduce the number of empty peek replies by not always returning an empty peek reply immediately. `(PR #6413) <https://github.com/apple/foundationdb/pull/6413>`_
* Enabled TLS support for Windows. `(PR #6193) <https://github.com/apple/foundationdb/pull/6193>`_
* Fixed a bug where a shard gets merged too soon. `(PR #6115) <https://github.com/apple/foundationdb/pull/6115>`_
6.3.23
======
* Added AWS v4 header support for backup. `(PR #6025) <https://github.com/apple/foundationdb/pull/6025>`_
* Fixed a bug where the remoteDCIsHealthy logic was not guarded by CC_ENABLE_WORKER_HEALTH_MONITOR, which may prevent HA failback. `(PR #6106) <https://github.com/apple/foundationdb/pull/6106>`_
* Fixed a race condition with updating the coordinated state and updating the master registration. `(PR #6088) <https://github.com/apple/foundationdb/pull/6088>`_
* Changed dbinfo broadcast to be explicitly requested by the worker registration message. `(PR #6073) <https://github.com/apple/foundationdb/pull/6073>`_

View File

@ -19,6 +19,7 @@
* limitations under the License.
*/
#include "contrib/fmt-8.0.1/include/fmt/format.h"
#include "flow/flow.h"
#include "flow/Platform.h"
#include "flow/DeterministicRandom.h"
@ -413,7 +414,7 @@ ACTOR Future<Void> logThroughput(int64_t* v, Key* next) {
loop {
state int64_t last = *v;
wait(delay(1));
printf("throughput: %ld bytes/s, next: %s\n", *v - last, printable(*next).c_str());
fmt::print("throughput: {} bytes/s, next: {}\n", *v - last, printable(*next).c_str());
}
}

View File

@ -1690,7 +1690,7 @@ ACTOR Future<Void> cleanupStatus(Reference<ReadYourWritesTransaction> tr,
readMore = true;
} catch (Error& e) {
// If doc can't be parsed or isn't alive, delete it.
TraceEvent(SevWarn, "RemovedDeadBackupLayerStatus").detail("Key", docs[i].key).error(e, true);
TraceEvent(SevWarn, "RemovedDeadBackupLayerStatus").errorUnsuppressed(e).detail("Key", docs[i].key);
tr->clear(docs[i].key);
// If limit is 1 then read more.
if (limit == 1)
@ -2754,7 +2754,7 @@ ACTOR Future<Void> queryBackup(const char* name,
reportBackupQueryError(operationId,
result,
errorMessage =
format("the specified restorable version %ld is not valid", restoreVersion));
format("the specified restorable version %lld is not valid", restoreVersion));
return Void();
}
Optional<RestorableFileSet> fileSet = wait(bc->getRestoreSet(restoreVersion, keyRangesFilter));
@ -3081,7 +3081,7 @@ static void addKeyRange(std::string optionValue, Standalone<VectorRef<KeyRangeRe
// Too many keys
default:
fprintf(stderr, "ERROR: Invalid key range identified with %ld keys", tokens.size());
fmt::print(stderr, "ERROR: Invalid key range identified with {} keys", tokens.size());
throw invalid_option_value();
break;
}
@ -3887,9 +3887,9 @@ int main(int argc, char* argv[]) {
} else {
fprintf(stderr, "ERROR: Failed to set knob option '%s': %s\n", knobName.c_str(), e.what());
TraceEvent(SevError, "FailedToSetKnob")
.error(e)
.detail("Knob", printable(knobName))
.detail("Value", printable(knobValueString))
.error(e);
.detail("Value", printable(knobValueString));
throw;
}
}

View File

@ -19,7 +19,7 @@
*/
#include "boost/lexical_cast.hpp"
#include "contrib/fmt-8.0.1/include/fmt/format.h"
#include "fdbcli/fdbcli.actor.h"
#include "fdbclient/IClientApi.h"
@ -40,7 +40,7 @@ ACTOR Future<bool> advanceVersionCommandActor(Reference<IDatabase> db, std::vect
} else {
state Version v;
int n = 0;
if (sscanf(tokens[1].toString().c_str(), "%ld%n", &v, &n) != 1 || n != tokens[1].size()) {
if (sscanf(tokens[1].toString().c_str(), "%" PRId64 "%n", &v, &n) != 1 || n != tokens[1].size()) {
printUsage(tokens[0]);
return false;
} else {
@ -53,7 +53,7 @@ ACTOR Future<bool> advanceVersionCommandActor(Reference<IDatabase> db, std::vect
tr->set(advanceVersionSpecialKey, boost::lexical_cast<std::string>(v));
wait(safeThreadFutureToFuture(tr->commit()));
} else {
printf("Current read version is %ld\n", rv);
fmt::print("Current read version is {}\n", rv);
return true;
}
} catch (Error& e) {

View File

@ -115,7 +115,7 @@ ACTOR Future<bool> changeFeedCommandActor(Database localDb, std::vector<StringRe
Version end = std::numeric_limits<Version>::max();
if (tokens.size() > 3) {
int n = 0;
if (sscanf(tokens[3].toString().c_str(), "%ld%n", &begin, &n) != 1 || n != tokens[3].size()) {
if (sscanf(tokens[3].toString().c_str(), "%" PRId64 "%n", &begin, &n) != 1 || n != tokens[3].size()) {
printUsage(tokens[0]);
return false;
}
@ -168,7 +168,7 @@ ACTOR Future<bool> changeFeedCommandActor(Database localDb, std::vector<StringRe
}
Version v;
int n = 0;
if (sscanf(tokens[3].toString().c_str(), "%ld%n", &v, &n) != 1 || n != tokens[3].size()) {
if (sscanf(tokens[3].toString().c_str(), "%" PRId64 "%n", &v, &n) != 1 || n != tokens[3].size()) {
printUsage(tokens[0]);
return false;
} else {

View File

@ -176,7 +176,7 @@ ACTOR Future<bool> configureCommandActor(Reference<IDatabase> db,
case ConfigurationResult::STORAGE_MIGRATION_DISABLED:
fprintf(stderr,
"ERROR: Storage engine type cannot be changed because "
"storage_migration_mode=disabled.\n");
"storage_migration_type=disabled.\n");
fprintf(stderr,
"Type `configure perpetual_storage_wiggle=1 storage_migration_type=gradual' to enable gradual "
"migration with the perpetual wiggle, or `configure "

View File

@ -65,13 +65,14 @@ ACTOR Future<bool> changeCoordinators(Reference<IDatabase> db, std::vector<Strin
state StringRef new_cluster_description;
state std::string auto_coordinators_str;
StringRef nameTokenBegin = LiteralStringRef("description=");
for (auto tok = tokens.begin() + 1; tok != tokens.end(); ++tok)
for (auto tok = tokens.begin() + 1; tok != tokens.end(); ++tok) {
if (tok->startsWith(nameTokenBegin)) {
new_cluster_description = tok->substr(nameTokenBegin.size());
std::copy(tok + 1, tokens.end(), tok);
tokens.resize(tokens.size() - 1);
break;
}
}
state bool automatic = tokens.size() == 2 && tokens[1] == LiteralStringRef("auto");
state Reference<ITransaction> tr = db->createTransaction();
@ -96,17 +97,32 @@ ACTOR Future<bool> changeCoordinators(Reference<IDatabase> db, std::vector<Strin
tr->set(fdb_cli::coordinatorsProcessSpecialKey, auto_coordinators_str);
} else if (tokens.size() > 1) {
state std::set<NetworkAddress> new_coordinators_addresses;
state std::vector<std::string> newAddresslist;
state std::set<Hostname> new_coordinators_hostnames;
state std::vector<std::string> newCoordinatorslist;
state std::vector<StringRef>::iterator t;
for (t = tokens.begin() + 1; t != tokens.end(); ++t) {
try {
auto const& addr = NetworkAddress::parse(t->toString());
if (new_coordinators_addresses.count(addr)) {
fprintf(stderr, "ERROR: passed redundant coordinators: `%s'\n", addr.toString().c_str());
return true;
if (Hostname::isHostname(t->toString())) {
// We do not resolve hostnames here. We commit them as is.
const auto& hostname = Hostname::parse(t->toString());
if (new_coordinators_hostnames.count(hostname)) {
fprintf(stderr,
"ERROR: passed redundant coordinators: `%s'\n",
hostname.toString().c_str());
return true;
}
new_coordinators_hostnames.insert(hostname);
newCoordinatorslist.push_back(hostname.toString());
} else {
const auto& addr = NetworkAddress::parse(t->toString());
if (new_coordinators_addresses.count(addr)) {
fprintf(
stderr, "ERROR: passed redundant coordinators: `%s'\n", addr.toString().c_str());
return true;
}
new_coordinators_addresses.insert(addr);
newCoordinatorslist.push_back(addr.toString());
}
new_coordinators_addresses.insert(addr);
newAddresslist.push_back(addr.toString());
} catch (Error& e) {
if (e.code() == error_code_connection_string_invalid) {
fprintf(
@ -116,12 +132,12 @@ ACTOR Future<bool> changeCoordinators(Reference<IDatabase> db, std::vector<Strin
throw;
}
}
std::string new_addresses_str = boost::algorithm::join(newAddresslist, ", ");
tr->set(fdb_cli::coordinatorsProcessSpecialKey, new_addresses_str);
std::string new_coordinators_str = boost::algorithm::join(newCoordinatorslist, ", ");
tr->set(fdb_cli::coordinatorsProcessSpecialKey, new_coordinators_str);
}
wait(safeThreadFutureToFuture(tr->commit()));
// commit should always fail here
// if coordinators are changed, we should get commit_unknown() error
// If the commit succeeds, the coordinators change and the commit will fail with commit_unknown_result().
ASSERT(false);
} catch (Error& e) {
state Error err(e);

View File

@ -59,7 +59,7 @@ ACTOR Future<Void> includeLocalities(Reference<IDatabase> db,
wait(safeThreadFutureToFuture(tr->commit()));
return Void();
} catch (Error& e) {
TraceEvent("IncludeLocalitiesError").error(e, true);
TraceEvent("IncludeLocalitiesError").errorUnsuppressed(e);
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
@ -99,7 +99,7 @@ ACTOR Future<Void> includeServers(Reference<IDatabase> db, std::vector<AddressEx
wait(safeThreadFutureToFuture(tr->commit()));
return Void();
} catch (Error& e) {
TraceEvent("IncludeServersError").error(e, true);
TraceEvent("IncludeServersError").errorUnsuppressed(e);
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}

View File

@ -705,12 +705,12 @@ void printStatus(StatusObjectReader statusObj,
}
}
outputString += format(
" %s log epoch: %ld begin: %ld end: %s, missing "
" %s log epoch: %lld begin: %lld end: %s, missing "
"log interfaces(id,address): %s\n",
current ? "Current" : "Old",
epoch,
beginVersion,
endVersion == invalidVersion ? "(unknown)" : format("%ld", endVersion).c_str(),
endVersion == invalidVersion ? "(unknown)" : format("%lld", endVersion).c_str(),
missing_log_interfaces.c_str());
}
}

View File

@ -1014,9 +1014,9 @@ struct CLIOptions {
} else {
fprintf(stderr, "ERROR: Failed to set knob option '%s': %s\n", knobName.c_str(), e.what());
TraceEvent(SevError, "FailedToSetKnob")
.error(e)
.detail("Knob", printable(knobName))
.detail("Value", printable(knobValueString))
.error(e);
.detail("Value", printable(knobValueString));
exit_code = FDB_EXIT_ERROR;
}
}
@ -1157,7 +1157,6 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
state bool writeMode = false;
state std::string clusterConnectString;
state std::map<Key, std::pair<Value, ClientLeaderRegInterface>> address_interface;
state FdbOptions globalOptions;
@ -1171,6 +1170,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
ClusterConnectionFile::lookupClusterFileName(opt.clusterFile);
try {
ccf = makeReference<ClusterConnectionFile>(resolvedClusterFile.first);
wait(ccf->resolveHostnames());
} catch (Error& e) {
fprintf(stderr, "%s\n", ClusterConnectionFile::getErrorString(resolvedClusterFile, e).c_str());
return 1;
@ -1615,7 +1615,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
} else {
Version v = wait(makeInterruptable(
safeThreadFutureToFuture(getTransaction(db, tr, options, intrans)->getReadVersion())));
printf("%ld\n", v);
fmt::print("{}\n", v);
}
continue;
}

View File

@ -28,6 +28,7 @@
#include "fdbclient/CoordinationInterface.h"
// Determine public IP address by calling the first coordinator.
IPAddress determinePublicIPAutomatically(ClusterConnectionString& ccs) {
try {
using namespace boost::asio;
@ -35,6 +36,7 @@ IPAddress determinePublicIPAutomatically(ClusterConnectionString& ccs) {
io_service ioService;
ip::udp::socket socket(ioService);
ccs.resolveHostnamesBlocking();
const auto& coordAddr = ccs.coordinators()[0];
const auto boostIp = coordAddr.ip.isV6() ? ip::address(ip::address_v6(coordAddr.ip.toV6()))
: ip::address(ip::address_v4(coordAddr.ip.toV4()));

View File

@ -305,9 +305,9 @@ Reference<IBackupContainer> IBackupContainer::openContainer(const std::string& u
throw;
TraceEvent m(SevWarn, "BackupContainer");
m.error(e);
m.detail("Description", "Invalid container specification. See help.");
m.detail("URL", url);
m.error(e);
if (e.code() == error_code_backup_invalid_url)
m.detail("LastOpenError", lastOpenError);
@ -360,10 +360,9 @@ ACTOR Future<std::vector<std::string>> listContainers_impl(std::string baseURL)
throw;
TraceEvent m(SevWarn, "BackupContainer");
m.error(e);
m.detail("Description", "Invalid backup container URL prefix. See help.");
m.detail("URL", baseURL);
m.error(e);
if (e.code() == error_code_backup_invalid_url)
m.detail("LastOpenError", IBackupContainer::lastOpenError);

View File

@ -1149,8 +1149,8 @@ public:
keyFile = _keyFile;
} catch (Error& e) {
TraceEvent(SevWarnAlways, "FailedToOpenEncryptionKeyFile")
.detail("FileName", encryptionKeyFileName)
.error(e);
.error(e)
.detail("FileName", encryptionKeyFileName);
throw e;
}
int bytesRead = wait(keyFile->read(cipherKey->data(), cipherKey->size(), 0));
@ -1377,8 +1377,8 @@ ACTOR static Future<KeyRange> getSnapshotFileKeyRange_impl(Reference<BackupConta
e.code() == error_code_timed_out || e.code() == error_code_lookup_failed) {
// blob http request failure, retry
TraceEvent(SevWarnAlways, "BackupContainerGetSnapshotFileKeyRangeConnectionFailure")
.detail("Retries", ++readFileRetries)
.error(e);
.error(e)
.detail("Retries", ++readFileRetries);
wait(delayJittered(0.1));
} else {
TraceEvent(SevError, "BackupContainerGetSnapshotFileKeyRangeUnexpectedError").error(e);
@ -1549,9 +1549,9 @@ Reference<BackupContainerFileSystem> BackupContainerFileSystem::openContainerFS(
throw;
TraceEvent m(SevWarn, "BackupContainer");
m.error(e);
m.detail("Description", "Invalid container specification. See help.");
m.detail("URL", url);
m.error(e);
if (e.code() == error_code_backup_invalid_url)
m.detail("LastOpenError", lastOpenError);

View File

@ -86,6 +86,8 @@ void ClientKnobs::initialize(Randomize randomize) {
init( LOCATION_CACHE_EVICTION_SIZE, 600000 );
init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3;
init( LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD, 60 );
init( LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL, 60 );
init( GET_RANGE_SHARD_LIMIT, 2 );
init( WARM_RANGE_SHARD_LIMIT, 100 );

View File

@ -86,6 +86,8 @@ public:
// When locationCache in DatabaseContext gets to be this size, items will be evicted
int LOCATION_CACHE_EVICTION_SIZE;
int LOCATION_CACHE_EVICTION_SIZE_SIM;
double LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD;
double LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL;
int GET_RANGE_SHARD_LIMIT;
int WARM_RANGE_SHARD_LIMIT;

View File

@ -27,7 +27,7 @@ ConfigKey ConfigKeyRef::decodeKey(KeyRef const& key) {
try {
tuple = Tuple::unpack(key);
} catch (Error& e) {
TraceEvent(SevWarnAlways, "FailedToUnpackConfigKey").detail("Key", printable(key)).error(e);
TraceEvent(SevWarnAlways, "FailedToUnpackConfigKey").error(e).detail("Key", printable(key));
throw invalid_config_db_key();
}
if (tuple.size() != 2) {
@ -96,7 +96,7 @@ public:
struct ToStringFunc {
std::string operator()(int v) const { return format("int:%d", v); }
std::string operator()(int64_t v) const { return format("int64_t:%ld", v); }
std::string operator()(int64_t v) const { return format("int64_t:%lld", v); }
std::string operator()(bool v) const { return format("bool:%d", v); }
std::string operator()(ValueRef v) const { return "string:" + v.toString(); }
std::string operator()(double v) const { return format("double:%lf", v); }

View File

@ -58,13 +58,28 @@ struct ClientLeaderRegInterface {
// - There is no address present more than once
class ClusterConnectionString {
public:
enum ConnectionStringStatus { RESOLVED, RESOLVING, UNRESOLVED };
ClusterConnectionString() {}
ClusterConnectionString(const std::string& connStr);
ClusterConnectionString(const std::vector<NetworkAddress>& coordinators, Key key);
ClusterConnectionString(const std::vector<Hostname>& hosts, Key key);
ClusterConnectionString(const ClusterConnectionString& rhs) { operator=(rhs); }
ClusterConnectionString& operator=(const ClusterConnectionString& rhs) {
// Copy everything except AsyncTrigger resolveFinish.
status = rhs.status;
coords = rhs.coords;
hostnames = rhs.hostnames;
networkAddressToHostname = rhs.networkAddressToHostname;
key = rhs.key;
keyDesc = rhs.keyDesc;
connectionString = rhs.connectionString;
return *this;
}
std::vector<NetworkAddress> const& coordinators() const { return coords; }
void addResolved(Hostname hostname, NetworkAddress address) {
void addResolved(const Hostname& hostname, const NetworkAddress& address) {
coords.push_back(address);
networkAddressToHostname.emplace(address, hostname);
}
@ -78,16 +93,20 @@ public:
// This one should only be used when resolving asynchronously is impossible. For all other cases, resolveHostnames()
// should be preferred.
void resolveHostnamesBlocking();
void resetToUnresolved();
// This function derives the member connectionString from the current key, coordinators and hostnames.
void resetConnectionString();
bool hasUnresolvedHostnames = false;
void resetToUnresolved();
void parseKey(const std::string& key);
ConnectionStringStatus status = RESOLVED;
AsyncTrigger resolveFinish;
std::vector<NetworkAddress> coords;
std::vector<Hostname> hostnames;
std::unordered_map<NetworkAddress, Hostname> networkAddressToHostname;
private:
void parseConnString();
void parseKey(const std::string& key);
std::unordered_map<NetworkAddress, Hostname> networkAddressToHostname;
Key key, keyDesc;
std::string connectionString;
};
@ -139,7 +158,7 @@ public:
// Signals to the connection record that it was successfully used to connect to a cluster.
void notifyConnected();
bool hasUnresolvedHostnames() const;
ClusterConnectionString::ConnectionStringStatus connectionStringStatus() const;
Future<Void> resolveHostnames();
// This one should only be used when resolving asynchronously is impossible. For all other cases, resolveHostnames()
// should be preferred.

View File

@ -2142,7 +2142,7 @@ struct StartFullBackupTaskFunc : TaskFuncBase {
wait(tr->commit());
break;
} catch (Error& e) {
TraceEvent("SetDestUidOrBeginVersionError").error(e, true);
TraceEvent("SetDestUidOrBeginVersionError").errorUnsuppressed(e);
wait(tr->onError(e));
}
}
@ -2907,7 +2907,7 @@ public:
TraceEvent("DBA_Abort").detail("CommitVersion", tr->getCommittedVersion());
break;
} catch (Error& e) {
TraceEvent("DBA_AbortError").error(e, true);
TraceEvent("DBA_AbortError").errorUnsuppressed(e);
wait(tr->onError(e));
}
}

View File

@ -198,6 +198,11 @@ struct ChangeFeedData : ReferenceCounted<ChangeFeedData> {
ChangeFeedData() : notAtLatest(1) {}
};
struct EndpointFailureInfo {
double startTime = 0;
double lastRefreshTime = 0;
};
class DatabaseContext : public ReferenceCounted<DatabaseContext>, public FastAllocated<DatabaseContext>, NonCopyable {
public:
static DatabaseContext* allocateOnForeignThread() {
@ -241,6 +246,14 @@ public:
void invalidateCache(const KeyRef&, Reverse isBackward = Reverse::False);
void invalidateCache(const KeyRangeRef&);
// Records that `endpoint` is failed on a healthy server.
void setFailedEndpointOnHealthyServer(const Endpoint& endpoint);
// Updates `endpoint` refresh time if the `endpoint` is a failed endpoint. If not, this does nothing.
void updateFailedEndpointRefreshTime(const Endpoint& endpoint);
Optional<EndpointFailureInfo> getEndpointFailureInfo(const Endpoint& endpoint);
void clearFailedEndpointOnHealthyServer(const Endpoint& endpoint);
bool sampleReadTags() const;
bool sampleOnCost(uint64_t cost) const;
@ -394,6 +407,7 @@ public:
// Cache of location information
int locationCacheSize;
CoalescedKeyRangeMap<Reference<LocationInfo>> locationCache;
std::unordered_map<Endpoint, EndpointFailureInfo> failedEndpointsOnHealthyServersInfo;
std::map<UID, StorageServerInfo*> server_interf;
std::map<UID, BlobWorkerInterface> blobWorker_interf; // blob workers don't change endpoints for the same ID

View File

@ -1204,10 +1204,12 @@ struct ReadBlobGranuleContext {
struct StorageMetadataType {
constexpr static FileIdentifier file_identifier = 732123;
// when the SS is initialized
uint64_t createdTime; // comes from Platform::timer_int()
uint64_t createdTime; // comes from currentTime()
StorageMetadataType() : createdTime(0) {}
StorageMetadataType(uint64_t t) : createdTime(t) {}
static uint64_t currentTime() { return g_network->timer() * 1e9; }
// To change this serialization, ProtocolVersion::StorageMetadata must be updated, and downgrades need
// to be considered
template <class Ar>

View File

@ -87,7 +87,7 @@ std::string secondsToTimeFormat(int64_t seconds) {
else if (seconds >= 60)
return format("%.2f minute(s)", seconds / 60.0);
else
return format("%ld second(s)", seconds);
return format("%lld second(s)", seconds);
}
const Key FileBackupAgent::keyLastRestorable = LiteralStringRef("last_restorable");
@ -4407,9 +4407,9 @@ public:
break;
} catch (Error& e) {
TraceEvent(numTries > 50 ? SevError : SevInfo, "FastRestoreToolSubmitRestoreRequestsMayFail")
.error(e)
.detail("Reason", "DB is not properly locked")
.detail("ExpectedLockID", randomUID)
.error(e);
.detail("ExpectedLockID", randomUID);
numTries++;
wait(tr->onError(e));
}
@ -4443,8 +4443,8 @@ public:
break;
} catch (Error& e) {
TraceEvent(numTries > 50 ? SevError : SevInfo, "FastRestoreToolSubmitRestoreRequestsRetry")
.detail("RestoreIndex", restoreIndex)
.error(e);
.error(e)
.detail("RestoreIndex", restoreIndex);
numTries++;
wait(tr->onError(e));
}
@ -5183,7 +5183,7 @@ public:
else
statusText += "The initial snapshot is still running.\n";
statusText += format("\nDetails:\n LogBytes written - %ld\n RangeBytes written - %ld\n "
statusText += format("\nDetails:\n LogBytes written - %lld\n RangeBytes written - %lld\n "
"Last complete log version and timestamp - %s, %s\n "
"Last complete snapshot version and timestamp - %s, %s\n "
"Current Snapshot start version and timestamp - %s, %s\n "
@ -5800,9 +5800,9 @@ ACTOR static Future<Void> transformDatabaseContents(Database cx,
break;
} catch (Error& e) {
TraceEvent("FastRestoreWorkloadTransformDatabaseContentsGetAllKeys")
.error(e)
.detail("Index", i)
.detail("RestoreRange", restoreRanges[i])
.error(e);
.detail("RestoreRange", restoreRanges[i]);
oldData = Standalone<VectorRef<KeyValueRef>>(); // clear the vector
wait(tr.onError(e));
}

View File

@ -448,6 +448,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
err = http_bad_request_id();
TraceEvent(SevError, "HTTPRequestFailedIDMismatch")
.error(err.get())
.detail("DebugID", conn->getDebugID())
.detail("RemoteAddress", conn->getPeerAddress())
.detail("Verb", verb)
@ -456,8 +457,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
.detail("ResponseCode", r->code)
.detail("ResponseContentLen", r->contentLen)
.detail("RequestIDSent", requestID)
.detail("RequestIDReceived", responseID)
.error(err.get());
.detail("RequestIDReceived", responseID);
}
}
@ -501,7 +501,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest(Reference<IConnection> conn,
contentLen,
total_sent);
}
event.error(e);
event.errorUnsuppressed(e);
throw;
}
}

View File

@ -169,7 +169,7 @@ std::map<std::string, std::string> configForToken(std::string const& mode) {
} else if (value == "gradual") {
type = StorageMigrationType::GRADUAL;
} else {
printf("Error: Only disabled|aggressive|gradual are valid for storage_migration_mode.\n");
printf("Error: Only disabled|aggressive|gradual are valid for storage_migration_type.\n");
return out;
}
out[p + key] = format("%d", type);
@ -772,7 +772,7 @@ ACTOR Future<std::vector<NetworkAddress>> getCoordinators(Database cx) {
ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
Reference<IQuorumChange> change,
std::vector<NetworkAddress>* desiredCoordinators) {
ClusterConnectionString* conn) {
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
@ -783,44 +783,47 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
return CoordinatorsResult::BAD_DATABASE_STATE; // Someone deleted this key entirely?
state ClusterConnectionString old(currentKey.get().toString());
wait(old.resolveHostnames());
if (tr->getDatabase()->getConnectionRecord() &&
old.clusterKeyName().toString() !=
tr->getDatabase()->getConnectionRecord()->getConnectionString().clusterKeyName())
return CoordinatorsResult::BAD_DATABASE_STATE; // Someone changed the "name" of the database??
state CoordinatorsResult result = CoordinatorsResult::SUCCESS;
if (!desiredCoordinators->size()) {
std::vector<NetworkAddress> _desiredCoordinators = wait(change->getDesiredCoordinators(
if (!conn->coords.size()) {
std::vector<NetworkAddress> desiredCoordinatorAddresses = wait(change->getDesiredCoordinators(
tr,
old.coordinators(),
Reference<ClusterConnectionMemoryRecord>(new ClusterConnectionMemoryRecord(old)),
result));
*desiredCoordinators = _desiredCoordinators;
conn->coords = desiredCoordinatorAddresses;
}
if (result != CoordinatorsResult::SUCCESS)
return result;
if (!desiredCoordinators->size())
if (!conn->coordinators().size())
return CoordinatorsResult::INVALID_NETWORK_ADDRESSES;
std::sort(desiredCoordinators->begin(), desiredCoordinators->end());
std::sort(conn->coords.begin(), conn->coords.end());
std::sort(conn->hostnames.begin(), conn->hostnames.end());
std::string newName = change->getDesiredClusterKeyName();
if (newName.empty())
newName = old.clusterKeyName().toString();
if (old.coordinators() == *desiredCoordinators && old.clusterKeyName() == newName)
if (old.coordinators() == conn->coordinators() && old.clusterKeyName() == newName)
return CoordinatorsResult::SAME_NETWORK_ADDRESSES;
state ClusterConnectionString conn(*desiredCoordinators,
StringRef(newName + ':' + deterministicRandom()->randomAlphaNumeric(32)));
std::string key(newName + ':' + deterministicRandom()->randomAlphaNumeric(32));
conn->parseKey(key);
conn->resetConnectionString();
if (g_network->isSimulated()) {
int i = 0;
int protectedCount = 0;
while ((protectedCount < ((desiredCoordinators->size() / 2) + 1)) && (i < desiredCoordinators->size())) {
auto process = g_simulator.getProcessByAddress((*desiredCoordinators)[i]);
while ((protectedCount < ((conn->coordinators().size() / 2) + 1)) && (i < conn->coordinators().size())) {
auto process = g_simulator.getProcessByAddress(conn->coordinators()[i]);
auto addresses = process->addresses;
if (!process->isReliable()) {
@ -832,14 +835,14 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
if (addresses.secondaryAddress.present()) {
g_simulator.protectedAddresses.insert(process->addresses.secondaryAddress.get());
}
TraceEvent("ProtectCoordinator").detail("Address", (*desiredCoordinators)[i]).backtrace();
TraceEvent("ProtectCoordinator").detail("Address", conn->coordinators()[i]).backtrace();
protectedCount++;
i++;
}
}
std::vector<Future<Optional<LeaderInfo>>> leaderServers;
ClientCoordinators coord(Reference<ClusterConnectionMemoryRecord>(new ClusterConnectionMemoryRecord(conn)));
ClientCoordinators coord(Reference<ClusterConnectionMemoryRecord>(new ClusterConnectionMemoryRecord(*conn)));
leaderServers.reserve(coord.clientLeaderServers.size());
for (int i = 0; i < coord.clientLeaderServers.size(); i++)
@ -851,7 +854,7 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
when(wait(waitForAll(leaderServers))) {}
when(wait(delay(5.0))) { return CoordinatorsResult::COORDINATOR_UNREACHABLE; }
}
tr->set(coordinatorsKey, conn.toString());
tr->set(coordinatorsKey, conn->toString());
return Optional<CoordinatorsResult>();
}
@ -1273,7 +1276,7 @@ ACTOR Future<Void> excludeServers(Database cx, std::vector<AddressExclusion> ser
wait(ryw.commit());
return Void();
} catch (Error& e) {
TraceEvent("ExcludeServersError").error(e, true);
TraceEvent("ExcludeServersError").errorUnsuppressed(e);
wait(ryw.onError(e));
}
}
@ -1285,7 +1288,7 @@ ACTOR Future<Void> excludeServers(Database cx, std::vector<AddressExclusion> ser
wait(tr.commit());
return Void();
} catch (Error& e) {
TraceEvent("ExcludeServersError").error(e, true);
TraceEvent("ExcludeServersError").errorUnsuppressed(e);
wait(tr.onError(e));
}
}
@ -1336,7 +1339,7 @@ ACTOR Future<Void> excludeLocalities(Database cx, std::unordered_set<std::string
wait(ryw.commit());
return Void();
} catch (Error& e) {
TraceEvent("ExcludeLocalitiesError").error(e, true);
TraceEvent("ExcludeLocalitiesError").errorUnsuppressed(e);
wait(ryw.onError(e));
}
}
@ -1348,7 +1351,7 @@ ACTOR Future<Void> excludeLocalities(Database cx, std::unordered_set<std::string
wait(tr.commit());
return Void();
} catch (Error& e) {
TraceEvent("ExcludeLocalitiesError").error(e, true);
TraceEvent("ExcludeLocalitiesError").errorUnsuppressed(e);
wait(tr.onError(e));
}
}
@ -1392,7 +1395,7 @@ ACTOR Future<Void> includeServers(Database cx, std::vector<AddressExclusion> ser
wait(ryw.commit());
return Void();
} catch (Error& e) {
TraceEvent("IncludeServersError").error(e, true);
TraceEvent("IncludeServersError").errorUnsuppressed(e);
wait(ryw.onError(e));
}
}
@ -1449,7 +1452,7 @@ ACTOR Future<Void> includeServers(Database cx, std::vector<AddressExclusion> ser
wait(tr.commit());
return Void();
} catch (Error& e) {
TraceEvent("IncludeServersError").error(e, true);
TraceEvent("IncludeServersError").errorUnsuppressed(e);
wait(tr.onError(e));
}
}
@ -1487,7 +1490,7 @@ ACTOR Future<Void> includeLocalities(Database cx, std::vector<std::string> local
wait(ryw.commit());
return Void();
} catch (Error& e) {
TraceEvent("IncludeLocalitiesError").error(e, true);
TraceEvent("IncludeLocalitiesError").errorUnsuppressed(e);
wait(ryw.onError(e));
}
}
@ -1535,7 +1538,7 @@ ACTOR Future<Void> includeLocalities(Database cx, std::vector<std::string> local
wait(tr.commit());
return Void();
} catch (Error& e) {
TraceEvent("IncludeLocalitiesError").error(e, true);
TraceEvent("IncludeLocalitiesError").errorUnsuppressed(e);
wait(tr.onError(e));
}
}
@ -1907,7 +1910,7 @@ ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UI
TraceEvent("SnapCreateSucceeded").detail("snapUID", snapUID);
return Void();
} catch (Error& e) {
TraceEvent(SevWarn, "SnapCreateFailed").detail("snapUID", snapUID).error(e);
TraceEvent(SevWarn, "SnapCreateFailed").error(e).detail("snapUID", snapUID);
throw;
}
}
@ -2198,7 +2201,7 @@ ACTOR Future<Void> advanceVersion(Database cx, Version v) {
tr.set(minRequiredCommitVersionKey, BinaryWriter::toValue(v + 1, Unversioned()));
wait(tr.commit());
} else {
printf("Current read version is %ld\n", rv);
fmt::print("Current read version is {}\n", rv);
return Void();
}
} catch (Error& e) {

View File

@ -56,7 +56,7 @@ struct IQuorumChange : ReferenceCounted<IQuorumChange> {
// Change to use the given set of coordination servers
ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
Reference<IQuorumChange> change,
std::vector<NetworkAddress>* desiredCoordinators);
ClusterConnectionString* conn);
ACTOR Future<CoordinatorsResult> changeQuorum(Database cx, Reference<IQuorumChange> change);
Reference<IQuorumChange> autoQuorumChange(int desired = -1);
Reference<IQuorumChange> noQuorumChange();

View File

@ -77,8 +77,8 @@ void IClusterConnectionRecord::setPersisted() {
connectionStringNeedsPersisted = false;
}
bool IClusterConnectionRecord::hasUnresolvedHostnames() const {
return cs.hasUnresolvedHostnames;
ClusterConnectionString::ConnectionStringStatus IClusterConnectionRecord::connectionStringStatus() const {
return cs.status;
}
Future<Void> IClusterConnectionRecord::resolveHostnames() {
@ -98,39 +98,56 @@ std::string ClusterConnectionString::getErrorString(std::string const& source, E
}
ACTOR Future<Void> resolveHostnamesImpl(ClusterConnectionString* self) {
std::vector<Future<Void>> fs;
for (auto const& hostName : self->hostnames) {
fs.push_back(map(INetworkConnections::net()->resolveTCPEndpoint(hostName.host, hostName.service),
[=](std::vector<NetworkAddress> const& addresses) -> Void {
NetworkAddress addr = addresses[deterministicRandom()->randomInt(0, addresses.size())];
addr.flags = 0; // Reset the parsed address to public
addr.fromHostname = NetworkAddressFromHostname::True;
if (hostName.isTLS) {
addr.flags |= NetworkAddress::FLAG_TLS;
}
self->addResolved(hostName, addr);
return Void();
}));
loop {
if (self->status == ClusterConnectionString::UNRESOLVED) {
self->status = ClusterConnectionString::RESOLVING;
std::vector<Future<Void>> fs;
for (auto const& hostname : self->hostnames) {
fs.push_back(map(INetworkConnections::net()->resolveTCPEndpoint(hostname.host, hostname.service),
[=](std::vector<NetworkAddress> const& addresses) -> Void {
NetworkAddress address =
addresses[deterministicRandom()->randomInt(0, addresses.size())];
address.flags = 0; // Reset the parsed address to public
address.fromHostname = NetworkAddressFromHostname::True;
if (hostname.isTLS) {
address.flags |= NetworkAddress::FLAG_TLS;
}
self->addResolved(hostname, address);
return Void();
}));
}
wait(waitForAll(fs));
std::sort(self->coords.begin(), self->coords.end());
if (std::unique(self->coords.begin(), self->coords.end()) != self->coords.end()) {
self->status = ClusterConnectionString::UNRESOLVED;
self->resolveFinish.trigger();
throw connection_string_invalid();
}
self->status = ClusterConnectionString::RESOLVED;
self->resolveFinish.trigger();
break;
} else if (self->status == ClusterConnectionString::RESOLVING) {
wait(self->resolveFinish.onTrigger());
if (self->status == ClusterConnectionString::RESOLVED) {
break;
}
// Otherwise, another thread's resolution attempt failed, so go back to the loop and try to resolve again.
} else {
// status is RESOLVED, nothing to do.
break;
}
}
wait(waitForAll(fs));
std::sort(self->coords.begin(), self->coords.end());
if (std::unique(self->coords.begin(), self->coords.end()) != self->coords.end()) {
throw connection_string_invalid();
}
self->hasUnresolvedHostnames = false;
return Void();
}
Future<Void> ClusterConnectionString::resolveHostnames() {
if (!hasUnresolvedHostnames) {
return Void();
} else {
return resolveHostnamesImpl(this);
}
return resolveHostnamesImpl(this);
}
void ClusterConnectionString::resolveHostnamesBlocking() {
if (hasUnresolvedHostnames) {
if (status != RESOLVED) {
status = RESOLVING;
for (auto const& hostname : hostnames) {
std::vector<NetworkAddress> addresses =
INetworkConnections::net()->resolveTCPEndpointBlocking(hostname.host, hostname.service);
@ -140,14 +157,14 @@ void ClusterConnectionString::resolveHostnamesBlocking() {
if (hostname.isTLS) {
address.flags |= NetworkAddress::FLAG_TLS;
}
coords.push_back(address);
networkAddressToHostname.emplace(address, hostname);
addResolved(hostname, address);
}
std::sort(coords.begin(), coords.end());
if (std::unique(coords.begin(), coords.end()) != coords.end()) {
status = UNRESOLVED;
throw connection_string_invalid();
}
hasUnresolvedHostnames = false;
status = RESOLVED;
}
}
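
The hunks above replace the boolean hasUnresolvedHostnames with a three-state ConnectionStringStatus (UNRESOLVED, RESOLVING, RESOLVED). The following is a minimal stand-alone C++ sketch of that state machine, not part of the commit; the ConnString struct and the fake DNS lookup are illustrative stand-ins, while the real transitions live in ClusterConnectionString::resolveHostnames*() and use flow futures.

#include <algorithm>
#include <cassert>
#include <stdexcept>
#include <string>
#include <vector>

enum class ResolveStatus { UNRESOLVED, RESOLVING, RESOLVED };

struct ConnString {
    ResolveStatus status = ResolveStatus::UNRESOLVED;
    std::vector<std::string> hostnames; // unresolved "host:port" entries
    std::vector<std::string> coords;    // resolved "ip:port" entries

    void resolveOnce() {
        if (status != ResolveStatus::UNRESOLVED)
            return; // already RESOLVING or RESOLVED; nothing to do in this sketch
        status = ResolveStatus::RESOLVING;
        for (const auto& h : hostnames)
            coords.push_back("10.0.0.1:" + h.substr(h.find(':') + 1)); // fake DNS lookup
        std::sort(coords.begin(), coords.end());
        if (std::unique(coords.begin(), coords.end()) != coords.end()) {
            // Two hostnames resolved to the same address: roll back so a later attempt can retry.
            coords.clear();
            status = ResolveStatus::UNRESOLVED;
            throw std::runtime_error("connection_string_invalid");
        }
        status = ResolveStatus::RESOLVED;
    }
};

int main() {
    ConnString cs;
    cs.hostnames = { "coord1.example.com:4500", "coord2.example.com:4501" };
    cs.resolveOnce();
    assert(cs.status == ResolveStatus::RESOLVED && cs.coords.size() == 2);
    return 0;
}
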
@ -156,11 +173,15 @@ void ClusterConnectionString::resetToUnresolved() {
coords.clear();
hostnames.clear();
networkAddressToHostname.clear();
hasUnresolvedHostnames = true;
status = UNRESOLVED;
parseConnString();
}
}
void ClusterConnectionString::resetConnectionString() {
connectionString = toString();
}
void ClusterConnectionString::parseConnString() {
// Split on '@' into key@addrs
int pAt = connectionString.find_first_of('@');
@ -184,7 +205,9 @@ void ClusterConnectionString::parseConnString() {
}
p = pComma + 1;
}
hasUnresolvedHostnames = hostnames.size() > 0;
if (hostnames.size() > 0) {
status = UNRESOLVED;
}
ASSERT((coords.size() + hostnames.size()) > 0);
std::sort(coords.begin(), coords.end());
@ -256,7 +279,7 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/hostnames") {
{
input = "asdf:2345@localhost:1234";
ClusterConnectionString cs(input);
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 1);
ASSERT(input == cs.toString());
}
@ -264,7 +287,7 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/hostnames") {
{
input = "0xxdeadbeef:100100100@localhost:34534,host-name:23443";
ClusterConnectionString cs(input);
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(input == cs.toString());
}
@ -277,7 +300,7 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/hostnames") {
commented += "# asdfasdf ##";
ClusterConnectionString cs(commented);
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(input == cs.toString());
}
@ -290,7 +313,7 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/hostnames") {
commented += "# asdfasdf ##";
ClusterConnectionString cs(commented);
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(input == cs.toString());
}
@ -314,16 +337,16 @@ TEST_CASE("/fdbclient/MonitorLeader/ConnectionString") {
INetworkConnections::net()->addMockTCPEndpoint(hn2, port2, { address2 });
state ClusterConnectionString cs(hostnames, LiteralStringRef("TestCluster:0"));
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(cs.coordinators().size() == 0);
wait(cs.resolveHostnames());
ASSERT(!cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::RESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(cs.coordinators().size() == 2);
ASSERT(cs.toString() == connectionString);
cs.resetToUnresolved();
ASSERT(cs.hasUnresolvedHostnames);
ASSERT(cs.status == ClusterConnectionString::UNRESOLVED);
ASSERT(cs.hostnames.size() == 2);
ASSERT(cs.coordinators().size() == 0);
ASSERT(cs.toString() == connectionString);
@ -422,29 +445,17 @@ TEST_CASE("/fdbclient/MonitorLeader/parseConnectionString/fuzz") {
}
ClusterConnectionString::ClusterConnectionString(const std::vector<NetworkAddress>& servers, Key key)
: coords(servers) {
: status(RESOLVED), coords(servers) {
std::string keyString = key.toString();
parseKey(keyString);
connectionString = keyString + "@";
for (int i = 0; i < coords.size(); i++) {
if (i) {
connectionString += ',';
}
connectionString += coords[i].toString();
}
resetConnectionString();
}
ClusterConnectionString::ClusterConnectionString(const std::vector<Hostname>& hosts, Key key)
: hasUnresolvedHostnames(true), hostnames(hosts) {
: status(UNRESOLVED), hostnames(hosts) {
std::string keyString = key.toString();
parseKey(keyString);
connectionString = keyString + "@";
for (int i = 0; i < hostnames.size(); i++) {
if (i) {
connectionString += ',';
}
connectionString += hostnames[i].toString();
}
resetConnectionString();
}
void ClusterConnectionString::parseKey(const std::string& key) {
@ -497,6 +508,7 @@ std::string ClusterConnectionString::toString() const {
}
ClientCoordinators::ClientCoordinators(Reference<IClusterConnectionRecord> ccr) : ccr(ccr) {
ASSERT(ccr->connectionStringStatus() == ClusterConnectionString::RESOLVED);
ClusterConnectionString cs = ccr->getConnectionString();
for (auto s = cs.coordinators().begin(); s != cs.coordinators().end(); ++s)
clientLeaderServers.push_back(ClientLeaderRegInterface(*s));
@ -525,15 +537,44 @@ ClientLeaderRegInterface::ClientLeaderRegInterface(INetwork* local) {
// A nominee is the worker that a given coordinator currently considers the leader.
// This function contacts the coordinator `coord` to ask who its nominee is.
// Note: for coordinators whose NetworkAddress was parsed out of a hostname, a connection failure will cause this actor
// to throw a `coordinators_changed()` error
ACTOR Future<Void> monitorNominee(Key key,
ClientLeaderRegInterface coord,
AsyncTrigger* nomineeChange,
Optional<LeaderInfo>* info) {
Optional<LeaderInfo>* info,
Optional<Hostname> hostname = Optional<Hostname>()) {
loop {
state Optional<LeaderInfo> li =
wait(retryBrokenPromise(coord.getLeader,
GetLeaderRequest(key, info->present() ? info->get().changeID : UID()),
TaskPriority::CoordinationReply));
state Optional<LeaderInfo> li;
if (coord.getLeader.getEndpoint().getPrimaryAddress().fromHostname) {
state ErrorOr<Optional<LeaderInfo>> rep =
wait(coord.getLeader.tryGetReply(GetLeaderRequest(key, info->present() ? info->get().changeID : UID()),
TaskPriority::CoordinationReply));
if (rep.isError()) {
// Connecting to the nominee failed, most likely due to a connection failure.
TraceEvent("MonitorNomineeError")
.error(rep.getError())
.detail("Hostname", hostname.present() ? hostname.get().toString() : "UnknownHostname")
.detail("OldAddr", coord.getLeader.getEndpoint().getPrimaryAddress().toString());
if (rep.getError().code() == error_code_request_maybe_delivered) {
// Delay 50 milliseconds to prevent a tight resolving loop caused by an outdated DNS cache
wait(delay(0.05));
throw coordinators_changed();
} else {
throw rep.getError();
}
} else if (rep.present()) {
li = rep.get();
}
} else {
Optional<LeaderInfo> tmp =
wait(retryBrokenPromise(coord.getLeader,
GetLeaderRequest(key, info->present() ? info->get().changeID : UID()),
TaskPriority::CoordinationReply));
li = tmp;
}
wait(Future<Void>(Void())); // Make sure we weren't cancelled
TraceEvent("GetLeaderReply")
@ -608,53 +649,74 @@ Optional<std::pair<LeaderInfo, bool>> getLeader(const std::vector<Optional<Leade
ACTOR Future<MonitorLeaderInfo> monitorLeaderOneGeneration(Reference<IClusterConnectionRecord> connRecord,
Reference<AsyncVar<Value>> outSerializedLeaderInfo,
MonitorLeaderInfo info) {
state ClientCoordinators coordinators(info.intermediateConnRecord);
state AsyncTrigger nomineeChange;
state std::vector<Optional<LeaderInfo>> nominees;
state Future<Void> allActors;
nominees.resize(coordinators.clientLeaderServers.size());
std::vector<Future<Void>> actors;
// Ask each coordinator whether it considers the worker a leader (i.e. its leader nominee).
actors.reserve(coordinators.clientLeaderServers.size());
for (int i = 0; i < coordinators.clientLeaderServers.size(); i++)
actors.push_back(
monitorNominee(coordinators.clusterKey, coordinators.clientLeaderServers[i], &nomineeChange, &nominees[i]));
allActors = waitForAll(actors);
loop {
Optional<std::pair<LeaderInfo, bool>> leader = getLeader(nominees);
TraceEvent("MonitorLeaderChange")
.detail("NewLeader", leader.present() ? leader.get().first.changeID : UID(1, 1));
if (leader.present()) {
if (leader.get().first.forward) {
TraceEvent("MonitorLeaderForwarding")
.detail("NewConnStr", leader.get().first.serializedInfo.toString())
.detail("OldConnStr", info.intermediateConnRecord->getConnectionString().toString())
.trackLatest("MonitorLeaderForwarding");
info.intermediateConnRecord = connRecord->makeIntermediateRecord(
ClusterConnectionString(leader.get().first.serializedInfo.toString()));
return info;
}
if (connRecord != info.intermediateConnRecord) {
if (!info.hasConnected) {
TraceEvent(SevWarnAlways, "IncorrectClusterFileContentsAtConnection")
.detail("ClusterFile", connRecord->toString())
.detail("StoredConnectionString", connRecord->getConnectionString().toString())
.detail("CurrentConnectionString",
info.intermediateConnRecord->getConnectionString().toString());
}
connRecord->setAndPersistConnectionString(info.intermediateConnRecord->getConnectionString());
info.intermediateConnRecord = connRecord;
}
wait(connRecord->resolveHostnames());
wait(info.intermediateConnRecord->resolveHostnames());
state ClientCoordinators coordinators(info.intermediateConnRecord);
state AsyncTrigger nomineeChange;
state std::vector<Optional<LeaderInfo>> nominees;
state Future<Void> allActors;
info.hasConnected = true;
connRecord->notifyConnected();
nominees.resize(coordinators.clientLeaderServers.size());
outSerializedLeaderInfo->set(leader.get().first.serializedInfo);
state std::vector<Future<Void>> actors;
// Ask each coordinator whether it considers the worker a leader (i.e. its leader nominee).
actors.reserve(coordinators.clientLeaderServers.size());
for (int i = 0; i < coordinators.clientLeaderServers.size(); i++) {
Optional<Hostname> hostname;
auto r = connRecord->getConnectionString().networkAddressToHostname.find(
coordinators.clientLeaderServers[i].getLeader.getEndpoint().getPrimaryAddress());
if (r != connRecord->getConnectionString().networkAddressToHostname.end()) {
hostname = r->second;
}
actors.push_back(monitorNominee(
coordinators.clusterKey, coordinators.clientLeaderServers[i], &nomineeChange, &nominees[i], hostname));
}
allActors = waitForAll(actors);
loop {
Optional<std::pair<LeaderInfo, bool>> leader = getLeader(nominees);
TraceEvent("MonitorLeaderChange")
.detail("NewLeader", leader.present() ? leader.get().first.changeID : UID(1, 1));
if (leader.present()) {
if (leader.get().first.forward) {
TraceEvent("MonitorLeaderForwarding")
.detail("NewConnStr", leader.get().first.serializedInfo.toString())
.detail("OldConnStr", info.intermediateConnRecord->getConnectionString().toString())
.trackLatest("MonitorLeaderForwarding");
info.intermediateConnRecord = connRecord->makeIntermediateRecord(
ClusterConnectionString(leader.get().first.serializedInfo.toString()));
return info;
}
if (connRecord != info.intermediateConnRecord) {
if (!info.hasConnected) {
TraceEvent(SevWarnAlways, "IncorrectClusterFileContentsAtConnection")
.detail("ClusterFile", connRecord->toString())
.detail("StoredConnectionString", connRecord->getConnectionString().toString())
.detail("CurrentConnectionString",
info.intermediateConnRecord->getConnectionString().toString());
}
connRecord->setAndPersistConnectionString(info.intermediateConnRecord->getConnectionString());
info.intermediateConnRecord = connRecord;
}
info.hasConnected = true;
connRecord->notifyConnected();
outSerializedLeaderInfo->set(leader.get().first.serializedInfo);
}
try {
wait(nomineeChange.onTrigger() || allActors);
} catch (Error& e) {
if (e.code() == error_code_coordinators_changed) {
TraceEvent("MonitorLeaderCoordinatorsChanged").suppressFor(1.0);
connRecord->getConnectionString().resetToUnresolved();
break;
} else {
throw e;
}
}
}
wait(nomineeChange.onTrigger() || allActors);
}
}
@ -774,8 +836,8 @@ ACTOR Future<Void> getClientInfoFromLeader(Reference<AsyncVar<Optional<ClusterCo
when(ClientDBInfo ni =
wait(brokenPromiseToNever(knownLeader->get().get().clientInterface.openDatabase.getReply(req)))) {
TraceEvent("GetClientInfoFromLeaderGotClientInfo", knownLeader->get().get().clientInterface.id())
.detail("CommitProxy0", ni.commitProxies.size() ? ni.commitProxies[0].id() : UID())
.detail("GrvProxy0", ni.grvProxies.size() ? ni.grvProxies[0].id() : UID())
.detail("CommitProxy0", ni.commitProxies.size() ? ni.commitProxies[0].address().toString() : "")
.detail("GrvProxy0", ni.grvProxies.size() ? ni.grvProxies[0].address().toString() : "")
.detail("ClientID", ni.id);
clientData->clientInfo->set(CachedSerialization<ClientDBInfo>(ni));
}
@ -787,7 +849,8 @@ ACTOR Future<Void> getClientInfoFromLeader(Reference<AsyncVar<Optional<ClusterCo
ACTOR Future<Void> monitorLeaderAndGetClientInfo(Key clusterKey,
std::vector<NetworkAddress> coordinators,
ClientData* clientData,
Reference<AsyncVar<Optional<LeaderInfo>>> leaderInfo) {
Reference<AsyncVar<Optional<LeaderInfo>>> leaderInfo,
Reference<AsyncVar<Void>> coordinatorsChanged) {
state std::vector<ClientLeaderRegInterface> clientLeaderServers;
state AsyncTrigger nomineeChange;
state std::vector<Optional<LeaderInfo>> nominees;
@ -835,7 +898,14 @@ ACTOR Future<Void> monitorLeaderAndGetClientInfo(Key clusterKey,
leaderInfo->set(leader.get().first);
}
}
wait(nomineeChange.onTrigger() || allActors);
try {
wait(nomineeChange.onTrigger() || allActors);
} catch (Error& e) {
if (e.code() == error_code_coordinators_changed) {
coordinatorsChanged->trigger();
}
throw e;
}
}
}
@ -964,9 +1034,15 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
successIndex = index;
} else {
TEST(rep.getError().code() == error_code_failed_to_progress); // Coordinator can't talk to cluster controller
if (rep.getError().code() == error_code_coordinators_changed) {
throw coordinators_changed();
}
index = (index + 1) % addrs.size();
if (index == successIndex) {
wait(delay(CLIENT_KNOBS->COORDINATOR_RECONNECTION_DELAY));
// When the client fails to talk to all coordinators, we throw coordinators_changed() and let the caller
// re-resolve the connection string and retry.
throw coordinators_changed();
}
}
}
@ -978,16 +1054,27 @@ ACTOR Future<Void> monitorProxies(
Reference<AsyncVar<Optional<ClientLeaderRegInterface>>> coordinator,
Reference<ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>> supportedVersions,
Key traceLogGroup) {
wait(connRecord->get()->resolveHostnames());
state MonitorLeaderInfo info(connRecord->get());
loop {
choose {
when(MonitorLeaderInfo _info = wait(monitorProxiesOneGeneration(
connRecord->get(), clientInfo, coordinator, info, supportedVersions, traceLogGroup))) {
info = _info;
try {
wait(info.intermediateConnRecord->resolveHostnames());
choose {
when(MonitorLeaderInfo _info = wait(monitorProxiesOneGeneration(
connRecord->get(), clientInfo, coordinator, info, supportedVersions, traceLogGroup))) {
info = _info;
}
when(wait(connRecord->onChange())) {
info.hasConnected = false;
info.intermediateConnRecord = connRecord->get();
}
}
when(wait(connRecord->onChange())) {
info.hasConnected = false;
info.intermediateConnRecord = connRecord->get();
} catch (Error& e) {
if (e.code() == error_code_coordinators_changed) {
TraceEvent("MonitorProxiesCoordinatorsChanged").suppressFor(1.0);
info.intermediateConnRecord->getConnectionString().resetToUnresolved();
} else {
throw e;
}
}
}
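
The change above makes monitorProxies re-resolve coordinator hostnames before each generation and treat coordinators_changed as a signal to reset the connection string and retry. A small stand-alone sketch of that retry shape, not from the commit, using plain C++ exceptions instead of flow errors and with stand-in types, might look like this:

#include <iostream>
#include <stdexcept>

struct CoordinatorsChanged : std::runtime_error {
    CoordinatorsChanged() : std::runtime_error("coordinators_changed") {}
};

// Stand-in for the connection record; the real code uses IClusterConnectionRecord and flow actors.
struct ConnRecord {
    int attempts = 0;
    void resolveHostnames() { /* DNS lookup of any coordinator hostnames */ }
    void resetToUnresolved() { /* drop cached addresses so the next pass re-resolves */ }
    void monitorOneGeneration() {
        if (++attempts == 1) // pretend the coordinators move once before the client catches up
            throw CoordinatorsChanged();
    }
};

int main() {
    ConnRecord rec;
    for (;;) {
        try {
            rec.resolveHostnames();     // re-resolve before each generation
            rec.monitorOneGeneration(); // throws when the coordinators change
            break;                      // a real client keeps monitoring forever
        } catch (const CoordinatorsChanged&) {
            std::cout << "coordinators changed; re-resolving\n";
            rec.resetToUnresolved();    // next iteration resolves the new hostnames
        }
    }
    return 0;
}
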

View File

@ -74,10 +74,11 @@ Future<Void> monitorLeader(Reference<IClusterConnectionRecord> const& connFile,
// This is one place where the leader election algorithm is run. The coordinator contacts all coordinators to collect
// nominees, the nominee with the most nominations is the leader, and it collects client data from the leader. This
// function also monitors changes of the leader.
Future<Void> monitorLeaderAndGetClientInfo(Value const& key,
Future<Void> monitorLeaderAndGetClientInfo(Key const& clusterKey,
std::vector<NetworkAddress> const& coordinators,
ClientData* const& clientData,
Reference<AsyncVar<Optional<LeaderInfo>>> const& leaderInfo);
Reference<AsyncVar<Optional<LeaderInfo>>> const& leaderInfo,
Reference<AsyncVar<Void>> const& coordinatorsChanged);
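
As the comment above describes, each coordinator reports a nominee and the nominee with the most nominations wins. Below is a simplified stand-alone sketch of that plurality count, not from the commit; UID and pickLeader are stand-ins, and the real getLeader() in MonitorLeader.actor.cpp additionally handles tie-breaking and tracks whether the winner holds a quorum.

#include <cstdint>
#include <map>
#include <optional>
#include <vector>

using UID = std::uint64_t; // stand-in for the real changeID type

// Return the nominee named by the largest number of coordinators, if any.
std::optional<UID> pickLeader(const std::vector<std::optional<UID>>& nominees) {
    std::map<UID, int> votes;
    for (const auto& n : nominees)
        if (n.has_value())
            ++votes[*n];
    std::optional<UID> best;
    int bestCount = 0;
    for (const auto& [id, count] : votes) {
        if (count > bestCount) {
            best = id;
            bestCount = count;
        }
    }
    return best;
}

int main() {
    std::vector<std::optional<UID>> nominees = { UID(7), UID(7), std::nullopt, UID(9) };
    return pickLeader(nominees) == UID(7) ? 0 : 1;
}
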
Future<Void> monitorProxies(
Reference<AsyncVar<Reference<IClusterConnectionRecord>>> const& connRecord,

View File

@ -1202,9 +1202,9 @@ MultiVersionDatabase::MultiVersionDatabase(MultiVersionApi* api,
// but we may not see trace logs from this client until a successful connection
// is established.
TraceEvent(SevWarnAlways, "FailedToInitializeExternalClient")
.error(e)
.detail("LibraryPath", client->libPath)
.detail("ClusterFilePath", clusterFilePath)
.error(e);
.detail("ClusterFilePath", clusterFilePath);
}
}
});
@ -1218,9 +1218,9 @@ MultiVersionDatabase::MultiVersionDatabase(MultiVersionApi* api,
} catch (Error& e) {
// This connection is discarded
TraceEvent(SevWarnAlways, "FailedToCreateLegacyDatabaseConnection")
.error(e)
.detail("LibraryPath", client->libPath)
.detail("ClusterFilePath", clusterFilePath)
.error(e);
.detail("ClusterFilePath", clusterFilePath);
}
}
});
@ -1360,8 +1360,8 @@ ThreadFuture<Void> MultiVersionDatabase::DatabaseState::monitorProtocolVersion()
}
TraceEvent("ErrorGettingClusterProtocolVersion")
.detail("ExpectedProtocolVersion", expected)
.error(cv.getError());
.error(cv.getError())
.detail("ExpectedProtocolVersion", expected);
}
ProtocolVersion clusterVersion =
@ -1409,10 +1409,10 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion
newDb = client->api->createDatabase(clusterFilePath.c_str());
} catch (Error& e) {
TraceEvent(SevWarnAlways, "MultiVersionClientFailedToCreateDatabase")
.error(e)
.detail("LibraryPath", client->libPath)
.detail("External", client->external)
.detail("ClusterFilePath", clusterFilePath)
.error(e);
.detail("ClusterFilePath", clusterFilePath);
// Put the client in a disconnected state until the version changes again
updateDatabase(Reference<IDatabase>(), Reference<ClientInfo>());
@ -1486,8 +1486,8 @@ void MultiVersionDatabase::DatabaseState::updateDatabase(Reference<IDatabase> ne
// We can't create a new database to monitor the cluster version. This means we will continue using the
// previous one, which should hopefully continue to work.
TraceEvent(SevWarnAlways, "FailedToCreateDatabaseForVersionMonitoring")
.detail("ClusterFilePath", clusterFilePath)
.error(e);
.error(e)
.detail("ClusterFilePath", clusterFilePath);
}
}
} else {
@ -1499,8 +1499,8 @@ void MultiVersionDatabase::DatabaseState::updateDatabase(Reference<IDatabase> ne
// We can't create a new database to monitor the cluster version. This means we will continue using the
// previous one, which should hopefully continue to work.
TraceEvent(SevWarnAlways, "FailedToCreateDatabaseForVersionMonitoring")
.detail("ClusterFilePath", clusterFilePath)
.error(e);
.error(e)
.detail("ClusterFilePath", clusterFilePath);
}
}

View File

@ -732,16 +732,18 @@ Future<Void> attemptGRVFromOldProxies(std::vector<GrvProxyInterface> oldProxies,
ACTOR static Future<Void> monitorClientDBInfoChange(DatabaseContext* cx,
Reference<AsyncVar<ClientDBInfo> const> clientDBInfo,
AsyncTrigger* proxyChangeTrigger) {
AsyncTrigger* proxiesChangeTrigger) {
state std::vector<CommitProxyInterface> curCommitProxies;
state std::vector<GrvProxyInterface> curGrvProxies;
state ActorCollection actors(false);
state Future<Void> clientDBInfoOnChange = clientDBInfo->onChange();
curCommitProxies = clientDBInfo->get().commitProxies;
curGrvProxies = clientDBInfo->get().grvProxies;
loop {
choose {
when(wait(clientDBInfo->onChange())) {
when(wait(clientDBInfoOnChange)) {
clientDBInfoOnChange = clientDBInfo->onChange();
if (clientDBInfo->get().commitProxies != curCommitProxies ||
clientDBInfo->get().grvProxies != curGrvProxies) {
// This condition is a bit complicated. Here we want to verify that we're unable to receive a read
@ -758,7 +760,7 @@ ACTOR static Future<Void> monitorClientDBInfoChange(DatabaseContext* cx,
}
curCommitProxies = clientDBInfo->get().commitProxies;
curGrvProxies = clientDBInfo->get().grvProxies;
proxyChangeTrigger->trigger();
proxiesChangeTrigger->trigger();
}
}
when(wait(actors.getResult())) { UNSTOPPABLE_ASSERT(false); }
@ -1596,6 +1598,32 @@ void DatabaseContext::invalidateCache(const KeyRangeRef& keys) {
locationCache.insert(KeyRangeRef(begin, end), Reference<LocationInfo>());
}
void DatabaseContext::setFailedEndpointOnHealthyServer(const Endpoint& endpoint) {
if (failedEndpointsOnHealthyServersInfo.find(endpoint) == failedEndpointsOnHealthyServersInfo.end()) {
failedEndpointsOnHealthyServersInfo[endpoint] =
EndpointFailureInfo{ .startTime = now(), .lastRefreshTime = now() };
}
}
void DatabaseContext::updateFailedEndpointRefreshTime(const Endpoint& endpoint) {
if (failedEndpointsOnHealthyServersInfo.find(endpoint) == failedEndpointsOnHealthyServersInfo.end()) {
// The endpoint is not failed. Nothing to update.
return;
}
failedEndpointsOnHealthyServersInfo[endpoint].lastRefreshTime = now();
}
Optional<EndpointFailureInfo> DatabaseContext::getEndpointFailureInfo(const Endpoint& endpoint) {
if (failedEndpointsOnHealthyServersInfo.find(endpoint) == failedEndpointsOnHealthyServersInfo.end()) {
return Optional<EndpointFailureInfo>();
}
return failedEndpointsOnHealthyServersInfo[endpoint];
}
void DatabaseContext::clearFailedEndpointOnHealthyServer(const Endpoint& endpoint) {
failedEndpointsOnHealthyServersInfo.erase(endpoint);
}
Future<Void> DatabaseContext::onProxiesChanged() const {
return this->proxiesChangeTrigger.onTrigger();
}
@ -2449,6 +2477,35 @@ ACTOR Future<std::pair<KeyRange, Reference<LocationInfo>>> getKeyLocation_intern
}
}
// Checks whether `endpoint` has failed on an otherwise healthy server. Returns true if we need to refresh the location
// cache for the endpoint.
bool checkOnlyEndpointFailed(const Database& cx, const Endpoint& endpoint) {
if (IFailureMonitor::failureMonitor().onlyEndpointFailed(endpoint)) {
// This endpoint is failed, but the server is still healthy. There are two cases in which this can happen:
// - There was a recent bounce in the cluster and the endpoints on the SSes got updated.
// - The SS failed and was terminated on a server, but the server itself is still running.
// To account for the first case, we invalidate the cache and issue GetKeyLocation requests to the proxy to
// update the cache with the new SS endpoints. However, if the failure is caused by the second case, the
// requested key location will continue to be the failed endpoint until the data movement is finished, and
// every read will generate a GetKeyLocation request to the proxies (and still get the failed endpoint
// back), which may overload the proxies and slow down data movement. Therefore, we only refresh the
// location cache for a short period of time; after the initial grace period during which we keep retrying
// the key location resolution, we slow down and refresh only once every
// `LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL`.
cx->setFailedEndpointOnHealthyServer(endpoint);
const auto& failureInfo = cx->getEndpointFailureInfo(endpoint);
ASSERT(failureInfo.present());
if (now() - failureInfo.get().startTime < CLIENT_KNOBS->LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD ||
now() - failureInfo.get().lastRefreshTime > CLIENT_KNOBS->LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL) {
cx->updateFailedEndpointRefreshTime(endpoint);
return true;
}
} else {
cx->clearFailedEndpointOnHealthyServer(endpoint);
}
return false;
}
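
The timing rule in checkOnlyEndpointFailed can be isolated as follows. This is a stand-alone illustration with hypothetical knob values, not the DatabaseContext implementation:

#include <cassert>

struct FailureInfo {
    double startTime;       // when the endpoint was first seen as failed
    double lastRefreshTime; // last time we refreshed the location cache for it
};

// Returns true if the location cache should be refreshed at time `now`.
bool shouldRefresh(const FailureInfo& info, double now, double gracePeriod, double retryInterval) {
    if (now - info.startTime < gracePeriod)
        return true;                                    // initial grace period: refresh every time
    return now - info.lastRefreshTime > retryInterval;  // afterwards: at most once per interval
}

int main() {
    FailureInfo info{ /*startTime=*/100.0, /*lastRefreshTime=*/100.0 };
    assert(shouldRefresh(info, 101.0, /*gracePeriod=*/5.0, /*retryInterval=*/60.0)); // in grace period
    assert(!shouldRefresh(info, 110.0, 5.0, 60.0)); // grace period over, refreshed recently
    assert(shouldRefresh(info, 200.0, 5.0, 60.0));  // retry interval elapsed
    return 0;
}
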
template <class F>
Future<std::pair<KeyRange, Reference<LocationInfo>>> getKeyLocation(Database const& cx,
Key const& key,
@ -2463,14 +2520,19 @@ Future<std::pair<KeyRange, Reference<LocationInfo>>> getKeyLocation(Database con
return getKeyLocation_internal(cx, key, spanID, debugID, useProvisionalProxies, isBackward);
}
bool onlyEndpointFailedAndNeedRefresh = false;
for (int i = 0; i < ssi.second->size(); i++) {
if (IFailureMonitor::failureMonitor().onlyEndpointFailed(ssi.second->get(i, member).getEndpoint())) {
cx->invalidateCache(key);
ssi.second.clear();
return getKeyLocation_internal(cx, key, spanID, debugID, useProvisionalProxies, isBackward);
if (checkOnlyEndpointFailed(cx, ssi.second->get(i, member).getEndpoint())) {
onlyEndpointFailedAndNeedRefresh = true;
}
}
if (onlyEndpointFailedAndNeedRefresh) {
cx->invalidateCache(key);
// Refresh the cache with a new getKeyLocations made to proxies.
return getKeyLocation_internal(cx, key, spanID, debugID, useProvisionalProxies, isBackward);
}
return ssi;
}
@ -2553,21 +2615,21 @@ Future<std::vector<std::pair<KeyRange, Reference<LocationInfo>>>> getKeyRangeLoc
bool foundFailed = false;
for (const auto& [range, locInfo] : locations) {
bool onlyEndpointFailed = false;
bool onlyEndpointFailedAndNeedRefresh = false;
for (int i = 0; i < locInfo->size(); i++) {
if (IFailureMonitor::failureMonitor().onlyEndpointFailed(locInfo->get(i, member).getEndpoint())) {
onlyEndpointFailed = true;
break;
if (checkOnlyEndpointFailed(cx, locInfo->get(i, member).getEndpoint())) {
onlyEndpointFailedAndNeedRefresh = true;
}
}
if (onlyEndpointFailed) {
if (onlyEndpointFailedAndNeedRefresh) {
cx->invalidateCache(range.begin);
foundFailed = true;
}
}
if (foundFailed) {
// Refresh the cache with a new getKeyRangeLocations made to proxies.
return getKeyRangeLocations_internal(cx, keys, limit, reverse, spanID, debugID, useProvisionalProxies);
}
@ -5095,7 +5157,7 @@ ACTOR static Future<Void> commitDummyTransaction(Reference<TransactionState> trS
return Void();
} catch (Error& e) {
TraceEvent("CommitDummyTransactionError")
.error(e, true)
.errorUnsuppressed(e)
.detail("Key", range.begin)
.detail("Retries", retries);
wait(tr.onError(e));
@ -5713,9 +5775,10 @@ ACTOR Future<GetReadVersionReply> getConsistentReadVersion(SpanID parentSpan,
loop {
try {
state GetReadVersionRequest req(span.context, transactionCount, priority, flags, tags, debugID);
state Future<Void> onProxiesChanged = cx->onProxiesChanged();
choose {
when(wait(cx->onProxiesChanged())) {}
when(wait(onProxiesChanged)) { onProxiesChanged = cx->onProxiesChanged(); }
when(GetReadVersionReply v =
wait(basicLoadBalance(cx->getGrvProxies(UseProvisionalProxies(
flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES)),
@ -6846,7 +6909,7 @@ ACTOR Future<Void> snapCreate(Database cx, Standalone<StringRef> snapCmd, UID sn
}
}
} catch (Error& e) {
TraceEvent("SnapCreateError").detail("SnapCmd", snapCmd.toString()).detail("UID", snapUID).error(e);
TraceEvent("SnapCreateError").error(e).detail("SnapCmd", snapCmd.toString()).detail("UID", snapUID);
throw;
}
}
@ -6874,13 +6937,14 @@ ACTOR Future<bool> checkSafeExclusions(Database cx, std::vector<AddressExclusion
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent("ExclusionSafetyCheckError")
.error(e)
.detail("NumExclusion", exclusions.size())
.detail("Exclusions", describe(exclusions))
.error(e);
.detail("Exclusions", describe(exclusions));
}
throw;
}
TraceEvent("ExclusionSafetyCheckCoordinators").log();
wait(cx->getConnectionRecord()->resolveHostnames());
state ClientCoordinators coordinatorList(cx->getConnectionRecord());
state std::vector<Future<Optional<LeaderInfo>>> leaderServers;
leaderServers.reserve(coordinatorList.clientLeaderServers.size());

View File

@ -2585,7 +2585,7 @@ void ReadYourWritesTransaction::debugLogRetries(Optional<Error> error) {
{
TraceEvent trace = TraceEvent("LongTransaction");
if (error.present())
trace.error(error.get(), true);
trace.errorUnsuppressed(error.get());
if (!transactionDebugInfo->transactionName.empty())
trace.detail("TransactionName", transactionDebugInfo->transactionName);
trace.detail("Elapsed", elapsed).detail("Retries", retries).detail("Committed", committed);

View File

@ -500,7 +500,7 @@ ACTOR Future<Optional<json_spirit::mObject>> tryReadJSONFile(std::string path) {
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)
TraceEvent(SevWarn, errorEventType).error(e).suppressFor(60).detail("File", path);
TraceEvent(SevWarn, errorEventType).errorUnsuppressed(e).suppressFor(60).detail("File", path);
}
return Optional<json_spirit::mObject>();
@ -744,7 +744,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest_impl(Reference<S3BlobStoreEndp
// Attach err to trace event if present, otherwise extract some stuff from the response
if (err.present()) {
event.error(err.get());
event.errorUnsuppressed(err.get());
}
event.suppressFor(60);
if (!err.present()) {
@ -954,7 +954,7 @@ ACTOR Future<Void> listObjectsStream_impl(Reference<S3BlobStoreEndpoint> bstore,
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)
TraceEvent(SevWarn, "S3BlobStoreEndpointListResultParseError")
.error(e)
.errorUnsuppressed(e)
.suppressFor(60)
.detail("Resource", fullResource);
throw http_bad_response();
@ -1080,7 +1080,7 @@ ACTOR Future<std::vector<std::string>> listBuckets_impl(Reference<S3BlobStoreEnd
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)
TraceEvent(SevWarn, "S3BlobStoreEndpointListBucketResultParseError")
.error(e)
.errorUnsuppressed(e)
.suppressFor(60)
.detail("Resource", fullResource);
throw http_bad_response();

View File

@ -103,6 +103,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( TLOG_POP_BATCH_SIZE, 1000 ); if ( randomize && BUGGIFY ) TLOG_POP_BATCH_SIZE = 10;
init( TLOG_POPPED_VER_LAG_THRESHOLD_FOR_TLOGPOP_TRACE, 250e6 );
init( ENABLE_DETAILED_TLOG_POP_TRACE, false ); if ( randomize && BUGGIFY ) ENABLE_DETAILED_TLOG_POP_TRACE = true;
init( PEEK_BATCHING_EMPTY_MSG, false ); if ( randomize && BUGGIFY ) PEEK_BATCHING_EMPTY_MSG = true;
init( PEEK_BATCHING_EMPTY_MSG_INTERVAL, 0.001 ); if ( randomize && BUGGIFY ) PEEK_BATCHING_EMPTY_MSG_INTERVAL = 0.01;
// disk snapshot max timeout, to be put in TLog, storage and coordinator nodes
init( MAX_FORKED_PROCESS_OUTPUT, 1024 );
@ -362,7 +364,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC, 0 );
// If true, enables dynamic adjustment of ROCKSDB_WRITE_RATE_LIMITER_BYTES according to the recent demand of background IO.
init( ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE, true );
init( ROCKSDB_PERFCONTEXT_ENABLE, false ); if( randomize && BUGGIFY ) ROCKSDB_PERFCONTEXT_ENABLE = deterministicRandom()->coinflip() ? false : true;
init( ROCKSDB_PERFCONTEXT_SAMPLE_RATE, 0.0001 );
// Leader election
bool longLeaderElection = randomize && BUGGIFY;
@ -579,6 +582,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( MIN_AVAILABLE_SPACE, 1e8 );
init( MIN_AVAILABLE_SPACE_RATIO, 0.05 );
init( MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER, 0.01 );
init( TARGET_AVAILABLE_SPACE_RATIO, 0.30 );
init( AVAILABLE_SPACE_UPDATE_DELAY, 5.0 );

View File

@ -106,6 +106,8 @@ public:
double PUSH_STATS_SLOW_AMOUNT;
double PUSH_STATS_SLOW_RATIO;
int TLOG_POP_BATCH_SIZE;
bool PEEK_BATCHING_EMPTY_MSG;
double PEEK_BATCHING_EMPTY_MSG_INTERVAL;
// Data distribution queue
double HEALTH_POLL_TIME;
@ -293,6 +295,8 @@ public:
bool ROCKSDB_READ_RANGE_REUSE_ITERATORS;
int64_t ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC;
bool ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE;
bool ROCKSDB_PERFCONTEXT_ENABLE; // Enable RocksDB perf context metrics. May cause performance overhead
double ROCKSDB_PERFCONTEXT_SAMPLE_RATE;
// Leader election
int MAX_NOTIFICATIONS;
@ -525,6 +529,7 @@ public:
int64_t MIN_AVAILABLE_SPACE;
double MIN_AVAILABLE_SPACE_RATIO;
double MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER;
double TARGET_AVAILABLE_SPACE_RATIO;
double AVAILABLE_SPACE_UPDATE_DELAY;

View File

@ -1628,8 +1628,9 @@ Future<RangeResult> CoordinatorsImpl::getRange(ReadYourWritesTransaction* ryw, K
ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
state Reference<IQuorumChange> change;
state std::vector<NetworkAddress> addressesVec;
state std::vector<std::string> process_address_strs;
state ClusterConnectionString
conn; // We don't care about the Key here; it will be overridden in changeQuorumChecker().
state std::vector<std::string> process_address_or_hostname_strs;
state Optional<std::string> msg;
state int index;
state bool parse_error = false;
@ -1640,38 +1641,45 @@ ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWrite
if (processes_entry.first) {
ASSERT(processes_entry.second.present()); // no clear should be seen here
auto processesStr = processes_entry.second.get().toString();
boost::split(process_address_strs, processesStr, [](char c) { return c == ','; });
if (!process_address_strs.size()) {
boost::split(process_address_or_hostname_strs, processesStr, [](char c) { return c == ','; });
if (!process_address_or_hostname_strs.size()) {
return ManagementAPIError::toJsonString(
false,
"coordinators",
"New coordinators\' processes are empty, please specify new processes\' network addresses with format "
"\"IP:PORT,IP:PORT,...,IP:PORT\"");
"\"IP:PORT,IP:PORT,...,IP:PORT\" or \"HOSTNAME:PORT,HOSTNAME:PORT,...,HOSTNAME:PORT\"");
}
for (index = 0; index < process_address_strs.size(); index++) {
for (index = 0; index < process_address_or_hostname_strs.size(); index++) {
try {
auto a = NetworkAddress::parse(process_address_strs[index]);
if (!a.isValid())
parse_error = true;
else
addressesVec.push_back(a);
if (Hostname::isHostname(process_address_or_hostname_strs[index])) {
conn.hostnames.push_back(Hostname::parse(process_address_or_hostname_strs[index]));
conn.status = ClusterConnectionString::ConnectionStringStatus::UNRESOLVED;
} else {
NetworkAddress a = NetworkAddress::parse(process_address_or_hostname_strs[index]);
if (!a.isValid()) {
parse_error = true;
} else {
conn.coords.push_back(a);
}
}
} catch (Error& e) {
TraceEvent(SevDebug, "SpecialKeysNetworkParseError").error(e);
parse_error = true;
}
if (parse_error) {
std::string error =
"ERROR: \'" + process_address_strs[index] + "\' is not a valid network endpoint address\n";
if (process_address_strs[index].find(":tls") != std::string::npos)
std::string error = "ERROR: \'" + process_address_or_hostname_strs[index] +
"\' is not a valid network endpoint address\n";
if (process_address_or_hostname_strs[index].find(":tls") != std::string::npos)
error += " Do not include the `:tls' suffix when naming a process\n";
return ManagementAPIError::toJsonString(false, "coordinators", error);
}
}
}
if (addressesVec.size())
change = specifiedQuorumChange(addressesVec);
wait(conn.resolveHostnames());
if (conn.coordinators().size())
change = specifiedQuorumChange(conn.coordinators());
else
change = noQuorumChange();
@ -1693,10 +1701,11 @@ ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWrite
ASSERT(change.isValid());
TraceEvent(SevDebug, "SKSChangeCoordinatorsStart")
.detail("NewAddresses", describe(addressesVec))
.detail("NewHostnames", conn.hostnames.size() ? describe(conn.hostnames) : "N/A")
.detail("NewAddresses", describe(conn.coordinators()))
.detail("Description", entry.first ? entry.second.get().toString() : "");
Optional<CoordinatorsResult> r = wait(changeQuorumChecker(&ryw->getTransaction(), change, &addressesVec));
Optional<CoordinatorsResult> r = wait(changeQuorumChecker(&ryw->getTransaction(), change, &conn));
TraceEvent(SevDebug, "SKSChangeCoordinatorsFinish")
.detail("Result", r.present() ? static_cast<int>(r.get()) : -1); // -1 means success

View File

@ -306,6 +306,7 @@ ACTOR Future<Optional<StatusObject>> clientCoordinatorsStatusFetcher(Reference<I
bool* quorum_reachable,
int* coordinatorsFaultTolerance) {
try {
wait(connRecord->resolveHostnames());
state ClientCoordinators coord(connRecord);
state StatusObject statusObj;

View File

@ -35,3 +35,13 @@ add_custom_target(start_sandbox
--lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)
add_dependencies(start_sandbox fdbmonitor fdbserver)
if(NOT EXISTS ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh)
configure_file(${CMAKE_SOURCE_DIR}/contrib/generate_profile.sh
${CMAKE_BINARY_DIR}/contrib/generate_profile.sh)
endif()
add_custom_target(generate_profile
COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})
add_dependencies(generate_profile fdbmonitor fdbserver mako fdbcli)

View File

@ -260,7 +260,7 @@ public:
std::string currentFilename =
(wrappedFile.isReady() && !wrappedFile.isError()) ? wrappedFile.get()->getFilename() : actualFilename;
currentProcess->machine->openFiles.erase(currentFilename);
//TraceEvent("AsyncFileNonDurableOpenError").error(e, true).detail("Filename", filename).detail("Address", currentProcess->address).detail("Addr", g_simulator.getCurrentProcess()->address);
//TraceEvent("AsyncFileNonDurableOpenError").errorUnsuppressed(e).detail("Filename", filename).detail("Address", currentProcess->address).detail("Addr", g_simulator.getCurrentProcess()->address);
wait(g_simulator.onProcess(currentProcess, currentTaskID));
throw err;
}

View File

@ -732,13 +732,13 @@ ACTOR Future<Void> connectionKeeper(Reference<Peer> self,
if (self->compatible) {
TraceEvent(ok ? SevInfo : SevWarnAlways, "ConnectionClosed", conn ? conn->getDebugID() : UID())
.error(e, true)
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("PeerAddr", self->destination);
} else {
TraceEvent(
ok ? SevInfo : SevWarnAlways, "IncompatibleConnectionClosed", conn ? conn->getDebugID() : UID())
.error(e, true)
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("PeerAddr", self->destination);
}
@ -783,7 +783,7 @@ ACTOR Future<Void> connectionKeeper(Reference<Peer> self,
if (self->peerReferences <= 0 && self->reliable.empty() && self->unsent.empty() &&
self->outstandingReplies == 0) {
TraceEvent("PeerDestroy").error(e).suppressFor(1.0).detail("PeerAddr", self->destination);
TraceEvent("PeerDestroy").errorUnsuppressed(e).suppressFor(1.0).detail("PeerAddr", self->destination);
self->connect.cancel();
self->transport->peers.erase(self->destination);
self->transport->orderedAddresses.erase(self->destination);
@ -1330,10 +1330,12 @@ ACTOR static Future<Void> connectionIncoming(TransportData* self, Reference<ICon
}
return Void();
} catch (Error& e) {
TraceEvent("IncomingConnectionError", conn->getDebugID())
.error(e)
.suppressFor(1.0)
.detail("FromAddress", conn->getPeerAddress());
if (e.code() != error_code_actor_cancelled) {
TraceEvent("IncomingConnectionError", conn->getDebugID())
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("FromAddress", conn->getPeerAddress());
}
conn->close();
return Void();
}

View File

@ -29,12 +29,12 @@ void HealthMonitor::reportPeerClosed(const NetworkAddress& peerAddress) {
}
void HealthMonitor::purgeOutdatedHistory() {
for (auto it = peerClosedHistory.begin(); it != peerClosedHistory.end();) {
if (it->first < now() - FLOW_KNOBS->HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS) {
auto& count = peerClosedNum[it->second];
while (!peerClosedHistory.empty()) {
auto const& p = peerClosedHistory.front();
if (p.first < now() - FLOW_KNOBS->HEALTH_MONITOR_CLIENT_REQUEST_INTERVAL_SECS) {
auto& count = peerClosedNum[p.second];
--count;
ASSERT(count >= 0);
++it; // Increment before pop_front to avoid iterator invalidation
peerClosedHistory.pop_front();
} else {
break;

View File

@ -1123,11 +1123,9 @@ public:
}
}
ACTOR static Future<Void> runLoop(Sim2* self) {
state ISimulator::ProcessInfo* callingMachine = self->currentProcess;
static void runLoop(Sim2* self) {
ISimulator::ProcessInfo* callingMachine = self->currentProcess;
while (!self->isStopped) {
wait(self->net2->yield(TaskPriority::DefaultYield));
self->mutex.enter();
if (self->tasks.size() == 0) {
self->mutex.leave();
@ -1144,18 +1142,13 @@ public:
self->yielded = false;
}
self->currentProcess = callingMachine;
self->net2->stop();
for (auto& fn : self->stopCallbacks) {
fn();
}
return Void();
}
// Implement ISimulator interface
void run() override {
Future<Void> loopFuture = runLoop(this);
net2->run();
}
void run() override { runLoop(this); }
ProcessInfo* newProcess(const char* name,
IPAddress ip,
uint16_t port,
@ -2094,7 +2087,7 @@ public:
t.action.send(Void());
ASSERT(this->currentProcess == t.machine);
} catch (Error& e) {
TraceEvent(SevError, "UnhandledSimulationEventError").error(e, true);
TraceEvent(SevError, "UnhandledSimulationEventError").errorUnsuppressed(e);
killProcess(t.machine, KillInstantly);
}

View File

@ -1101,10 +1101,10 @@ ACTOR Future<Void> backupWorker(BackupInterface interf,
try {
wait(done);
} catch (Error& e) {
TraceEvent("BackupWorkerShutdownError", self.myId).error(e, true);
TraceEvent("BackupWorkerShutdownError", self.myId).errorUnsuppressed(e);
}
}
TraceEvent("BackupWorkerTerminated", self.myId).error(err, true);
TraceEvent("BackupWorkerTerminated", self.myId).errorUnsuppressed(err);
if (err.code() != error_code_actor_cancelled && err.code() != error_code_worker_removed) {
throw err;
}

View File

@ -843,8 +843,8 @@ ACTOR Future<Void> monitorBlobWorkerStatus(BlobManagerData* bmData, BlobWorkerIn
}
// TODO change back from SevError?
TraceEvent(SevError, "BWStatusMonitoringFailed", bmData->id)
.detail("BlobWorkerID", bwInterf.id())
.error(e);
.error(e)
.detail("BlobWorkerID", bwInterf.id());
throw e;
}
}
@ -877,7 +877,7 @@ ACTOR Future<Void> monitorBlobWorker(BlobManagerData* bmData, BlobWorkerInterfac
printf("BM got unexpected error %s monitoring BW %s\n", e.name(), bwInterf.id().toString().c_str());
}
// TODO change back from SevError?
TraceEvent(SevError, "BWMonitoringFailed", bmData->id).detail("BlobWorkerID", bwInterf.id()).error(e);
TraceEvent(SevError, "BWMonitoringFailed", bmData->id).error(e).detail("BlobWorkerID", bwInterf.id());
throw e;
}
@ -1152,7 +1152,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
}
}
} catch (Error& err) {
TraceEvent("BlobManagerDied", bmInterf.id()).error(err, true);
TraceEvent("BlobManagerDied", bmInterf.id()).errorUnsuppressed(err);
}
return Void();
}

View File

@ -1589,7 +1589,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
metadata->keyRange.end.printable().c_str(),
e.name());
}
TraceEvent(SevWarn, "GranuleFileUpdaterError", bwData->id).detail("Granule", metadata->keyRange).error(e);
TraceEvent(SevWarn, "GranuleFileUpdaterError", bwData->id).error(e).detail("Granule", metadata->keyRange);
if (granuleCanRetry(e)) {
// explicitly cancel all outstanding write futures BEFORE updating promise stream, to ensure they
@ -2621,7 +2621,7 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
if (BW_DEBUG) {
printf("Blob worker got error %s. Exiting...\n", e.name());
}
TraceEvent("BlobWorkerDied", self->id).error(e, true);
TraceEvent("BlobWorkerDied", self->id).errorUnsuppressed(e);
}
wait(self->granuleMetadata.clearAsync());

View File

@ -90,6 +90,7 @@ set(FDBSERVER_SRCS
QuietDatabase.actor.cpp
QuietDatabase.h
RadixTree.h
Ratekeeper.h
Ratekeeper.actor.cpp
RatekeeperInterface.h
RecoveryState.h
@ -130,6 +131,8 @@ set(FDBSERVER_SRCS
storageserver.actor.cpp
TagPartitionedLogSystem.actor.cpp
TagPartitionedLogSystem.actor.h
TagThrottler.actor.cpp
TagThrottler.h
template_fdb.h
TCInfo.actor.cpp
TCInfo.h

View File

@ -296,7 +296,7 @@ ACTOR Future<Void> clusterWatchDatabase(ClusterControllerData* cluster,
TraceEvent(SevWarn, "DetectedFailedRecovery", cluster->id).detail("OldMaster", iMaster.id());
} catch (Error& e) {
state Error err = e;
TraceEvent("CCWDB", cluster->id).error(e, true).detail("Master", iMaster.id());
TraceEvent("CCWDB", cluster->id).errorUnsuppressed(e).detail("Master", iMaster.id());
if (e.code() != error_code_actor_cancelled)
wait(delay(0.0));
@ -313,7 +313,7 @@ ACTOR Future<Void> clusterWatchDatabase(ClusterControllerData* cluster,
TEST(err.code() == error_code_restart_cluster_controller); // Terminated due to cluster-controller restart.
if (cluster->shouldCommitSuicide || err.code() == error_code_coordinators_changed) {
TraceEvent("ClusterControllerTerminate", cluster->id).error(err, true);
TraceEvent("ClusterControllerTerminate", cluster->id).errorUnsuppressed(err);
throw restart_cluster_controller();
}
@ -427,10 +427,10 @@ void checkOutstandingStorageRequests(ClusterControllerData* self) {
} catch (Error& e) {
if (e.code() == error_code_no_more_servers) {
TraceEvent(SevWarn, "RecruitStorageNotAvailable", self->id)
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("OutstandingReq", i)
.detail("IsCriticalRecruitment", req.first.criticalRecruitment)
.error(e);
.detail("IsCriticalRecruitment", req.first.criticalRecruitment);
} else {
TraceEvent(SevError, "RecruitStorageError", self->id).error(e);
throw;
@ -464,9 +464,9 @@ void checkOutstandingBlobWorkerRequests(ClusterControllerData* self) {
} catch (Error& e) {
if (e.code() == error_code_no_more_servers) {
TraceEvent(SevWarn, "RecruitBlobWorkerNotAvailable", self->id)
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("OutstandingReq", i)
.error(e);
.detail("OutstandingReq", i);
} else {
TraceEvent(SevError, "RecruitBlobWorkerError", self->id).error(e);
throw;
@ -876,8 +876,8 @@ void clusterRecruitStorage(ClusterControllerData* self, RecruitStorageRequest re
if (e.code() == error_code_no_more_servers) {
self->outstandingStorageRequests.emplace_back(req, now() + SERVER_KNOBS->RECRUITMENT_TIMEOUT);
TraceEvent(SevWarn, "RecruitStorageNotAvailable", self->id)
.detail("IsCriticalRecruitment", req.criticalRecruitment)
.error(e);
.error(e)
.detail("IsCriticalRecruitment", req.criticalRecruitment);
} else {
TraceEvent(SevError, "RecruitStorageError", self->id).error(e);
throw; // Any other error will bring down the cluster controller
@ -2599,6 +2599,7 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
state bool hasConnected = false;
loop {
try {
wait(connRecord->resolveHostnames());
ServerCoordinators coordinators(connRecord);
wait(clusterController(coordinators, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType));
} catch (Error& e) {

View File

@ -1896,8 +1896,8 @@ public:
throw;
}
TraceEvent(SevWarn, "AttemptingRecruitmentInRemoteDc", id)
.detail("SetPrimaryDesired", setPrimaryDesired)
.error(e);
.error(e)
.detail("SetPrimaryDesired", setPrimaryDesired);
auto reply = findWorkersForConfigurationFromDC(req, regions[1].dcId, checkGoodRecruitment);
if (!setPrimaryDesired) {
std::vector<Optional<Key>> dcPriority;

View File

@ -673,7 +673,9 @@ ACTOR Future<Void> changeCoordinators(Reference<ClusterRecoveryData> self) {
}
try {
wait(self->cstate.move(ClusterConnectionString(changeCoordinatorsRequest.newConnectionString.toString())));
state ClusterConnectionString conn(changeCoordinatorsRequest.newConnectionString.toString());
wait(conn.resolveHostnames());
wait(self->cstate.move(conn));
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)
changeCoordinatorsRequest.reply.sendError(e);

View File

@ -1769,17 +1769,17 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
wait(throwErrorOr(ddSnapReq));
} catch (Error& e) {
TraceEvent("SnapCommitProxy_DDSnapResponseError")
.errorUnsuppressed(e)
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID)
.error(e, true /*includeCancelled*/);
.detail("SnapUID", snapReq.snapUID);
throw e;
}
snapReq.reply.send(Void());
} catch (Error& e) {
TraceEvent("SnapCommitProxy_SnapReqError")
.errorUnsuppressed(e)
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID)
.error(e, true /*includeCancelled*/);
.detail("SnapUID", snapReq.snapUID);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
} else {
@ -2188,7 +2188,7 @@ ACTOR Future<Void> commitProxyServer(CommitProxyInterface proxy,
whitelistBinPaths);
wait(core || checkRemoved(db, req.recoveryCount, proxy));
} catch (Error& e) {
TraceEvent("CommitProxyTerminated", proxy.id()).error(e, true);
TraceEvent("CommitProxyTerminated", proxy.id()).errorUnsuppressed(e);
if (e.code() != error_code_worker_removed && e.code() != error_code_tlog_stopped &&
e.code() != error_code_tlog_failed && e.code() != error_code_coordinators_changed &&

View File

@ -61,7 +61,7 @@ class WriteToTransactionEnvironment {
Version lastWrittenVersion{ 0 };
static Value longToValue(int64_t v) {
auto s = format("%ld", v);
auto s = format("%lld", v);
return StringRef(reinterpret_cast<uint8_t const*>(s.c_str()), s.size());
}

View File

@ -96,6 +96,7 @@ LeaderElectionRegInterface::LeaderElectionRegInterface(INetwork* local) : Client
}
ServerCoordinators::ServerCoordinators(Reference<IClusterConnectionRecord> ccr) : ClientCoordinators(ccr) {
ASSERT(ccr->connectionStringStatus() == ClusterConnectionString::RESOLVED);
ClusterConnectionString cs = ccr->getConnectionString();
for (auto s = cs.coordinators().begin(); s != cs.coordinators().end(); ++s) {
leaderElectionServers.emplace_back(*s);
@ -205,8 +206,11 @@ ACTOR Future<Void> openDatabase(ClientData* db,
int* clientCount,
Reference<AsyncVar<bool>> hasConnectedClients,
OpenDatabaseCoordRequest req,
Future<Void> checkStuck) {
Future<Void> checkStuck,
Reference<AsyncVar<Void>> coordinatorsChanged) {
state ErrorOr<CachedSerialization<ClientDBInfo>> replyContents;
state Future<Void> coordinatorsChangedOnChange = coordinatorsChanged->onChange();
state Future<Void> clientInfoOnChange = db->clientInfo->onChange();
++(*clientCount);
hasConnectedClients->set(true);
@ -223,7 +227,15 @@ ACTOR Future<Void> openDatabase(ClientData* db,
replyContents = failed_to_progress();
break;
}
when(wait(yieldedFuture(db->clientInfo->onChange()))) { replyContents = db->clientInfo->get(); }
when(wait(yieldedFuture(clientInfoOnChange))) {
clientInfoOnChange = db->clientInfo->onChange();
replyContents = db->clientInfo->get();
}
when(wait(coordinatorsChangedOnChange)) {
coordinatorsChangedOnChange = coordinatorsChanged->onChange();
replyContents = coordinators_changed();
break;
}
when(wait(delayJittered(SERVER_KNOBS->CLIENT_REGISTER_INTERVAL))) {
if (db->clientInfo->get().read().id.isValid()) {
replyContents = db->clientInfo->get();
@ -254,18 +266,33 @@ ACTOR Future<Void> openDatabase(ClientData* db,
ACTOR Future<Void> remoteMonitorLeader(int* clientCount,
Reference<AsyncVar<bool>> hasConnectedClients,
Reference<AsyncVar<Optional<LeaderInfo>>> currentElectedLeader,
ElectionResultRequest req) {
ElectionResultRequest req,
Reference<AsyncVar<Void>> coordinatorsChanged) {
state bool coordinatorsChangeDetected = false;
state Future<Void> coordinatorsChangedOnChange = coordinatorsChanged->onChange();
state Future<Void> currentElectedLeaderOnChange = currentElectedLeader->onChange();
++(*clientCount);
hasConnectedClients->set(true);
while (!currentElectedLeader->get().present() || req.knownLeader == currentElectedLeader->get().get().changeID) {
choose {
when(wait(yieldedFuture(currentElectedLeader->onChange()))) {}
when(wait(yieldedFuture(currentElectedLeaderOnChange))) {
currentElectedLeaderOnChange = currentElectedLeader->onChange();
}
when(wait(coordinatorsChangedOnChange)) {
coordinatorsChangedOnChange = coordinatorsChanged->onChange();
coordinatorsChangeDetected = true;
break;
}
when(wait(delayJittered(SERVER_KNOBS->CLIENT_REGISTER_INTERVAL))) { break; }
}
}
req.reply.send(currentElectedLeader->get());
if (coordinatorsChangeDetected) {
req.reply.sendError(coordinators_changed());
} else {
req.reply.send(currentElectedLeader->get());
}
if (--(*clientCount) == 0) {
hasConnectedClients->set(false);
@ -296,6 +323,9 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
state Reference<AsyncVar<Optional<LeaderInfo>>> currentElectedLeader =
makeReference<AsyncVar<Optional<LeaderInfo>>>();
state LivenessChecker canConnectToLeader(SERVER_KNOBS->COORDINATOR_LEADER_CONNECTION_TIMEOUT);
state Reference<AsyncVar<Void>> coordinatorsChanged = makeReference<AsyncVar<Void>>();
state Future<Void> coordinatorsChangedOnChange = coordinatorsChanged->onChange();
state Future<Void> hasConnectedClientsOnChange = hasConnectedClients->onChange();
loop choose {
when(OpenDatabaseCoordRequest req = waitNext(interf.openDatabase.getFuture())) {
@ -306,10 +336,14 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
} else {
if (!leaderMon.isValid()) {
leaderMon = monitorLeaderAndGetClientInfo(
req.clusterKey, req.coordinators, &clientData, currentElectedLeader);
req.clusterKey, req.coordinators, &clientData, currentElectedLeader, coordinatorsChanged);
}
actors.add(
openDatabase(&clientData, &clientCount, hasConnectedClients, req, canConnectToLeader.checkStuck()));
actors.add(openDatabase(&clientData,
&clientCount,
hasConnectedClients,
req,
canConnectToLeader.checkStuck(),
coordinatorsChanged));
}
}
when(ElectionResultRequest req = waitNext(interf.electionResult.getFuture())) {
@ -318,10 +352,11 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
req.reply.send(currentElectedLeader->get());
} else {
if (!leaderMon.isValid()) {
leaderMon =
monitorLeaderAndGetClientInfo(req.key, req.coordinators, &clientData, currentElectedLeader);
leaderMon = monitorLeaderAndGetClientInfo(
req.key, req.coordinators, &clientData, currentElectedLeader, coordinatorsChanged);
}
actors.add(remoteMonitorLeader(&clientCount, hasConnectedClients, currentElectedLeader, req));
actors.add(remoteMonitorLeader(
&clientCount, hasConnectedClients, currentElectedLeader, req, coordinatorsChanged));
}
}
when(GetLeaderRequest req = waitNext(interf.getLeader.getFuture())) {
@ -454,13 +489,18 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
notify.pop_front();
}
}
when(wait(hasConnectedClients->onChange())) {
when(wait(hasConnectedClientsOnChange)) {
hasConnectedClientsOnChange = hasConnectedClients->onChange();
if (!hasConnectedClients->get() && !nextInterval.isValid()) {
TraceEvent("LeaderRegisterUnneeded").detail("Key", key);
return Void();
}
}
when(wait(actors.getResult())) {}
when(wait(coordinatorsChangedOnChange)) {
leaderMon = Future<Void>();
coordinatorsChangedOnChange = coordinatorsChanged->onChange();
}
}
}
@ -756,7 +796,7 @@ ACTOR Future<Void> coordinationServer(std::string dataFolder,
store.getError() || configDatabaseServer);
throw internal_error();
} catch (Error& e) {
TraceEvent("CoordinationServerError", myID).error(e, true);
TraceEvent("CoordinationServerError", myID).errorUnsuppressed(e);
throw;
}
}
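The hunks above introduce a shared AsyncVar<Void> signal (coordinatorsChanged) and consistently cache each onChange() future in a state variable, re-arming it only after it fires; presumably this is so a change that arrives while another when() branch is being serviced is not dropped, and so clients waiting in openDatabase/remoteMonitorLeader can be answered with coordinators_changed() instead of hanging on stale coordinators. A minimal flow-style sketch of the re-arm pattern (illustrative names only; assumes flow's ACTOR compiler, AsyncVar, choose/when, and delay are available):

ACTOR Future<Void> watchSignal(Reference<AsyncVar<Void>> signal) {
	// Take the onChange() future once, keep it in a state variable across wait()s,
	// and re-create it only after it fires.
	state Future<Void> signalOnChange = signal->onChange();
	loop {
		choose {
			when(wait(signalOnChange)) {
				signalOnChange = signal->onChange(); // re-arm for the next change
				// ... react to the change (e.g. reply with coordinators_changed()) ...
			}
			when(wait(delay(1.0))) {
				// periodic work; the cached future is not recreated here, so a change
				// that fires while this branch runs is still observed next iteration
			}
		}
	}
}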

View File

@ -178,7 +178,7 @@ class WorkPool final : public IThreadPool, public ReferenceCounted<WorkPool<Thre
stopped.send(Void());
return;
} catch (Error& e) {
TraceEvent("WorkPoolError").error(e, true);
TraceEvent("WorkPoolError").errorUnsuppressed(e);
error.sendError(e);
} catch (...) {
TraceEvent("WorkPoolError").log();
@ -256,10 +256,10 @@ public:
pool->queueLock.enter();
TraceEvent("WorkPool_Stop")
.errorUnsuppressed(e)
.detail("Workers", pool->workers.size())
.detail("Idle", pool->idle.size())
.detail("Work", pool->work.size())
.error(e, true);
.detail("Work", pool->work.size());
for (uint32_t i = 0; i < pool->work.size(); i++)
pool->work[i]->cancel(); // What if cancel() does something to this?

View File

@ -154,7 +154,7 @@ class WorkPool final : public IThreadPool, public ReferenceCounted<WorkPool<Thre
stopped.send(Void());
return;
} catch (Error& e) {
TraceEvent("WorkPoolError").error(e, true);
TraceEvent("WorkPoolError").errorUnsuppressed(e);
error.sendError(e);
} catch (...) {
TraceEvent("WorkPoolError").log();
@ -232,10 +232,10 @@ public:
pool->queueLock.enter();
TraceEvent("WorkPool_Stop")
.errorUnsuppressed(e)
.detail("Workers", pool->workers.size())
.detail("Idle", pool->idle.size())
.detail("Work", pool->work.size())
.error(e, true);
.detail("Work", pool->work.size());
for (uint32_t i = 0; i < pool->work.size(); i++)
pool->work[i]->cancel(); // What if cancel() does something to this?
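Throughout this diff, TraceEvent(...).error(e, true) — the old includeCancelled form — is replaced with .errorUnsuppressed(e), which appears to be the clearer-named equivalent: the error is attached even when it would normally be suppressed (for example actor_cancelled). The call sites are also consistently reordered so the error is attached before any detail fields. A small illustration of the target shape (event name and detail fields are hypothetical):

TraceEvent("WorkerTaskFailed", workerId)     // hypothetical event name and UID
    .errorUnsuppressed(e)                    // was: .error(e, true /*includeCancelled*/)
    .detail("Task", taskName)                // detail fields follow the error
    .detail("Attempt", attempt);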

File diff suppressed because it is too large

View File

@ -171,6 +171,7 @@ typedef AsyncMap<UID, ServerStatus> ServerStatusMap;
class DDTeamCollection : public ReferenceCounted<DDTeamCollection> {
friend class DDTeamCollectionImpl;
friend class DDTeamCollectionUnitTest;
enum class Status { NONE = 0, WIGGLING = 1, EXCLUDED = 2, FAILED = 3 };
@ -521,6 +522,37 @@ class DDTeamCollection : public ReferenceCounted<DDTeamCollection> {
void noHealthyTeams() const;
// To enable verbose debug info, set shouldPrint to true
void traceAllInfo(bool shouldPrint = false) const;
// Check if the server belongs to a machine; if not, create the machine.
// Establish the two-way link between server and machine
Reference<TCMachineInfo> checkAndCreateMachine(Reference<TCServerInfo> server);
// Group storage servers (processes) based on their machineId in LocalityData
// All created machines are healthy
// Return the number of healthy servers we grouped into machines
int constructMachinesFromServers();
// Create machineTeamsToBuild number of machine teams
// No operation if machineTeamsToBuild is 0
// Note: The creation of machine teams should not depend on server teams:
// No matter how server teams are created, we will create the same set of machine teams;
// We should never use the server team number when building machine teams.
//
// The five steps to create each machine team are documented in the function.
// Reuse ReplicationPolicy's selectReplicas func to select the machine team.
// Return the number of added machine teams
int addBestMachineTeams(int machineTeamsToBuild);
// Sanity check the property of teams in unit test
// Return true if all server teams belong to machine teams
bool sanityCheckTeams() const;
void disableBuildingTeams() { doBuildTeams = false; }
void setCheckTeamDelay() { this->checkTeamDelay = Void(); }
public:
Database cx;
@ -595,39 +627,6 @@ public:
void addTeam(std::set<UID> const& team, bool isInitialTeam) { addTeam(team.begin(), team.end(), isInitialTeam); }
// FIXME: Public for testing only
void disableBuildingTeams() { doBuildTeams = false; }
// FIXME: Public for testing only
void setCheckTeamDelay() { this->checkTeamDelay = Void(); }
// FIXME: Public for testing only
// Group storage servers (process) based on their machineId in LocalityData
// All created machines are healthy
// Return The number of healthy servers we grouped into machines
int constructMachinesFromServers();
// FIXME: Public for testing only
// To enable verbose debug info, set shouldPrint to true
void traceAllInfo(bool shouldPrint = false) const;
// FIXME: Public for testing only
// Create machineTeamsToBuild number of machine teams
// No operation if machineTeamsToBuild is 0
// Note: The creation of machine teams should not depend on server teams:
// No matter how server teams will be created, we will create the same set of machine teams;
// We should never use server team number in building machine teams.
//
// Five steps to create each machine team, which are document in the function
// Reuse ReplicationPolicy selectReplicas func to select machine team
// return number of added machine teams
int addBestMachineTeams(int machineTeamsToBuild);
// FIXME: Public for testing only
// Sanity check the property of teams in unit test
// Return true if all server teams belong to machine teams
bool sanityCheckTeams() const;
// Create server teams based on machine teams
// Before the number of machine teams reaches the threshold, build a machine team for each server team
// When it reaches the threshold, first try to build a server team with existing machine teams; if failed,
@ -642,11 +641,6 @@ public:
bool removeTeam(Reference<TCTeamInfo> team);
// FIXME: Public for testing only
// Check if the server belongs to a machine; if not, create the machine.
// Establish the two-direction link between server and machine
Reference<TCMachineInfo> checkAndCreateMachine(Reference<TCServerInfo> server);
void removeTSS(UID removedServer);
void removeServer(UID removedServer);
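The header change above moves the test-only helpers (checkAndCreateMachine, constructMachinesFromServers, addBestMachineTeams, sanityCheckTeams, and the FIXME-marked setters) out of the public section; they remain reachable from tests through the friend class DDTeamCollectionUnitTest declaration. A standalone sketch of that friend-class-for-tests pattern, with hypothetical names:

#include <cassert>

class TeamCollection {
	friend class TeamCollectionUnitTest; // grants the unit test access to private helpers
	int machineCount = 0;
	int constructMachines() { return ++machineCount; } // private: not part of the production API
public:
	// ... production API only ...
};

class TeamCollectionUnitTest {
public:
	static void run() {
		TeamCollection tc;
		assert(tc.constructMachines() == 1); // legal because of the friend declaration
	}
};

int main() {
	TeamCollectionUnitTest::run();
}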

View File

@ -865,7 +865,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self,
}
bool ddEnabled = wait(isDataDistributionEnabled(cx, ddEnabledState));
TraceEvent("DataDistributionMoveKeysConflict").detail("DataDistributionEnabled", ddEnabled).error(err);
TraceEvent("DataDistributionMoveKeysConflict").error(err).detail("DataDistributionEnabled", ddEnabled);
if (ddEnabled) {
throw err;
}
@ -891,7 +891,7 @@ Future<Void> sendSnapReq(RequestStream<Req> stream, Req req, Error e) {
ErrorOr<REPLY_TYPE(Req)> reply = wait(stream.tryGetReply(req));
if (reply.isError()) {
TraceEvent("SnapDataDistributor_ReqError")
.error(reply.getError(), true)
.errorUnsuppressed(reply.getError())
.detail("ConvertedErrorType", e.what())
.detail("Peer", stream.getEndpoint().getPrimaryAddress());
throw e;
@ -1012,9 +1012,9 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
} catch (Error& err) {
state Error e = err;
TraceEvent("SnapDataDistributor_SnapReqExit")
.errorUnsuppressed(e)
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID)
.error(e, true /*includeCancelled */);
.detail("SnapUID", snapReq.snapUID);
if (e.code() == error_code_snap_storage_failed || e.code() == error_code_snap_tlog_failed ||
e.code() == error_code_operation_cancelled || e.code() == error_code_snap_disable_tlog_pop_failed) {
// enable tlog pop on local tlog nodes
@ -1072,9 +1072,9 @@ ACTOR Future<Void> ddSnapCreate(DistributorSnapRequest snapReq,
}
} catch (Error& e) {
TraceEvent("SnapDDCreateError")
.errorUnsuppressed(e)
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID)
.error(e, true /*includeCancelled */);
.detail("SnapUID", snapReq.snapUID);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
} else {
@ -1251,10 +1251,10 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
}
} catch (Error& err) {
if (normalDataDistributorErrors().count(err.code()) == 0) {
TraceEvent("DataDistributorError", di.id()).error(err, true);
TraceEvent("DataDistributorError", di.id()).errorUnsuppressed(err);
throw err;
}
TraceEvent("DataDistributorDied", di.id()).error(err, true);
TraceEvent("DataDistributorDied", di.id()).errorUnsuppressed(err);
}
return Void();

View File

@ -1265,10 +1265,12 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
}
}
} catch (Error& e) {
TraceEvent(relocateShardInterval.end(), distributorId).error(e, true).detail("Duration", now() - startTime);
TraceEvent(relocateShardInterval.end(), distributorId)
.errorUnsuppressed(e)
.detail("Duration", now() - startTime);
if (now() - startTime > 600) {
TraceEvent(SevWarnAlways, "RelocateShardTooLong")
.error(e, true)
.errorUnsuppressed(e)
.detail("Duration", now() - startTime)
.detail("Dest", describe(destIds))
.detail("Src", describe(rd.src));
@ -1540,8 +1542,8 @@ ACTOR Future<Void> BgDDMountainChopper(DDQueueData* self, int teamCollectionInde
traceEvent.detail("ResetCount", resetCount);
tr.reset();
} catch (Error& e) {
traceEvent.error(
e, true); // Log actor_cancelled because it's not legal to suppress an event that's initialized
// Log actor_cancelled because it's not legal to suppress an event that's initialized
traceEvent.errorUnsuppressed(e);
wait(tr.onError(e));
}
@ -1655,8 +1657,8 @@ ACTOR Future<Void> BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex)
traceEvent.detail("ResetCount", resetCount);
tr.reset();
} catch (Error& e) {
traceEvent.error(
e, true); // Log actor_cancelled because it's not legal to suppress an event that's initialized
// Log actor_cancelled because it's not legal to suppress an event that's initialized
traceEvent.errorUnsuppressed(e);
wait(tr.onError(e));
}

View File

@ -492,7 +492,9 @@ public:
delete pageMem;
TEST(true); // push error
TEST(2 == syncFiles.size()); // push spanning both files error
TraceEvent(SevError, "RDQPushAndCommitError", dbgid).error(e, true).detail("InitialFilename0", filename);
TraceEvent(SevError, "RDQPushAndCommitError", dbgid)
.errorUnsuppressed(e)
.detail("InitialFilename0", filename);
if (errorPromise.canBeSet())
errorPromise.sendError(e);
@ -612,7 +614,7 @@ public:
.detail("File0", self->filename(0));
} catch (Error& e) {
TraceEvent(SevError, "DiskQueueShutdownError", self->dbgid)
.error(e, true)
.errorUnsuppressed(e)
.detail("Reason", e.code() == error_code_platform_error ? "could not delete database" : "unknown");
error = e;
}
@ -731,7 +733,7 @@ public:
} catch (Error& e) {
bool ok = e.code() == error_code_file_not_found;
TraceEvent(ok ? SevInfo : SevError, "RDQReadFirstAndLastPagesError", self->dbgid)
.error(e, true)
.errorUnsuppressed(e)
.detail("File0Name", self->files[0].dbgFilename);
if (!self->error.isSet())
self->error.sendError(e);
@ -804,7 +806,7 @@ public:
} catch (Error& e) {
TEST(true); // Read next page error
TraceEvent(SevError, "RDQReadNextPageError", self->dbgid)
.error(e, true)
.errorUnsuppressed(e)
.detail("File0Name", self->files[0].dbgFilename);
if (!self->error.isSet())
self->error.sendError(e);

View File

@ -58,7 +58,7 @@ ACTOR Future<Void> encryptKeyProxyServer(EncryptKeyProxyInterface ekpInterface,
}
}
} catch (Error& e) {
TraceEvent("EKP_Terminated", ekpInterface.id()).error(e, true);
TraceEvent("EKP_Terminated", ekpInterface.id()).errorUnsuppressed(e);
}
return Void();

View File

@ -991,7 +991,7 @@ ACTOR Future<Void> grvProxyServer(GrvProxyInterface proxy,
state Future<Void> core = grvProxyServerCore(proxy, req.master, req.masterLifetime, db);
wait(core || checkRemoved(db, req.recoveryCount, proxy));
} catch (Error& e) {
TraceEvent("GrvProxyTerminated", proxy.id()).error(e, true);
TraceEvent("GrvProxyTerminated", proxy.id()).errorUnsuppressed(e);
if (e.code() != error_code_worker_removed && e.code() != error_code_tlog_stopped &&
e.code() != error_code_tlog_failed && e.code() != error_code_coordinators_changed &&

View File

@ -634,7 +634,7 @@ private:
} catch (Error& e) {
bool ok = e.code() == error_code_operation_cancelled || e.code() == error_code_file_not_found ||
e.code() == error_code_disk_adapter_reset;
TraceEvent(ok ? SevInfo : SevError, "ErrorDuringRecovery", dbgid).error(e, true);
TraceEvent(ok ? SevInfo : SevError, "ErrorDuringRecovery", dbgid).errorUnsuppressed(e);
if (e.code() != error_code_disk_adapter_reset) {
throw e;
}

View File

@ -11,6 +11,8 @@
#include <rocksdb/version.h>
#include <rocksdb/utilities/table_properties_collectors.h>
#include <rocksdb/rate_limiter.h>
#include <rocksdb/perf_context.h>
#include <rocksdb/c.h>
#if defined __has_include
#if __has_include(<liburing.h>)
#include <liburing.h>
@ -312,6 +314,271 @@ private:
uint64_t iteratorsReuseCount;
};
class PerfContextMetrics {
public:
PerfContextMetrics();
void reset();
void set(int index);
void log(bool ignoreZeroMetric);
private:
std::vector<std::tuple<const char*, int, std::vector<uint64_t>>> metrics;
uint64_t getRocksdbPerfcontextMetric(int metric);
};
PerfContextMetrics::PerfContextMetrics() {
metrics = {
{ "UserKeyComparisonCount", rocksdb_user_key_comparison_count, {} },
{ "BlockCacheHitCount", rocksdb_block_cache_hit_count, {} },
{ "BlockReadCount", rocksdb_block_read_count, {} },
{ "BlockReadByte", rocksdb_block_read_byte, {} },
{ "BlockReadTime", rocksdb_block_read_time, {} },
{ "BlockChecksumTime", rocksdb_block_checksum_time, {} },
{ "BlockDecompressTime", rocksdb_block_decompress_time, {} },
{ "GetReadBytes", rocksdb_get_read_bytes, {} },
{ "MultigetReadBytes", rocksdb_multiget_read_bytes, {} },
{ "IterReadBytes", rocksdb_iter_read_bytes, {} },
{ "InternalKeySkippedCount", rocksdb_internal_key_skipped_count, {} },
{ "InternalDeleteSkippedCount", rocksdb_internal_delete_skipped_count, {} },
{ "InternalRecentSkippedCount", rocksdb_internal_recent_skipped_count, {} },
{ "InternalMergeCount", rocksdb_internal_merge_count, {} },
{ "GetSnapshotTime", rocksdb_get_snapshot_time, {} },
{ "GetFromMemtableTime", rocksdb_get_from_memtable_time, {} },
{ "GetFromMemtableCount", rocksdb_get_from_memtable_count, {} },
{ "GetPostProcessTime", rocksdb_get_post_process_time, {} },
{ "GetFromOutputFilesTime", rocksdb_get_from_output_files_time, {} },
{ "SeekOnMemtableTime", rocksdb_seek_on_memtable_time, {} },
{ "SeekOnMemtableCount", rocksdb_seek_on_memtable_count, {} },
{ "NextOnMemtableCount", rocksdb_next_on_memtable_count, {} },
{ "PrevOnMemtableCount", rocksdb_prev_on_memtable_count, {} },
{ "SeekChildSeekTime", rocksdb_seek_child_seek_time, {} },
{ "SeekChildSeekCount", rocksdb_seek_child_seek_count, {} },
{ "SeekMinHeapTime", rocksdb_seek_min_heap_time, {} },
{ "SeekMaxHeapTime", rocksdb_seek_max_heap_time, {} },
{ "SeekInternalSeekTime", rocksdb_seek_internal_seek_time, {} },
{ "FindNextUserEntryTime", rocksdb_find_next_user_entry_time, {} },
{ "WriteWalTime", rocksdb_write_wal_time, {} },
{ "WriteMemtableTime", rocksdb_write_memtable_time, {} },
{ "WriteDelayTime", rocksdb_write_delay_time, {} },
{ "WritePreAndPostProcessTime", rocksdb_write_pre_and_post_process_time, {} },
{ "DbMutexLockNanos", rocksdb_db_mutex_lock_nanos, {} },
{ "DbConditionWaitNanos", rocksdb_db_condition_wait_nanos, {} },
{ "MergeOperatorTimeNanos", rocksdb_merge_operator_time_nanos, {} },
{ "ReadIndexBlockNanos", rocksdb_read_index_block_nanos, {} },
{ "ReadFilterBlockNanos", rocksdb_read_filter_block_nanos, {} },
{ "NewTableBlockIterNanos", rocksdb_new_table_block_iter_nanos, {} },
{ "NewTableIteratorNanos", rocksdb_new_table_iterator_nanos, {} },
{ "BlockSeekNanos", rocksdb_block_seek_nanos, {} },
{ "FindTableNanos", rocksdb_find_table_nanos, {} },
{ "BloomMemtableHitCount", rocksdb_bloom_memtable_hit_count, {} },
{ "BloomMemtableMissCount", rocksdb_bloom_memtable_miss_count, {} },
{ "BloomSstHitCount", rocksdb_bloom_sst_hit_count, {} },
{ "BloomSstMissCount", rocksdb_bloom_sst_miss_count, {} },
{ "KeyLockWaitTime", rocksdb_key_lock_wait_time, {} },
{ "KeyLockWaitCount", rocksdb_key_lock_wait_count, {} },
{ "EnvNewSequentialFileNanos", rocksdb_env_new_sequential_file_nanos, {} },
{ "EnvNewRandomAccessFileNanos", rocksdb_env_new_random_access_file_nanos, {} },
{ "EnvNewWritableFileNanos", rocksdb_env_new_writable_file_nanos, {} },
{ "EnvReuseWritableFileNanos", rocksdb_env_reuse_writable_file_nanos, {} },
{ "EnvNewRandomRwFileNanos", rocksdb_env_new_random_rw_file_nanos, {} },
{ "EnvNewDirectoryNanos", rocksdb_env_new_directory_nanos, {} },
{ "EnvFileExistsNanos", rocksdb_env_file_exists_nanos, {} },
{ "EnvGetChildrenNanos", rocksdb_env_get_children_nanos, {} },
{ "EnvGetChildrenFileAttributesNanos", rocksdb_env_get_children_file_attributes_nanos, {} },
{ "EnvDeleteFileNanos", rocksdb_env_delete_file_nanos, {} },
{ "EnvCreateDirNanos", rocksdb_env_create_dir_nanos, {} },
{ "EnvCreateDirIfMissingNanos", rocksdb_env_create_dir_if_missing_nanos, {} },
{ "EnvDeleteDirNanos", rocksdb_env_delete_dir_nanos, {} },
{ "EnvGetFileSizeNanos", rocksdb_env_get_file_size_nanos, {} },
{ "EnvGetFileModificationTimeNanos", rocksdb_env_get_file_modification_time_nanos, {} },
{ "EnvRenameFileNanos", rocksdb_env_rename_file_nanos, {} },
{ "EnvLinkFileNanos", rocksdb_env_link_file_nanos, {} },
{ "EnvLockFileNanos", rocksdb_env_lock_file_nanos, {} },
{ "EnvUnlockFileNanos", rocksdb_env_unlock_file_nanos, {} },
{ "EnvNewLoggerNanos", rocksdb_env_new_logger_nanos, {} },
};
for (auto& [name, metric, vals] : metrics) { // readers, then writer
for (int i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; i++) {
vals.push_back(0); // add reader
}
vals.push_back(0); // add writer
}
}
void PerfContextMetrics::reset() {
rocksdb::get_perf_context()->Reset();
}
void PerfContextMetrics::set(int index) {
for (auto& [name, metric, vals] : metrics) {
vals[index] = getRocksdbPerfcontextMetric(metric);
}
}
void PerfContextMetrics::log(bool ignoreZeroMetric) {
TraceEvent e("RocksDBPerfContextMetrics");
e.setMaxEventLength(20000);
for (auto& [name, metric, vals] : metrics) {
uint64_t s = 0;
for (auto& v : vals) {
s = s + v;
}
if (ignoreZeroMetric && s == 0)
continue;
e.detail("Sum" + (std::string)name, s);
for (int i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; i++) {
if (vals[i] != 0)
e.detail("RD" + std::to_string(i) + name, vals[i]);
}
if (vals[SERVER_KNOBS->ROCKSDB_READ_PARALLELISM] != 0)
e.detail("WR" + (std::string)name, vals[SERVER_KNOBS->ROCKSDB_READ_PARALLELISM]);
}
}
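PerfContextMetrics keeps one accumulator slot per read thread plus a final slot for the writer; log() emits the sum across all slots for each metric, then the non-zero per-thread values, and can skip metrics whose sum is zero. A small plain-C++ sketch of that aggregation shape (hypothetical output format; the real code emits TraceEvent details rather than printing):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// slots[0..N-2] hold reader-thread values, slots[N-1] holds the writer's value.
void logMetric(const std::string& name, const std::vector<uint64_t>& slots, bool ignoreZero) {
	uint64_t sum = 0;
	for (uint64_t v : slots)
		sum += v;
	if (ignoreZero && sum == 0)
		return; // this metric never fired on any thread
	std::printf("Sum%s=%llu\n", name.c_str(), (unsigned long long)sum);
	for (size_t i = 0; i + 1 < slots.size(); ++i)
		if (slots[i] != 0)
			std::printf("RD%zu%s=%llu\n", i, name.c_str(), (unsigned long long)slots[i]);
	if (!slots.empty() && slots.back() != 0)
		std::printf("WR%s=%llu\n", name.c_str(), (unsigned long long)slots.back());
}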
uint64_t PerfContextMetrics::getRocksdbPerfcontextMetric(int metric) {
switch (metric) {
case rocksdb_user_key_comparison_count:
return rocksdb::get_perf_context()->user_key_comparison_count;
case rocksdb_block_cache_hit_count:
return rocksdb::get_perf_context()->block_cache_hit_count;
case rocksdb_block_read_count:
return rocksdb::get_perf_context()->block_read_count;
case rocksdb_block_read_byte:
return rocksdb::get_perf_context()->block_read_byte;
case rocksdb_block_read_time:
return rocksdb::get_perf_context()->block_read_time;
case rocksdb_block_checksum_time:
return rocksdb::get_perf_context()->block_checksum_time;
case rocksdb_block_decompress_time:
return rocksdb::get_perf_context()->block_decompress_time;
case rocksdb_get_read_bytes:
return rocksdb::get_perf_context()->get_read_bytes;
case rocksdb_multiget_read_bytes:
return rocksdb::get_perf_context()->multiget_read_bytes;
case rocksdb_iter_read_bytes:
return rocksdb::get_perf_context()->iter_read_bytes;
case rocksdb_internal_key_skipped_count:
return rocksdb::get_perf_context()->internal_key_skipped_count;
case rocksdb_internal_delete_skipped_count:
return rocksdb::get_perf_context()->internal_delete_skipped_count;
case rocksdb_internal_recent_skipped_count:
return rocksdb::get_perf_context()->internal_recent_skipped_count;
case rocksdb_internal_merge_count:
return rocksdb::get_perf_context()->internal_merge_count;
case rocksdb_get_snapshot_time:
return rocksdb::get_perf_context()->get_snapshot_time;
case rocksdb_get_from_memtable_time:
return rocksdb::get_perf_context()->get_from_memtable_time;
case rocksdb_get_from_memtable_count:
return rocksdb::get_perf_context()->get_from_memtable_count;
case rocksdb_get_post_process_time:
return rocksdb::get_perf_context()->get_post_process_time;
case rocksdb_get_from_output_files_time:
return rocksdb::get_perf_context()->get_from_output_files_time;
case rocksdb_seek_on_memtable_time:
return rocksdb::get_perf_context()->seek_on_memtable_time;
case rocksdb_seek_on_memtable_count:
return rocksdb::get_perf_context()->seek_on_memtable_count;
case rocksdb_next_on_memtable_count:
return rocksdb::get_perf_context()->next_on_memtable_count;
case rocksdb_prev_on_memtable_count:
return rocksdb::get_perf_context()->prev_on_memtable_count;
case rocksdb_seek_child_seek_time:
return rocksdb::get_perf_context()->seek_child_seek_time;
case rocksdb_seek_child_seek_count:
return rocksdb::get_perf_context()->seek_child_seek_count;
case rocksdb_seek_min_heap_time:
return rocksdb::get_perf_context()->seek_min_heap_time;
case rocksdb_seek_max_heap_time:
return rocksdb::get_perf_context()->seek_max_heap_time;
case rocksdb_seek_internal_seek_time:
return rocksdb::get_perf_context()->seek_internal_seek_time;
case rocksdb_find_next_user_entry_time:
return rocksdb::get_perf_context()->find_next_user_entry_time;
case rocksdb_write_wal_time:
return rocksdb::get_perf_context()->write_wal_time;
case rocksdb_write_memtable_time:
return rocksdb::get_perf_context()->write_memtable_time;
case rocksdb_write_delay_time:
return rocksdb::get_perf_context()->write_delay_time;
case rocksdb_write_pre_and_post_process_time:
return rocksdb::get_perf_context()->write_pre_and_post_process_time;
case rocksdb_db_mutex_lock_nanos:
return rocksdb::get_perf_context()->db_mutex_lock_nanos;
case rocksdb_db_condition_wait_nanos:
return rocksdb::get_perf_context()->db_condition_wait_nanos;
case rocksdb_merge_operator_time_nanos:
return rocksdb::get_perf_context()->merge_operator_time_nanos;
case rocksdb_read_index_block_nanos:
return rocksdb::get_perf_context()->read_index_block_nanos;
case rocksdb_read_filter_block_nanos:
return rocksdb::get_perf_context()->read_filter_block_nanos;
case rocksdb_new_table_block_iter_nanos:
return rocksdb::get_perf_context()->new_table_block_iter_nanos;
case rocksdb_new_table_iterator_nanos:
return rocksdb::get_perf_context()->new_table_iterator_nanos;
case rocksdb_block_seek_nanos:
return rocksdb::get_perf_context()->block_seek_nanos;
case rocksdb_find_table_nanos:
return rocksdb::get_perf_context()->find_table_nanos;
case rocksdb_bloom_memtable_hit_count:
return rocksdb::get_perf_context()->bloom_memtable_hit_count;
case rocksdb_bloom_memtable_miss_count:
return rocksdb::get_perf_context()->bloom_memtable_miss_count;
case rocksdb_bloom_sst_hit_count:
return rocksdb::get_perf_context()->bloom_sst_hit_count;
case rocksdb_bloom_sst_miss_count:
return rocksdb::get_perf_context()->bloom_sst_miss_count;
case rocksdb_key_lock_wait_time:
return rocksdb::get_perf_context()->key_lock_wait_time;
case rocksdb_key_lock_wait_count:
return rocksdb::get_perf_context()->key_lock_wait_count;
case rocksdb_env_new_sequential_file_nanos:
return rocksdb::get_perf_context()->env_new_sequential_file_nanos;
case rocksdb_env_new_random_access_file_nanos:
return rocksdb::get_perf_context()->env_new_random_access_file_nanos;
case rocksdb_env_new_writable_file_nanos:
return rocksdb::get_perf_context()->env_new_writable_file_nanos;
case rocksdb_env_reuse_writable_file_nanos:
return rocksdb::get_perf_context()->env_reuse_writable_file_nanos;
case rocksdb_env_new_random_rw_file_nanos:
return rocksdb::get_perf_context()->env_new_random_rw_file_nanos;
case rocksdb_env_new_directory_nanos:
return rocksdb::get_perf_context()->env_new_directory_nanos;
case rocksdb_env_file_exists_nanos:
return rocksdb::get_perf_context()->env_file_exists_nanos;
case rocksdb_env_get_children_nanos:
return rocksdb::get_perf_context()->env_get_children_nanos;
case rocksdb_env_get_children_file_attributes_nanos:
return rocksdb::get_perf_context()->env_get_children_file_attributes_nanos;
case rocksdb_env_delete_file_nanos:
return rocksdb::get_perf_context()->env_delete_file_nanos;
case rocksdb_env_create_dir_nanos:
return rocksdb::get_perf_context()->env_create_dir_nanos;
case rocksdb_env_create_dir_if_missing_nanos:
return rocksdb::get_perf_context()->env_create_dir_if_missing_nanos;
case rocksdb_env_delete_dir_nanos:
return rocksdb::get_perf_context()->env_delete_dir_nanos;
case rocksdb_env_get_file_size_nanos:
return rocksdb::get_perf_context()->env_get_file_size_nanos;
case rocksdb_env_get_file_modification_time_nanos:
return rocksdb::get_perf_context()->env_get_file_modification_time_nanos;
case rocksdb_env_rename_file_nanos:
return rocksdb::get_perf_context()->env_rename_file_nanos;
case rocksdb_env_link_file_nanos:
return rocksdb::get_perf_context()->env_link_file_nanos;
case rocksdb_env_lock_file_nanos:
return rocksdb::get_perf_context()->env_lock_file_nanos;
case rocksdb_env_unlock_file_nanos:
return rocksdb::get_perf_context()->env_unlock_file_nanos;
case rocksdb_env_new_logger_nanos:
return rocksdb::get_perf_context()->env_new_logger_nanos;
default:
break;
}
return 0;
}
ACTOR Future<Void> refreshReadIteratorPool(std::shared_ptr<ReadIteratorPool> readIterPool) {
if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) {
loop {
@ -336,6 +603,7 @@ ACTOR Future<Void> flowLockLogger(const FlowLock* readLock, const FlowLock* fetc
}
ACTOR Future<Void> rocksDBMetricLogger(std::shared_ptr<rocksdb::Statistics> statistics,
std::shared_ptr<PerfContextMetrics> perfContextMetrics,
rocksdb::DB* db,
std::shared_ptr<ReadIteratorPool> readIterPool) {
state std::vector<std::tuple<const char*, uint32_t, uint64_t>> tickerStats = {
@ -431,6 +699,10 @@ ACTOR Future<Void> rocksDBMetricLogger(std::shared_ptr<rocksdb::Statistics> stat
stat = readIterPool->numTimesReadIteratorsReused();
e.detail("NumTimesReadIteratorsReused", stat - readIteratorPoolStats["NumTimesReadIteratorsReused"]);
readIteratorPoolStats["NumTimesReadIteratorsReused"] = stat;
if (SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE) {
perfContextMetrics->log(true);
}
}
}
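The metric logger reports cumulative RocksDB counters as deltas since the previous report (note the "stat - readIteratorPoolStats[...]" pattern above) and, when the perf-context knob is on, also flushes the per-thread perf-context slots. A plain-C++ sketch of the delta-since-last-report idea (hypothetical metric source):

#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

struct DeltaLogger {
	std::map<std::string, uint64_t> lastReported;

	// Counters are cumulative; report only how much each one grew since the last call.
	void report(const std::map<std::string, uint64_t>& current) {
		for (const auto& [name, value] : current) {
			uint64_t delta = value - lastReported[name];
			std::printf("%s=%llu\n", name.c_str(), (unsigned long long)delta);
			lastReported[name] = value;
		}
	}
};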
@ -458,6 +730,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
struct Writer : IThreadPoolReceiver {
DB& db;
UID id;
std::shared_ptr<rocksdb::RateLimiter> rateLimiter;
Reference<Histogram> commitLatencyHistogram;
@ -466,9 +739,16 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Reference<Histogram> writeHistogram;
Reference<Histogram> deleteCompactRangeHistogram;
std::shared_ptr<ReadIteratorPool> readIterPool;
std::shared_ptr<PerfContextMetrics> perfContextMetrics;
int threadIndex;
explicit Writer(DB& db, UID id, std::shared_ptr<ReadIteratorPool> readIterPool)
: db(db), id(id), readIterPool(readIterPool),
explicit Writer(DB& db,
UID id,
std::shared_ptr<ReadIteratorPool> readIterPool,
std::shared_ptr<PerfContextMetrics> perfContextMetrics,
int threadIndex)
: db(db), id(id), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics),
threadIndex(threadIndex),
rateLimiter(SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC > 0
? rocksdb::NewGenericRateLimiter(
SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC, // rate_bytes_per_sec
@ -491,7 +771,13 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Histogram::Unit::microseconds)),
deleteCompactRangeHistogram(Histogram::getHistogram(ROCKSDBSTORAGE_HISTOGRAM_GROUP,
ROCKSDB_DELETE_COMPACTRANGE_HISTOGRAM,
Histogram::Unit::microseconds)) {}
Histogram::Unit::microseconds)) {
if (SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE) {
// Enable perf context on the same thread as the db thread
rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
perfContextMetrics->reset();
}
}
~Writer() override {
if (db) {
@ -542,11 +828,11 @@ struct RocksDBKeyValueStore : IKeyValueStore {
// The current thread and the main thread are the same when the code runs in simulation.
// blockUntilReady() would deadlock in that case, so the metricsLogger is called directly.
a.metrics = rocksDBMetricLogger(options.statistics, db, readIterPool) &&
a.metrics = rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool) &&
flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
} else {
onMainThread([&] {
a.metrics = rocksDBMetricLogger(options.statistics, db, readIterPool) &&
a.metrics = rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool) &&
flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
return Future<bool>(true);
}).blockUntilReady();
@ -586,6 +872,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
}
};
void action(CommitAction& a) {
bool doPerfContextMetrics =
SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE &&
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE);
if (doPerfContextMetrics) {
perfContextMetrics->reset();
}
double commitBeginTime;
if (a.getHistograms) {
commitBeginTime = timer_monotonic();
@ -632,6 +924,9 @@ struct RocksDBKeyValueStore : IKeyValueStore {
commitActionHistogram->sampleSeconds(currTime - commitBeginTime);
commitLatencyHistogram->sampleSeconds(currTime - a.startTime);
}
if (doPerfContextMetrics) {
perfContextMetrics->set(threadIndex);
}
}
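Each Writer/Reader action decides up front whether to capture perf-context data for this particular operation: the knob must be enabled and a random draw must fall under ROCKSDB_PERFCONTEXT_SAMPLE_RATE; the counters are reset before the work and stored into the thread's slot afterwards. A standalone plain-C++ sketch of that sampling wrapper (names are illustrative, not the RocksDB API):

#include <functional>
#include <random>

struct SampledCapture {
	double sampleRate; // e.g. 0.01 captures roughly 1% of operations
	std::mt19937_64 rng{ std::random_device{}() };

	void runOperation(const std::function<void()>& resetMetrics,
	                  const std::function<void()>& storeMetrics,
	                  const std::function<void()>& op) {
		std::uniform_real_distribution<double> dist(0.0, 1.0);
		bool capture = dist(rng) < sampleRate; // decide before doing the work
		if (capture)
			resetMetrics();                    // start from clean counters
		op();                                  // the actual commit / read
		if (capture)
			storeMetrics();                    // publish into this thread's slot
	}
};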
struct CloseAction : TypedAction<Writer, CloseAction> {
@ -684,9 +979,14 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Reference<Histogram> readValueGetHistogram;
Reference<Histogram> readPrefixGetHistogram;
std::shared_ptr<ReadIteratorPool> readIterPool;
std::shared_ptr<PerfContextMetrics> perfContextMetrics;
int threadIndex;
explicit Reader(DB& db, std::shared_ptr<ReadIteratorPool> readIterPool)
: db(db), readIterPool(readIterPool),
explicit Reader(DB& db,
std::shared_ptr<ReadIteratorPool> readIterPool,
std::shared_ptr<PerfContextMetrics> perfContextMetrics,
int threadIndex)
: db(db), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics), threadIndex(threadIndex),
readRangeLatencyHistogram(Histogram::getHistogram(ROCKSDBSTORAGE_HISTOGRAM_GROUP,
ROCKSDB_READRANGE_LATENCY_HISTOGRAM,
Histogram::Unit::microseconds)),
@ -734,6 +1034,11 @@ struct RocksDBKeyValueStore : IKeyValueStore {
readValuePrefixTimeout = SERVER_KNOBS->ROCKSDB_READ_VALUE_PREFIX_TIMEOUT;
readRangeTimeout = SERVER_KNOBS->ROCKSDB_READ_RANGE_TIMEOUT;
}
if (SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE) {
// Enable perf context on the same thread as the db thread
rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
perfContextMetrics->reset();
}
}
void init() override {}
@ -752,6 +1057,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; }
};
void action(ReadValueAction& a) {
bool doPerfContextMetrics =
SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE &&
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE);
if (doPerfContextMetrics) {
perfContextMetrics->reset();
}
double readBeginTime = timer_monotonic();
if (a.getHistograms) {
readValueQueueWaitHistogram->sampleSeconds(readBeginTime - a.startTime);
@ -801,6 +1112,9 @@ struct RocksDBKeyValueStore : IKeyValueStore {
readValueActionHistogram->sampleSeconds(currTime - readBeginTime);
readValueLatencyHistogram->sampleSeconds(currTime - a.startTime);
}
if (doPerfContextMetrics) {
perfContextMetrics->set(threadIndex);
}
}
struct ReadValuePrefixAction : TypedAction<Reader, ReadValuePrefixAction> {
@ -818,6 +1132,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; }
};
void action(ReadValuePrefixAction& a) {
bool doPerfContextMetrics =
SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE &&
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE);
if (doPerfContextMetrics) {
perfContextMetrics->reset();
}
double readBeginTime = timer_monotonic();
if (a.getHistograms) {
readPrefixQueueWaitHistogram->sampleSeconds(readBeginTime - a.startTime);
@ -871,6 +1191,9 @@ struct RocksDBKeyValueStore : IKeyValueStore {
readPrefixActionHistogram->sampleSeconds(currTime - readBeginTime);
readPrefixLatencyHistogram->sampleSeconds(currTime - a.startTime);
}
if (doPerfContextMetrics) {
perfContextMetrics->set(threadIndex);
}
}
struct ReadRangeAction : TypedAction<Reader, ReadRangeAction>, FastAllocated<ReadRangeAction> {
@ -887,6 +1210,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
double getTimeEstimate() const override { return SERVER_KNOBS->READ_RANGE_TIME_ESTIMATE; }
};
void action(ReadRangeAction& a) {
bool doPerfContextMetrics =
SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE &&
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE);
if (doPerfContextMetrics) {
perfContextMetrics->reset();
}
double readBeginTime = timer_monotonic();
if (a.getHistograms) {
readRangeQueueWaitHistogram->sampleSeconds(readBeginTime - a.startTime);
@ -983,10 +1312,14 @@ struct RocksDBKeyValueStore : IKeyValueStore {
readRangeActionHistogram->sampleSeconds(currTime - readBeginTime);
readRangeLatencyHistogram->sampleSeconds(currTime - a.startTime);
}
if (doPerfContextMetrics) {
perfContextMetrics->set(threadIndex);
}
}
};
DB db = nullptr;
std::shared_ptr<PerfContextMetrics> perfContextMetrics;
std::string path;
UID id;
Reference<IThreadPool> writeThread;
@ -1015,7 +1348,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Counters counters;
explicit RocksDBKeyValueStore(const std::string& path, UID id)
: path(path), id(id), readIterPool(new ReadIteratorPool(db, path)),
: path(path), id(id), perfContextMetrics(new PerfContextMetrics()), readIterPool(new ReadIteratorPool(db, path)),
readSemaphore(SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
fetchSemaphore(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX),
numReadWaiters(SERVER_KNOBS->ROCKSDB_READ_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
@ -1038,10 +1371,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
writeThread = createGenericThreadPool();
readThreads = createGenericThreadPool();
}
writeThread->addThread(new Writer(db, id, readIterPool), "fdb-rocksdb-wr");
writeThread->addThread(
new Writer(db, id, readIterPool, perfContextMetrics, SERVER_KNOBS->ROCKSDB_READ_PARALLELISM),
"fdb-rocksdb-wr");
TraceEvent("RocksDBReadThreads").detail("KnobRocksDBReadParallelism", SERVER_KNOBS->ROCKSDB_READ_PARALLELISM);
for (unsigned i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; ++i) {
readThreads->addThread(new Reader(db, readIterPool), "fdb-rocksdb-re");
readThreads->addThread(new Reader(db, readIterPool, perfContextMetrics, i), "fdb-rocksdb-re");
}
}

View File

@ -19,6 +19,7 @@
*/
#define SQLITE_THREADSAFE 0 // also in sqlite3.amalgamation.c!
#include "contrib/fmt-8.0.1/include/fmt/format.h"
#include "flow/crc32c.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/CoroFlow.h"
@ -2061,8 +2062,8 @@ private:
}
} catch (Error& e) {
TraceEvent(SevError, "KVDoCloseError", self->logID)
.errorUnsuppressed(e)
.detail("Filename", self->filename)
.error(e, true)
.detail("Reason", e.code() == error_code_platform_error ? "could not delete database" : "unknown");
error = e;
}
@ -2359,7 +2360,7 @@ ACTOR Future<Void> KVFileDump(std::string filename) {
k = keyAfter(kv[kv.size() - 1].key);
}
fflush(stdout);
fprintf(stderr, "Counted: %ld\n", count);
fmt::print(stderr, "Counted: {}\n", count);
if (store->getError().isError())
wait(store->getError());
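The fprintf("%ld") call above risks a format-specifier mismatch depending on the platform's width of the count variable; fmt::print deduces the argument type, so no specifier is needed. A tiny example assuming the bundled fmt 8.0.1 header included at the top of this file:

#include "contrib/fmt-8.0.1/include/fmt/format.h"
#include <cstdint>

int main() {
	int64_t count = 12345;
	// fmt deduces the argument type; there is no %ld / %lld mismatch to get wrong.
	fmt::print(stderr, "Counted: {}\n", count);
}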

View File

@ -515,6 +515,8 @@ Future<Void> logRouterPeekMessages(PromiseType replyPromise,
wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask()));
}
state double startTime = now();
Version poppedVer = poppedVersion(self, reqTag);
if (poppedVer > reqBegin || reqBegin < self->startVersion) {
@ -535,8 +537,33 @@ Future<Void> logRouterPeekMessages(PromiseType replyPromise,
return Void();
}
Version endVersion = self->version.get() + 1;
peekMessagesFromMemory(self, reqTag, reqBegin, messages, endVersion);
state Version endVersion;
// Run the peek logic in a loop to account for the case where there is no data to return to the caller, in which
// case we may want to wait briefly instead of immediately sending back an empty message. This behavior is
// controlled by a knob.
loop {
endVersion = self->version.get() + 1;
peekMessagesFromMemory(self, reqTag, reqBegin, messages, endVersion);
// Reply to the peek request when
// - there is data to return to the caller, or
// - batching of empty peeks is disabled, or
// - the empty-peek batching interval has been reached.
if (messages.getLength() > 0 || !SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG ||
now() - startTime > SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG_INTERVAL) {
break;
}
state Version waitUntilVersion = self->version.get() + 1;
// Currently, everything from `reqBegin` to self->version is an empty peek. Wait for more versions, or until the
// empty-peek batching interval has expired.
wait(self->version.whenAtLeast(waitUntilVersion) ||
delay(SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG_INTERVAL - (now() - startTime)));
if (self->version.get() < waitUntilVersion) {
break; // We know that everything from `reqBegin` to self->version is empty. Skip re-executing the peek
// logic.
}
}
TLogPeekReply reply;
reply.maxKnownVersion = self->version.get();
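The new loop above delays an empty reply: if nothing is available and empty-peek batching is enabled, it waits for either a newer version or the batching interval before answering, which reduces the volume of empty peek round trips. A standalone sketch of the same idea using plain C++ primitives instead of flow (hypothetical names; not the implementation above):

#include <chrono>
#include <condition_variable>
#include <mutex>
#include <string>

struct PeekSource {
	std::mutex m;
	std::condition_variable cv;
	long version = 0;
	std::string pending; // data that has arrived but not yet been peeked

	// Reply immediately if there is data; otherwise wait up to maxWait for a newer
	// version before returning a (possibly still empty) reply.
	std::string peek(std::chrono::milliseconds maxWait) {
		std::unique_lock<std::mutex> lk(m);
		if (pending.empty()) {
			long waitUntilVersion = version + 1;
			cv.wait_for(lk, maxWait, [&] { return version >= waitUntilVersion; });
		}
		std::string reply;
		reply.swap(pending); // empty if the interval expired with no new data
		return reply;
	}

	void append(const std::string& data) {
		{
			std::lock_guard<std::mutex> lk(m);
			pending += data;
			++version;
		}
		cv.notify_all();
	}
};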
@ -600,8 +627,8 @@ ACTOR Future<Void> logRouterPeekStream(LogRouterData* self, TLogPeekStreamReques
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", self->dbgid)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
.errorUnsuppressed(e)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress());
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
@ -737,7 +764,7 @@ ACTOR Future<Void> logRouter(TLogInterface interf,
}
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled || e.code() == error_code_worker_removed) {
TraceEvent("LogRouterTerminated", interf.id()).error(e, true);
TraceEvent("LogRouterTerminated", interf.id()).errorUnsuppressed(e);
return Void();
}
throw;

View File

@ -367,7 +367,7 @@ ACTOR Future<Void> serverPeekStreamGetMore(ILogSystem::ServerPeekCursor* self, T
}
}
} catch (Error& e) {
DisabledTraceEvent(SevDebug, "SPC_GetMoreB_Error", self->randomID).error(e, true);
DisabledTraceEvent(SevDebug, "SPC_GetMoreB_Error", self->randomID).errorUnsuppressed(e);
if (e.code() == error_code_connection_failed || e.code() == error_code_operation_obsolete) {
// NOTE: delay in order to avoid the endless retry loop block other tasks
self->peekReplyStream.reset();

View File

@ -558,7 +558,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
.detail("Shards", shards)
.detail("MaxRetries", maxRetries);
} catch (Error& e) {
TraceEvent(SevDebug, interval.end(), relocationIntervalId).error(e, true);
TraceEvent(SevDebug, interval.end(), relocationIntervalId).errorUnsuppressed(e);
throw;
}
@ -992,7 +992,7 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
TraceEvent(SevDebug, interval.end(), relocationIntervalId);
} catch (Error& e) {
TraceEvent(SevDebug, interval.end(), relocationIntervalId).error(e, true);
TraceEvent(SevDebug, interval.end(), relocationIntervalId).errorUnsuppressed(e);
throw;
}
return Void();
@ -1151,7 +1151,7 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer(Database cx, StorageServe
tr->addReadConflictRange(conflictRange);
tr->addWriteConflictRange(conflictRange);
StorageMetadataType metadata(timer_int());
StorageMetadataType metadata(StorageMetadataType::currentTime());
metadataMap.set(tr, server.id(), metadata);
if (SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING) {
@ -1521,7 +1521,7 @@ void seedShardServers(Arena& arena, CommitTransactionRef& tr, std::vector<Storag
tr.read_conflict_ranges.push_back_deep(arena, allKeys);
KeyBackedObjectMap<UID, StorageMetadataType, decltype(IncludeVersion())> metadataMap(serverMetadataKeys.begin,
IncludeVersion());
StorageMetadataType metadata(timer_int());
StorageMetadataType metadata(StorageMetadataType::currentTime());
for (auto& s : servers) {
tr.set(arena, serverTagKeyFor(s.id()), serverTagValue(server_tag[s.id()]));

View File

@ -1161,8 +1161,8 @@ ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
.errorUnsuppressed(e)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress());
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
@ -1646,7 +1646,7 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
wait(error);
throw internal_error();
} catch (Error& e) {
TraceEvent("TLogError", tlogId).error(e, true);
TraceEvent("TLogError", tlogId).errorUnsuppressed(e);
for (auto& it : self.id_data) {
if (it.second->recoverySuccessful.canBeSet()) {

View File

@ -1479,8 +1479,8 @@ ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
.errorUnsuppressed(e)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress());
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
@ -1912,7 +1912,7 @@ ACTOR Future<Void> tLogSnapCreate(TLogSnapRequest snapReq, TLogData* self, Refer
}
snapReq.reply.send(Void());
} catch (Error& e) {
TraceEvent("TLogSnapCreateError").error(e, true /*includeCancelled */);
TraceEvent("TLogSnapCreateError").errorUnsuppressed(e);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
} else {
@ -2555,7 +2555,7 @@ bool tlogTerminated(TLogData* self, IKeyValueStore* persistentData, TLogQueue* p
if (e.code() == error_code_worker_removed || e.code() == error_code_recruitment_failed ||
e.code() == error_code_file_not_found) {
TraceEvent("TLogTerminated", self->dbgid).error(e, true);
TraceEvent("TLogTerminated", self->dbgid).errorUnsuppressed(e);
return true;
} else
return false;
@ -2848,7 +2848,7 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
}
} catch (Error& e) {
self.terminated.send(Void());
TraceEvent("TLogError", tlogId).error(e, true);
TraceEvent("TLogError", tlogId).errorUnsuppressed(e);
if (recovered.canBeSet())
recovered.send(Void());

View File

@ -1908,8 +1908,8 @@ ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
.errorUnsuppressed(e)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress());
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
@ -2357,7 +2357,7 @@ ACTOR Future<Void> tLogSnapCreate(TLogSnapRequest snapReq, TLogData* self, Refer
}
snapReq.reply.send(Void());
} catch (Error& e) {
TraceEvent("TLogExecHelperError").error(e, true /*includeCancelled */);
TraceEvent("TLogExecHelperError").errorUnsuppressed(e);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
} else {
@ -3038,7 +3038,7 @@ bool tlogTerminated(TLogData* self, IKeyValueStore* persistentData, TLogQueue* p
if (e.code() == error_code_worker_removed || e.code() == error_code_recruitment_failed ||
e.code() == error_code_file_not_found) {
TraceEvent("TLogTerminated", self->dbgid).error(e, true);
TraceEvent("TLogTerminated", self->dbgid).errorUnsuppressed(e);
return true;
} else
return false;
@ -3336,7 +3336,7 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
}
} catch (Error& e) {
self.terminated.send(Void());
TraceEvent("TLogError", tlogId).error(e, true);
TraceEvent("TLogError", tlogId).errorUnsuppressed(e);
if (recovered.canBeSet())
recovered.send(Void());

View File

@ -113,7 +113,7 @@ struct ProxyStats {
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
commitLatencyBands("CommitLatencyMetrics", id, SERVER_KNOBS->STORAGE_LOGGING_DELAY),
commitLatencyBands("CommitLatencyBands", id, SERVER_KNOBS->STORAGE_LOGGING_DELAY),
commitBatchingEmptyMessageRatio("CommitBatchingEmptyMessageRatio",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,

View File

@ -158,8 +158,9 @@ ACTOR Future<std::vector<WorkerInterface>> getCoordWorkers(Database cx,
if (!coordinators.present()) {
throw operation_failed();
}
std::vector<NetworkAddress> coordinatorsAddr =
ClusterConnectionString(coordinators.get().toString()).coordinators();
state ClusterConnectionString ccs(coordinators.get().toString());
wait(ccs.resolveHostnames());
std::vector<NetworkAddress> coordinatorsAddr = ccs.coordinators();
std::set<NetworkAddress> coordinatorsAddrSet;
for (const auto& addr : coordinatorsAddr) {
TraceEvent(SevDebug, "CoordinatorAddress").detail("Addr", addr);
@ -731,7 +732,7 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
}
}
} catch (Error& e) {
TraceEvent(("QuietDatabase" + phase + "Error").c_str()).error(e, true);
TraceEvent(("QuietDatabase" + phase + "Error").c_str()).errorUnsuppressed(e);
if (e.code() != error_code_actor_cancelled && e.code() != error_code_attribute_not_found &&
e.code() != error_code_timed_out)
TraceEvent(("QuietDatabase" + phase + "Error").c_str()).error(e);

File diff suppressed because it is too large

fdbserver/Ratekeeper.h (new file, 207 lines)
View File

@ -0,0 +1,207 @@
/*
* Ratekeeper.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbrpc/Smoother.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/RatekeeperInterface.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/TLogInterface.h"
enum limitReason_t {
unlimited, // TODO: rename to workload?
storage_server_write_queue_size, // 1
storage_server_write_bandwidth_mvcc,
storage_server_readable_behind,
log_server_mvcc_write_bandwidth,
log_server_write_queue, // 5
storage_server_min_free_space, // a storage server's normal limits are being reduced by low free space
storage_server_min_free_space_ratio, // a storage server's normal limits are being reduced by a low free space ratio
log_server_min_free_space,
log_server_min_free_space_ratio,
storage_server_durability_lag, // 10
storage_server_list_fetch_failed,
limitReason_t_end
};
struct StorageQueueInfo {
bool valid;
UID id;
LocalityData locality;
StorageQueuingMetricsReply lastReply;
StorageQueuingMetricsReply prevReply;
Smoother smoothDurableBytes, smoothInputBytes, verySmoothDurableBytes;
Smoother smoothDurableVersion, smoothLatestVersion;
Smoother smoothFreeSpace;
Smoother smoothTotalSpace;
limitReason_t limitReason;
Optional<TransactionTag> busiestReadTag, busiestWriteTag;
double busiestReadTagFractionalBusyness = 0, busiestWriteTagFractionalBusyness = 0;
double busiestReadTagRate = 0, busiestWriteTagRate = 0;
Reference<EventCacheHolder> busiestWriteTagEventHolder;
// refresh periodically
TransactionTagMap<TransactionCommitCostEstimation> tagCostEst;
uint64_t totalWriteCosts = 0;
int totalWriteOps = 0;
StorageQueueInfo(UID id, LocalityData locality)
: valid(false), id(id), locality(locality), smoothDurableBytes(SERVER_KNOBS->SMOOTHING_AMOUNT),
smoothInputBytes(SERVER_KNOBS->SMOOTHING_AMOUNT), verySmoothDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT),
smoothDurableVersion(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothLatestVersion(SERVER_KNOBS->SMOOTHING_AMOUNT),
smoothFreeSpace(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalSpace(SERVER_KNOBS->SMOOTHING_AMOUNT),
limitReason(limitReason_t::unlimited),
busiestWriteTagEventHolder(makeReference<EventCacheHolder>(id.toString() + "/BusiestWriteTag")) {
// FIXME: this is a tacky workaround for a potential uninitialized use in trackStorageServerQueueInfo
lastReply.instanceID = -1;
}
};
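StorageQueueInfo feeds its byte and version counters through Smoother instances parameterized by SMOOTHING_AMOUNT / SLOW_SMOOTHING_AMOUNT, so Ratekeeper reacts to trends rather than instantaneous spikes. As a rough illustration only — this is an assumption about the kind of smoothing fdbrpc's Smoother provides, not its actual implementation — a time-constant exponential smoother looks like:

#include <cmath>

class SimpleSmoother {
	double timeConstant; // larger means smoother output that reacts more slowly
	double value = 0;
	double lastTime = 0;
public:
	explicit SimpleSmoother(double timeConstant) : timeConstant(timeConstant) {}
	void setTotal(double total, double now) {
		// Move the smoothed value toward the new total, weighted by elapsed time.
		double alpha = 1.0 - std::exp(-(now - lastTime) / timeConstant);
		value += alpha * (total - value);
		lastTime = now;
	}
	double smoothed() const { return value; }
};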
struct TLogQueueInfo {
bool valid;
UID id;
TLogQueuingMetricsReply lastReply;
TLogQueuingMetricsReply prevReply;
Smoother smoothDurableBytes, smoothInputBytes, verySmoothDurableBytes;
Smoother smoothFreeSpace;
Smoother smoothTotalSpace;
TLogQueueInfo(UID id)
: valid(false), id(id), smoothDurableBytes(SERVER_KNOBS->SMOOTHING_AMOUNT),
smoothInputBytes(SERVER_KNOBS->SMOOTHING_AMOUNT), verySmoothDurableBytes(SERVER_KNOBS->SLOW_SMOOTHING_AMOUNT),
smoothFreeSpace(SERVER_KNOBS->SMOOTHING_AMOUNT), smoothTotalSpace(SERVER_KNOBS->SMOOTHING_AMOUNT) {
// FIXME: this is a tacky workaround for a potential uninitialized use in trackTLogQueueInfo (copied from
// StorageQueueInfo)
lastReply.instanceID = -1;
}
};
struct RatekeeperLimits {
double tpsLimit;
Int64MetricHandle tpsLimitMetric;
Int64MetricHandle reasonMetric;
int64_t storageTargetBytes;
int64_t storageSpringBytes;
int64_t logTargetBytes;
int64_t logSpringBytes;
double maxVersionDifference;
int64_t durabilityLagTargetVersions;
int64_t lastDurabilityLag;
double durabilityLagLimit;
TransactionPriority priority;
std::string context;
Reference<EventCacheHolder> rkUpdateEventCacheHolder;
RatekeeperLimits(TransactionPriority priority,
std::string context,
int64_t storageTargetBytes,
int64_t storageSpringBytes,
int64_t logTargetBytes,
int64_t logSpringBytes,
double maxVersionDifference,
int64_t durabilityLagTargetVersions)
: tpsLimit(std::numeric_limits<double>::infinity()), tpsLimitMetric(StringRef("Ratekeeper.TPSLimit" + context)),
reasonMetric(StringRef("Ratekeeper.Reason" + context)), storageTargetBytes(storageTargetBytes),
storageSpringBytes(storageSpringBytes), logTargetBytes(logTargetBytes), logSpringBytes(logSpringBytes),
maxVersionDifference(maxVersionDifference),
durabilityLagTargetVersions(
durabilityLagTargetVersions +
SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS), // The read transaction life versions are expected to not
// be durable on the storage servers
lastDurabilityLag(0), durabilityLagLimit(std::numeric_limits<double>::infinity()), priority(priority),
context(context), rkUpdateEventCacheHolder(makeReference<EventCacheHolder>("RkUpdate" + context)) {}
};
class Ratekeeper {
friend class RatekeeperImpl;
// Differentiate from GrvProxyInfo in DatabaseContext.h
struct GrvProxyInfo {
int64_t totalTransactions;
int64_t batchTransactions;
uint64_t lastThrottledTagChangeId;
double lastUpdateTime;
double lastTagPushTime;
GrvProxyInfo()
: totalTransactions(0), batchTransactions(0), lastThrottledTagChangeId(0), lastUpdateTime(0),
lastTagPushTime(0) {}
};
UID id;
Database db;
Map<UID, StorageQueueInfo> storageQueueInfo;
Map<UID, TLogQueueInfo> tlogQueueInfo;
std::map<UID, Ratekeeper::GrvProxyInfo> grvProxyInfo;
Smoother smoothReleasedTransactions, smoothBatchReleasedTransactions, smoothTotalDurableBytes;
HealthMetrics healthMetrics;
DatabaseConfiguration configuration;
PromiseStream<Future<Void>> addActor;
Int64MetricHandle actualTpsMetric;
double lastWarning;
double lastSSListFetchedTimestamp;
std::unique_ptr<class TagThrottler> tagThrottler;
RatekeeperLimits normalLimits;
RatekeeperLimits batchLimits;
Deque<double> actualTpsHistory;
Optional<Key> remoteDC;
Future<Void> expiredTagThrottleCleanup;
double lastBusiestCommitTagPick;
Ratekeeper(UID id, Database db);
Future<Void> configurationMonitor();
void updateCommitCostEstimation(UIDTransactionTagMap<TransactionCommitCostEstimation> const& costEstimation);
void updateRate(RatekeeperLimits* limits);
Future<Void> refreshStorageServerCommitCost();
Future<Void> monitorServerListChange(PromiseStream<std::pair<UID, Optional<StorageServerInterface>>> serverChanges);
Future<Void> trackEachStorageServer(FutureStream<std::pair<UID, Optional<StorageServerInterface>>> serverChanges);
// SOMEDAY: template trackStorageServerQueueInfo and trackTLogQueueInfo into one function
Future<Void> trackStorageServerQueueInfo(StorageServerInterface);
Future<Void> trackTLogQueueInfo(TLogInterface);
void tryAutoThrottleTag(TransactionTag, double rate, double busyness, TagThrottledReason);
void tryAutoThrottleTag(StorageQueueInfo&, int64_t storageQueue, int64_t storageDurabilityLag);
Future<Void> monitorThrottlingChanges();
public:
static Future<Void> run(RatekeeperInterface rkInterf, Reference<AsyncVar<ServerDBInfo> const> dbInfo);
};
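The new Ratekeeper class keeps all of its state private, grants access to the actor bodies through the friend class RatekeeperImpl declaration, and exposes a single static run() entry point. A compilable sketch of that shape with ordinary functions standing in for actors (hypothetical names):

#include <iostream>

class Service {
	friend class ServiceImpl; // actor-style bodies live in the Impl and may touch private state
	int ticks = 0;
	Service() = default;      // construction is controlled by run()
public:
	static void run();
};

class ServiceImpl {
public:
	static void mainLoop(Service& self) {
		for (int i = 0; i < 3; ++i)
			++self.ticks;     // allowed: ServiceImpl is a friend of Service
		std::cout << "ticks=" << self.ticks << "\n";
	}
};

void Service::run() {
	Service self;
	ServiceImpl::mainLoop(self);
}

int main() {
	Service::run();
}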

View File

@ -373,7 +373,7 @@ ACTOR Future<Void> resolver(ResolverInterface resolver,
}
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled || e.code() == error_code_worker_removed) {
TraceEvent("ResolverTerminated", resolver.id()).error(e, true);
TraceEvent("ResolverTerminated", resolver.id()).errorUnsuppressed(e);
return Void();
}
throw;

View File

@ -98,8 +98,8 @@ ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int
} catch (Error& e) {
bool isError = e.code() != error_code_operation_cancelled;
TraceEvent(isError ? SevError : SevWarnAlways, "FastRestoreApplierError", self->id())
.detail("RequestType", requestTypeStr)
.error(e, true);
.errorUnsuppressed(e)
.detail("RequestType", requestTypeStr);
actors.clear(false);
break;
}
@ -251,9 +251,9 @@ ACTOR static Future<Void> applyClearRangeMutations(Standalone<VectorRef<KeyRange
retries++;
if (retries > SERVER_KNOBS->FASTRESTORE_TXN_RETRY_MAX) {
TraceEvent(SevWarnAlways, "RestoreApplierApplyClearRangeMutationsStuck", applierID)
.error(e)
.detail("BatchIndex", batchIndex)
.detail("ClearRanges", ranges.size())
.error(e);
.detail("ClearRanges", ranges.size());
}
wait(tr->onError(e));
}
@ -314,11 +314,13 @@ ACTOR static Future<Void> getAndComputeStagingKeys(
} catch (Error& e) {
cc->fetchTxnRetries += 1;
if (retries++ > incompleteStagingKeys.size()) {
TraceEvent(SevWarnAlways, "GetAndComputeStagingKeys", applierID)
.suppressFor(1.0)
.detail("RandomUID", randomID)
.detail("BatchIndex", batchIndex)
.error(e);
if (e.code() != error_code_actor_cancelled) {
TraceEvent(SevWarnAlways, "GetAndComputeStagingKeys", applierID)
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("RandomUID", randomID)
.detail("BatchIndex", batchIndex);
}
}
wait(tr->onError(e));
}

View File

@ -136,7 +136,7 @@ ACTOR Future<Void> startRestoreController(Reference<RestoreWorkerData> controlle
wait(startProcessRestoreRequests(self, cx) || error);
} catch (Error& e) {
if (e.code() != error_code_operation_cancelled) {
TraceEvent(SevError, "FastRestoreControllerStart").detail("Reason", "Unexpected unhandled error").error(e);
TraceEvent(SevError, "FastRestoreControllerStart").error(e).detail("Reason", "Unexpected unhandled error");
}
}

View File

@ -224,7 +224,7 @@ ACTOR Future<Void> dispatchRequests(Reference<RestoreLoaderData> self) {
}
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent(SevError, "FastRestoreLoaderDispatchRequests").error(e, true);
TraceEvent(SevError, "FastRestoreLoaderDispatchRequests").errorUnsuppressed(e);
throw e;
}
}
@ -301,8 +301,8 @@ ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf,
} catch (Error& e) {
bool isError = e.code() != error_code_operation_cancelled; // == error_code_broken_promise
TraceEvent(isError ? SevError : SevWarnAlways, "FastRestoreLoaderError", self->id())
.detail("RequestType", requestTypeStr)
.error(e, true);
.errorUnsuppressed(e)
.detail("RequestType", requestTypeStr);
actors.clear(false);
break;
}
@ -513,8 +513,8 @@ ACTOR static Future<Void> parsePartitionedLogFileOnLoader(
e.code() == error_code_timed_out || e.code() == error_code_lookup_failed) {
// blob http request failure, retry
TraceEvent(SevWarnAlways, "FastRestoreDecodedPartitionedLogFileConnectionFailure")
.detail("Retries", ++readFileRetries)
.error(e);
.error(e)
.detail("Retries", ++readFileRetries);
wait(delayJittered(0.1));
} else {
TraceEvent(SevError, "FastRestoreParsePartitionedLogFileOnLoaderUnexpectedError").error(e);
@ -659,10 +659,10 @@ ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<R
} catch (Error& e) { // In case ci.samples throws broken_promise due to unstable network
if (e.code() == error_code_broken_promise || e.code() == error_code_operation_cancelled) {
TraceEvent(SevWarnAlways, "FastRestoreLoaderPhaseLoadFileSendSamples")
.detail("SamplesMessages", samplesMessages)
.error(e, true);
.errorUnsuppressed(e)
.detail("SamplesMessages", samplesMessages);
} else {
TraceEvent(SevError, "FastRestoreLoaderPhaseLoadFileSendSamplesUnexpectedError").error(e, true);
TraceEvent(SevError, "FastRestoreLoaderPhaseLoadFileSendSamplesUnexpectedError").errorUnsuppressed(e);
}
}
@ -1230,8 +1230,8 @@ ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(
e.code() == error_code_timed_out || e.code() == error_code_lookup_failed) {
// blob http request failure, retry
TraceEvent(SevWarnAlways, "FastRestoreDecodedRangeFileConnectionFailure")
.detail("Retries", ++readFileRetries)
.error(e);
.error(e)
.detail("Retries", ++readFileRetries);
wait(delayJittered(0.1));
} else {
TraceEvent(SevError, "FastRestoreParseRangeFileOnLoaderUnexpectedError").error(e);
@ -1355,8 +1355,8 @@ ACTOR static Future<Void> parseLogFileToMutationsOnLoader(NotifiedVersion* pProc
e.code() == error_code_timed_out || e.code() == error_code_lookup_failed) {
// blob http request failure, retry
TraceEvent(SevWarnAlways, "FastRestoreDecodedLogFileConnectionFailure")
.detail("Retries", ++readFileRetries)
.error(e);
.error(e)
.detail("Retries", ++readFileRetries);
wait(delayJittered(0.1));
} else {
TraceEvent(SevError, "FastRestoreParseLogFileToMutationsOnLoaderUnexpectedError").error(e);

View File

@ -264,7 +264,7 @@ ACTOR Future<Void> startRestoreWorker(Reference<RestoreWorkerData> self, Restore
}
}
} catch (Error& e) {
TraceEvent(SevWarn, "FastRestoreWorkerError").detail("RequestType", requestTypeStr).error(e, true);
TraceEvent(SevWarn, "FastRestoreWorkerError").errorUnsuppressed(e).detail("RequestType", requestTypeStr);
break;
}
}

View File

@ -601,7 +601,7 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
? SevInfo
: SevError,
"SimulatedFDBDTerminated")
.error(e, true)
.errorUnsuppressed(e)
.detail("ZoneId", localities.zoneId());
}
@ -617,7 +617,7 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
onShutdown = ISimulator::InjectFaults;
} catch (Error& e) {
TraceEvent(destructed ? SevInfo : SevError, "SimulatedFDBDRebooterError")
.error(e, true)
.errorUnsuppressed(e)
.detail("ZoneId", localities.zoneId())
.detail("RandomId", randomId);
onShutdown = e;
@ -1905,8 +1905,8 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
TEST(useIPv6); // Use IPv6
TEST(!useIPv6); // Use IPv4
// TODO(renxuan): Use hostname 25% of the time, unless it is disabled
bool useHostname = false; // !testConfig.disableHostname && deterministicRandom()->random01() < 0.25;
// Use hostname 25% of the time, unless it is disabled
bool useHostname = !testConfig.disableHostname && deterministicRandom()->random01() < 0.25;
TEST(useHostname); // Use hostname
TEST(!useHostname); // Use IP address
NetworkAddressFromHostname fromHostname =

View File

@ -1038,7 +1038,7 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
if (ssLag[address] >= 60) {
messages.push_back(JsonString::makeMessage(
"storage_server_lagging",
format("Storage server lagging by %ld seconds.", (int64_t)ssLag[address]).c_str()));
format("Storage server lagging by %lld seconds.", (int64_t)ssLag[address]).c_str()));
}
// Store the message array into the status object that represents the worker process

View File

@ -1375,7 +1375,7 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
break;
} catch (Error& e) {
TraceEvent("SCFKBlockFail", data->thisServerID)
.error(e, true)
.errorUnsuppressed(e)
.suppressFor(1.0)
.detail("FKID", interval.pairID);
if (e.code() == error_code_transaction_too_old) {
@ -1507,7 +1507,7 @@ ACTOR Future<Void> fetchKeys(StorageCacheData* data, AddingCacheRange* cacheRang
// TraceEvent(SevDebug, interval.end(), data->thisServerID);
} catch (Error& e) {
// TraceEvent(SevDebug, interval.end(), data->thisServerID).error(e, true).detail("Version", data->version.get());
// TraceEvent(SevDebug, interval.end(), data->thisServerID).errorUnsuppressed(e).detail("Version", data->version.get());
// TODO define the shuttingDown state of cache server
if (e.code() == error_code_actor_cancelled &&

View File

@ -137,6 +137,23 @@ TCServerInfo::TCServerInfo(StorageServerInterface ssi,
}
}
bool TCServerInfo::hasHealthyAvailableSpace(double minAvailableSpaceRatio) const {
ASSERT(serverMetricsPresent());
auto& metrics = getServerMetrics();
ASSERT(metrics.available.bytes >= 0);
ASSERT(metrics.capacity.bytes >= 0);
double availableSpaceRatio;
if (metrics.capacity.bytes == 0) {
availableSpaceRatio = 0;
} else {
availableSpaceRatio = (((double)metrics.available.bytes) / metrics.capacity.bytes);
}
return availableSpaceRatio >= minAvailableSpaceRatio;
}
Future<Void> TCServerInfo::updateServerMetrics() {
return TCServerInfoImpl::updateServerMetrics(this);
}
@ -396,8 +413,23 @@ double TCTeamInfo::getMinAvailableSpaceRatio(bool includeInFlight) const {
return minRatio;
}
bool TCTeamInfo::allServersHaveHealthyAvailableSpace() const {
bool result = true;
double minAvailableSpaceRatio =
SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO + SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER;
for (const auto& server : servers) {
if (!server->serverMetricsPresent() || !server->hasHealthyAvailableSpace(minAvailableSpaceRatio)) {
result = false;
break;
}
}
return result;
}
bool TCTeamInfo::hasHealthyAvailableSpace(double minRatio) const {
return getMinAvailableSpaceRatio() >= minRatio && getMinAvailableSpace() > SERVER_KNOBS->MIN_AVAILABLE_SPACE;
return getMinAvailableSpaceRatio() >= minRatio && getMinAvailableSpace() > SERVER_KNOBS->MIN_AVAILABLE_SPACE &&
allServersHaveHealthyAvailableSpace();
}
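// Informal summary of the checks above: a server is space-healthy when available/capacity is at least
// minAvailableSpaceRatio (zero capacity counts as a ratio of 0), and a team is space-healthy only when every
// member has metrics present and passes that per-server check with
// minAvailableSpaceRatio = MIN_AVAILABLE_SPACE_RATIO + MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER.
// For example, with a combined ratio of 0.10 (illustrative, not the actual knob defaults), a 1 TB server must
// report at least 100 GB available to count as healthy.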
bool TCTeamInfo::isOptimal() const {

View File

@ -93,6 +93,8 @@ public:
return (storeType == configStoreType || storeType == KeyValueStoreType::END);
}
bool hasHealthyAvailableSpace(double minAvailableSpaceRatio) const;
Future<Void> updateServerMetrics();
static Future<Void> updateServerMetrics(Reference<TCServerInfo> server);
Future<Void> serverMetricsPolling();
@ -220,4 +222,6 @@ private:
// Calculate an "average" of the metrics replies that we received. Penalize teams from which we did not receive all
// replies.
int64_t getLoadAverage() const;
bool allServersHaveHealthyAvailableSpace() const;
};

View File

@ -1740,140 +1740,168 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
return Void();
}
state Version endVersion = logData->version.get() + 1;
state bool onlySpilled = false;
state Version endVersion;
state bool onlySpilled;
// grab messages from disk
//TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
if (reqBegin <= logData->persistentDataDurableVersion) {
// Just in case the durable version changes while we are waiting for the read, we grab this data from memory. We
// may or may not actually send it depending on whether we get enough data from disk. SOMEDAY: Only do this if
// an initial attempt to read from disk results in insufficient data and the required data is no longer in
// memory SOMEDAY: Should we only send part of the messages we collected, to actually limit the size of the
// result?
// Run the peek logic in a loop to account for the case where there is no data to return to the caller, and we may
// want to wait a little bit instead of just sending back an empty message. This feature is controlled by a knob.
loop {
endVersion = logData->version.get() + 1;
onlySpilled = false;
if (reqOnlySpilled) {
endVersion = logData->persistentDataDurableVersion + 1;
} else {
peekMessagesFromMemory(logData, reqTag, reqBegin, messages2, endVersion);
}
// grab messages from disk
//TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2);
if (reqBegin <= logData->persistentDataDurableVersion) {
// Just in case the durable version changes while we are waiting for the read, we grab this data from
// memory. We may or may not actually send it depending on whether we get enough data from disk. SOMEDAY:
// Only do this if an initial attempt to read from disk results in insufficient data and the required data
// is no longer in memory SOMEDAY: Should we only send part of the messages we collected, to actually limit
// the size of the result?
if (logData->shouldSpillByValue(reqTag)) {
RangeResult kvs = wait(self->persistentData->readRange(
KeyRangeRef(persistTagMessagesKey(logData->logId, reqTag, reqBegin),
persistTagMessagesKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)),
SERVER_KNOBS->DESIRED_TOTAL_BYTES,
SERVER_KNOBS->DESIRED_TOTAL_BYTES));
for (auto& kv : kvs) {
auto ver = decodeTagMessagesKey(kv.key);
messages << VERSION_HEADER << ver;
messages.serializeBytes(kv.value);
}
if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1;
onlySpilled = true;
if (reqOnlySpilled) {
endVersion = logData->persistentDataDurableVersion + 1;
} else {
messages.serializeBytes(messages2.toValue());
peekMessagesFromMemory(logData, reqTag, reqBegin, messages2, endVersion);
}
} else {
// FIXME: Limit to approximately DESIRED_TOTAL_BYTES somehow.
RangeResult kvrefs = wait(self->persistentData->readRange(
KeyRangeRef(
persistTagMessageRefsKey(logData->logId, reqTag, reqBegin),
persistTagMessageRefsKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)),
SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1));
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence());
if (logData->shouldSpillByValue(reqTag)) {
RangeResult kvs = wait(self->persistentData->readRange(
KeyRangeRef(
persistTagMessagesKey(logData->logId, reqTag, reqBegin),
persistTagMessagesKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)),
SERVER_KNOBS->DESIRED_TOTAL_BYTES,
SERVER_KNOBS->DESIRED_TOTAL_BYTES));
state std::vector<std::pair<IDiskQueue::location, IDiskQueue::location>> commitLocations;
state bool earlyEnd = false;
uint32_t mutationBytes = 0;
state uint64_t commitBytes = 0;
state Version firstVersion = std::numeric_limits<Version>::max();
for (int i = 0; i < kvrefs.size() && i < SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK; i++) {
auto& kv = kvrefs[i];
VectorRef<SpilledData> spilledData;
BinaryReader r(kv.value, AssumeVersion(logData->protocolVersion));
r >> spilledData;
for (const SpilledData& sd : spilledData) {
if (mutationBytes >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
earlyEnd = true;
for (auto& kv : kvs) {
auto ver = decodeTagMessagesKey(kv.key);
messages << VERSION_HEADER << ver;
messages.serializeBytes(kv.value);
}
if (kvs.expectedSize() >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
endVersion = decodeTagMessagesKey(kvs.end()[-1].key) + 1;
onlySpilled = true;
} else {
messages.serializeBytes(messages2.toValue());
}
} else {
// FIXME: Limit to approximately DESIRED_TOTAL_BYTES somehow.
RangeResult kvrefs = wait(self->persistentData->readRange(
KeyRangeRef(
persistTagMessageRefsKey(logData->logId, reqTag, reqBegin),
persistTagMessageRefsKey(logData->logId, reqTag, logData->persistentDataDurableVersion + 1)),
SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1));
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("Tag1Results", s1).detail("Tag2Results", s2).detail("Tag1ResultsLim", kv1.size()).detail("Tag2ResultsLim", kv2.size()).detail("Tag1ResultsLast", kv1.size() ? kv1[0].key : "").detail("Tag2ResultsLast", kv2.size() ? kv2[0].key : "").detail("Limited", limited).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowEpoch", self->epoch()).detail("NowSeq", self->sequence.getNextSequence());
state std::vector<std::pair<IDiskQueue::location, IDiskQueue::location>> commitLocations;
state bool earlyEnd = false;
uint32_t mutationBytes = 0;
state uint64_t commitBytes = 0;
state Version firstVersion = std::numeric_limits<Version>::max();
for (int i = 0; i < kvrefs.size() && i < SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK; i++) {
auto& kv = kvrefs[i];
VectorRef<SpilledData> spilledData;
BinaryReader r(kv.value, AssumeVersion(logData->protocolVersion));
r >> spilledData;
for (const SpilledData& sd : spilledData) {
if (mutationBytes >= SERVER_KNOBS->DESIRED_TOTAL_BYTES) {
earlyEnd = true;
break;
}
if (sd.version >= reqBegin) {
firstVersion = std::min(firstVersion, sd.version);
const IDiskQueue::location end = sd.start.lo + sd.length;
commitLocations.emplace_back(sd.start, end);
// This isn't perfect, because we aren't accounting for page boundaries, but should be
// close enough.
commitBytes += sd.length;
mutationBytes += sd.mutationBytes;
}
}
if (earlyEnd)
break;
}
if (sd.version >= reqBegin) {
firstVersion = std::min(firstVersion, sd.version);
const IDiskQueue::location end = sd.start.lo + sd.length;
commitLocations.emplace_back(sd.start, end);
// This isn't perfect, because we aren't accounting for page boundaries, but should be
// close enough.
commitBytes += sd.length;
mutationBytes += sd.mutationBytes;
}
}
if (earlyEnd)
break;
}
earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1);
wait(self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes));
state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes);
state std::vector<Future<Standalone<StringRef>>> messageReads;
messageReads.reserve(commitLocations.size());
for (const auto& pair : commitLocations) {
messageReads.push_back(self->rawPersistentQueue->read(pair.first, pair.second, CheckHashes::True));
}
commitLocations.clear();
wait(waitForAll(messageReads));
earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK + 1);
wait(self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes));
state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes);
state std::vector<Future<Standalone<StringRef>>> messageReads;
messageReads.reserve(commitLocations.size());
for (const auto& pair : commitLocations) {
messageReads.push_back(self->rawPersistentQueue->read(pair.first, pair.second, CheckHashes::True));
}
commitLocations.clear();
wait(waitForAll(messageReads));
state Version lastRefMessageVersion = 0;
state int index = 0;
loop {
if (index >= messageReads.size())
break;
Standalone<StringRef> queueEntryData = messageReads[index].get();
uint8_t valid;
const uint32_t length = *(uint32_t*)queueEntryData.begin();
queueEntryData = queueEntryData.substr(4, queueEntryData.size() - 4);
BinaryReader rd(queueEntryData, IncludeVersion());
state TLogQueueEntry entry;
rd >> entry >> valid;
ASSERT(valid == 0x01);
ASSERT(length + sizeof(valid) == queueEntryData.size());
state Version lastRefMessageVersion = 0;
state int index = 0;
loop {
if (index >= messageReads.size())
break;
Standalone<StringRef> queueEntryData = messageReads[index].get();
uint8_t valid;
const uint32_t length = *(uint32_t*)queueEntryData.begin();
queueEntryData = queueEntryData.substr(4, queueEntryData.size() - 4);
BinaryReader rd(queueEntryData, IncludeVersion());
state TLogQueueEntry entry;
rd >> entry >> valid;
ASSERT(valid == 0x01);
ASSERT(length + sizeof(valid) == queueEntryData.size());
messages << VERSION_HEADER << entry.version;
messages << VERSION_HEADER << entry.version;
std::vector<StringRef> rawMessages =
wait(parseMessagesForTag(entry.messages, reqTag, logData->logRouterTags));
for (const StringRef& msg : rawMessages) {
messages.serializeBytes(msg);
DEBUG_TAGS_AND_MESSAGE("TLogPeekFromDisk", entry.version, msg, logData->logId)
.detail("DebugID", self->dbgid)
.detail("PeekTag", reqTag);
std::vector<StringRef> rawMessages =
wait(parseMessagesForTag(entry.messages, reqTag, logData->logRouterTags));
for (const StringRef& msg : rawMessages) {
messages.serializeBytes(msg);
DEBUG_TAGS_AND_MESSAGE("TLogPeekFromDisk", entry.version, msg, logData->logId)
.detail("DebugID", self->dbgid)
.detail("PeekTag", reqTag);
}
lastRefMessageVersion = entry.version;
index++;
}
lastRefMessageVersion = entry.version;
index++;
}
messageReads.clear();
memoryReservation.release();
messageReads.clear();
memoryReservation.release();
if (earlyEnd) {
endVersion = lastRefMessageVersion + 1;
onlySpilled = true;
} else {
messages.serializeBytes(messages2.toValue());
if (earlyEnd) {
endVersion = lastRefMessageVersion + 1;
onlySpilled = true;
} else {
messages.serializeBytes(messages2.toValue());
}
}
}
} else {
if (reqOnlySpilled) {
endVersion = logData->persistentDataDurableVersion + 1;
} else {
peekMessagesFromMemory(logData, reqTag, reqBegin, messages, endVersion);
if (reqOnlySpilled) {
endVersion = logData->persistentDataDurableVersion + 1;
} else {
peekMessagesFromMemory(logData, reqTag, reqBegin, messages, endVersion);
}
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence());
}
//TraceEvent("TLogPeekResults", self->dbgid).detail("ForAddress", replyPromise.getEndpoint().getPrimaryAddress()).detail("MessageBytes", messages.getLength()).detail("NextEpoch", next_pos.epoch).detail("NextSeq", next_pos.sequence).detail("NowSeq", self->sequence.getNextSequence());
// Reply to the peek request when:
// - there is data to return to the caller, or
// - batching of empty peeks is disabled, or
// - the empty-peek batching interval has been reached.
if (messages.getLength() > 0 || !SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG ||
(now() - blockStart > SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG_INTERVAL)) {
break;
}
state Version waitUntilVersion = logData->version.get() + 1;
// Currently, everything from `reqBegin` to logData->version is an empty peek. Wait for more versions, or until
// the empty-peek batching interval has expired.
wait(logData->version.whenAtLeast(waitUntilVersion) ||
delay(SERVER_KNOBS->PEEK_BATCHING_EMPTY_MSG_INTERVAL - (now() - blockStart)));
if (logData->version.get() < waitUntilVersion) {
break; // We know that everything from `reqBegin` to logData->version is empty. Skip re-executing the
// peek logic.
}
}
TLogPeekReply reply;
@ -1969,8 +1997,8 @@ ACTOR Future<Void> tLogPeekStream(TLogData* self, TLogPeekStreamRequest req, Ref
} catch (Error& e) {
self->activePeekStreams--;
TraceEvent(SevDebug, "TLogPeekStreamEnd", logData->logId)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress())
.error(e, true);
.errorUnsuppressed(e)
.detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress());
if (e.code() == error_code_end_of_stream || e.code() == error_code_operation_obsolete) {
req.reply.sendError(e);
@ -2441,7 +2469,7 @@ ACTOR Future<Void> tLogSnapCreate(TLogSnapRequest snapReq, TLogData* self, Refer
}
snapReq.reply.send(Void());
} catch (Error& e) {
TraceEvent("TLogExecHelperError").error(e, true /*includeCancelled */);
TraceEvent("TLogExecHelperError").errorUnsuppressed(e);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
} else {
@ -3158,7 +3186,7 @@ bool tlogTerminated(TLogData* self, IKeyValueStore* persistentData, TLogQueue* p
if (e.code() == error_code_worker_removed || e.code() == error_code_recruitment_failed ||
e.code() == error_code_file_not_found || e.code() == error_code_invalid_cluster_id) {
TraceEvent("TLogTerminated", self->dbgid).error(e, true);
TraceEvent("TLogTerminated", self->dbgid).errorUnsuppressed(e);
return true;
} else
return false;
@ -3509,7 +3537,7 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
}
} catch (Error& e) {
self.terminated.send(Void());
TraceEvent("TLogError", tlogId).error(e, true);
TraceEvent("TLogError", tlogId).errorUnsuppressed(e);
if (recovered.canBeSet())
recovered.send(Void());
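As a rough aside, the reply-or-wait decision that the new empty-peek batching loop above makes on each pass can be sketched in isolation. The knob names mirror the ones referenced above, but the constant values and the free-standing function are illustrative assumptions, not part of this change.
#include <cstddef>
// Illustrative stand-ins for the knobs referenced in tLogPeekMessages.
constexpr bool PEEK_BATCHING_EMPTY_MSG = true;           // assumed enabled for the sketch
constexpr double PEEK_BATCHING_EMPTY_MSG_INTERVAL = 0.1; // seconds, illustrative value
// Returns true when the peek reply should be sent immediately; otherwise the real actor waits for either a
// newer version or the remainder of the batching interval and then re-runs the peek logic.
bool shouldReplyNow(size_t messageBytes, double now, double blockStart) {
    return messageBytes > 0                                          // data to return to the caller
           || !PEEK_BATCHING_EMPTY_MSG                               // batching of empty peeks is disabled
           || (now - blockStart) > PEEK_BATCHING_EMPTY_MSG_INTERVAL; // batching window has elapsed
}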

View File

@ -0,0 +1,598 @@
/*
* TagThrottler.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/TagThrottler.h"
class RkTagThrottleCollection : NonCopyable {
struct RkTagData {
Smoother requestRate;
RkTagData() : requestRate(CLIENT_KNOBS->TAG_THROTTLE_SMOOTHING_WINDOW) {}
};
struct RkTagThrottleData {
ClientTagThrottleLimits limits;
Smoother clientRate;
// Only used by auto-throttles
double created = now();
double lastUpdated = 0;
double lastReduced = now();
bool rateSet = false;
RkTagThrottleData() : clientRate(CLIENT_KNOBS->TAG_THROTTLE_SMOOTHING_WINDOW) {}
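// Intuition (informal): requestRate is the observed transaction rate for this tag, and clientRate tracks the
// limit most recently advertised to clients. Scaling limits.tpsRate by clientRate / requestRate lowers the
// advertised limit whenever the observed rate overshoots the target, nudging requestRate toward
// limits.tpsRate over time; the min() keeps the result from exceeding the configured tpsRate.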
double getTargetRate(Optional<double> requestRate) {
if (limits.tpsRate == 0.0 || !requestRate.present() || requestRate.get() == 0.0 || !rateSet) {
return limits.tpsRate;
} else {
return std::min(limits.tpsRate, (limits.tpsRate / requestRate.get()) * clientRate.smoothTotal());
}
}
Optional<double> updateAndGetClientRate(Optional<double> requestRate) {
if (limits.expiration > now()) {
double targetRate = getTargetRate(requestRate);
if (targetRate == std::numeric_limits<double>::max()) {
rateSet = false;
return targetRate;
}
if (!rateSet) {
rateSet = true;
clientRate.reset(targetRate);
} else {
clientRate.setTotal(targetRate);
}
double rate = clientRate.smoothTotal();
ASSERT(rate >= 0);
return rate;
} else {
TEST(true); // Get throttle rate for expired throttle
rateSet = false;
return Optional<double>();
}
}
};
void initializeTag(TransactionTag const& tag) { tagData.try_emplace(tag); }
public:
RkTagThrottleCollection() {}
RkTagThrottleCollection(RkTagThrottleCollection&& other) {
autoThrottledTags = std::move(other.autoThrottledTags);
manualThrottledTags = std::move(other.manualThrottledTags);
tagData = std::move(other.tagData);
}
void operator=(RkTagThrottleCollection&& other) {
autoThrottledTags = std::move(other.autoThrottledTags);
manualThrottledTags = std::move(other.manualThrottledTags);
tagData = std::move(other.tagData);
}
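// Derivation sketch (informal): treat currentBusyness b as the fraction of the storage server's load
// attributable to this tag at the current request rate r, and assume the non-tag load stays fixed. The
// tag-to-other ratio is b / (1 - b) at rate r, and we want it to become b* / (1 - b*) at the new rate r',
// where b* is targetBusyness. Solving r' / r = [b* / (1 - b*)] / [b / (1 - b)] gives
//     r' = r * b* * (1 - b) / ((1 - b*) * b),
// which is requestRate * targetFraction below. A targetBusyness of 1 (or more) means "no limit", hence the
// max() sentinel returned in that branch.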
double computeTargetTpsRate(double currentBusyness, double targetBusyness, double requestRate) {
ASSERT(currentBusyness > 0);
if (targetBusyness < 1) {
double targetFraction = targetBusyness * (1 - currentBusyness) / ((1 - targetBusyness) * currentBusyness);
return requestRate * targetFraction;
} else {
return std::numeric_limits<double>::max();
}
}
// Returns the TPS rate if the throttle is updated, otherwise returns an empty optional
Optional<double> autoThrottleTag(UID id,
TransactionTag const& tag,
double fractionalBusyness,
Optional<double> tpsRate = Optional<double>(),
Optional<double> expiration = Optional<double>()) {
ASSERT(!tpsRate.present() || tpsRate.get() >= 0);
ASSERT(!expiration.present() || expiration.get() > now());
auto itr = autoThrottledTags.find(tag);
bool present = (itr != autoThrottledTags.end());
if (!present) {
if (autoThrottledTags.size() >= SERVER_KNOBS->MAX_AUTO_THROTTLED_TRANSACTION_TAGS) {
TEST(true); // Reached auto-throttle limit
return Optional<double>();
}
itr = autoThrottledTags.try_emplace(tag).first;
initializeTag(tag);
} else if (itr->second.limits.expiration <= now()) {
TEST(true); // Re-throttling expired tag that hasn't been cleaned up
present = false;
itr->second = RkTagThrottleData();
}
auto& throttle = itr->second;
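// Informal summary of the rate selection below, used when no explicit tpsRate is supplied:
// - impose no limit during the first AUTO_TAG_THROTTLE_START_AGGREGATION_TIME after the throttle is created,
// - recompute at most once per AUTO_TAG_THROTTLE_UPDATE_FREQUENCY, and
// - never raise the rate of a still-active throttle, only lower it.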
if (!tpsRate.present()) {
if (now() <= throttle.created + SERVER_KNOBS->AUTO_TAG_THROTTLE_START_AGGREGATION_TIME) {
tpsRate = std::numeric_limits<double>::max();
if (present) {
return Optional<double>();
}
} else if (now() <= throttle.lastUpdated + SERVER_KNOBS->AUTO_TAG_THROTTLE_UPDATE_FREQUENCY) {
TEST(true); // Tag auto-throttled too quickly
return Optional<double>();
} else {
tpsRate = computeTargetTpsRate(fractionalBusyness,
SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS,
tagData[tag].requestRate.smoothRate());
if (throttle.limits.expiration > now() && tpsRate.get() >= throttle.limits.tpsRate) {
TEST(true); // Tag auto-throttle rate increase attempt while active
return Optional<double>();
}
throttle.lastUpdated = now();
if (tpsRate.get() < throttle.limits.tpsRate) {
throttle.lastReduced = now();
}
}
}
if (!expiration.present()) {
expiration = now() + SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION;
}
ASSERT(tpsRate.present() && tpsRate.get() >= 0);
throttle.limits.tpsRate = tpsRate.get();
throttle.limits.expiration = expiration.get();
Optional<double> clientRate = throttle.updateAndGetClientRate(getRequestRate(tag));
TraceEvent("RkSetAutoThrottle", id)
.detail("Tag", tag)
.detail("TargetRate", tpsRate.get())
.detail("Expiration", expiration.get() - now())
.detail("ClientRate", clientRate)
.detail("Created", now() - throttle.created)
.detail("LastUpdate", now() - throttle.lastUpdated)
.detail("LastReduced", now() - throttle.lastReduced);
if (tpsRate.get() != std::numeric_limits<double>::max()) {
return tpsRate.get();
} else {
return Optional<double>();
}
}
void manualThrottleTag(UID id,
TransactionTag const& tag,
TransactionPriority priority,
double tpsRate,
double expiration,
Optional<ClientTagThrottleLimits> const& oldLimits) {
ASSERT(tpsRate >= 0);
ASSERT(expiration > now());
auto& priorityThrottleMap = manualThrottledTags[tag];
auto result = priorityThrottleMap.try_emplace(priority);
initializeTag(tag);
ASSERT(result.second); // Updates to the map are made by copying the whole map
result.first->second.limits.tpsRate = tpsRate;
result.first->second.limits.expiration = expiration;
if (!oldLimits.present()) {
TEST(true); // Transaction tag manually throttled
TraceEvent("RatekeeperAddingManualThrottle", id)
.detail("Tag", tag)
.detail("Rate", tpsRate)
.detail("Priority", transactionPriorityToString(priority))
.detail("SecondsToExpiration", expiration - now());
} else if (oldLimits.get().tpsRate != tpsRate || oldLimits.get().expiration != expiration) {
TEST(true); // Manual transaction tag throttle updated
TraceEvent("RatekeeperUpdatingManualThrottle", id)
.detail("Tag", tag)
.detail("Rate", tpsRate)
.detail("Priority", transactionPriorityToString(priority))
.detail("SecondsToExpiration", expiration - now());
}
Optional<double> clientRate = result.first->second.updateAndGetClientRate(getRequestRate(tag));
ASSERT(clientRate.present());
}
Optional<ClientTagThrottleLimits> getManualTagThrottleLimits(TransactionTag const& tag,
TransactionPriority priority) {
auto itr = manualThrottledTags.find(tag);
if (itr != manualThrottledTags.end()) {
auto priorityItr = itr->second.find(priority);
if (priorityItr != itr->second.end()) {
return priorityItr->second.limits;
}
}
return Optional<ClientTagThrottleLimits>();
}
PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates(bool autoThrottlingEnabled) {
PrioritizedTransactionTagMap<ClientTagThrottleLimits> clientRates;
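// Informal summary of the loop below: for each tracked tag, apply any unexpired manual limits per priority
// (the running minimum is carried from higher priorities down to lower ones); then, if auto-throttling is
// enabled, apply the auto limit at DEFAULT priority, keeping whichever rate is lower, and report a zero rate
// at BATCH priority. Tags left with no live throttles are dropped from tagData.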
for (auto tagItr = tagData.begin(); tagItr != tagData.end();) {
bool tagPresent = false;
double requestRate = tagItr->second.requestRate.smoothRate();
auto manualItr = manualThrottledTags.find(tagItr->first);
if (manualItr != manualThrottledTags.end()) {
Optional<ClientTagThrottleLimits> manualClientRate;
for (auto priority = allTransactionPriorities.rbegin(); !(priority == allTransactionPriorities.rend());
++priority) {
auto priorityItr = manualItr->second.find(*priority);
if (priorityItr != manualItr->second.end()) {
Optional<double> priorityClientRate = priorityItr->second.updateAndGetClientRate(requestRate);
if (!priorityClientRate.present()) {
TEST(true); // Manual priority throttle expired
priorityItr = manualItr->second.erase(priorityItr);
} else {
if (!manualClientRate.present() ||
manualClientRate.get().tpsRate > priorityClientRate.get()) {
manualClientRate = ClientTagThrottleLimits(priorityClientRate.get(),
priorityItr->second.limits.expiration);
} else {
TEST(true); // Manual throttle overridden by higher priority
}
++priorityItr;
}
}
if (manualClientRate.present()) {
tagPresent = true;
TEST(true); // Using manual throttle
clientRates[*priority][tagItr->first] = manualClientRate.get();
}
}
if (manualItr->second.empty()) {
TEST(true); // All manual throttles expired
manualThrottledTags.erase(manualItr);
break;
}
}
auto autoItr = autoThrottledTags.find(tagItr->first);
if (autoItr != autoThrottledTags.end()) {
Optional<double> autoClientRate = autoItr->second.updateAndGetClientRate(requestRate);
if (autoClientRate.present()) {
double adjustedRate = autoClientRate.get();
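// Ramp-up (informal): starting AUTO_TAG_THROTTLE_RAMP_UP_TIME before the throttle would expire (measured
// from the last rate reduction), the effective target busyness is raised from
// AUTO_THROTTLE_TARGET_TAG_BUSYNESS toward 1 via pow(targetBusyness, 1 - rampLocation), so the allowed rate
// relaxes gradually toward "unlimited" instead of jumping when the throttle expires.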
double rampStartTime = autoItr->second.lastReduced + SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION -
SERVER_KNOBS->AUTO_TAG_THROTTLE_RAMP_UP_TIME;
if (now() >= rampStartTime && adjustedRate != std::numeric_limits<double>::max()) {
TEST(true); // Tag auto-throttle ramping up
double targetBusyness = SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS;
if (targetBusyness == 0) {
targetBusyness = 0.01;
}
double rampLocation = (now() - rampStartTime) / SERVER_KNOBS->AUTO_TAG_THROTTLE_RAMP_UP_TIME;
adjustedRate =
computeTargetTpsRate(targetBusyness, pow(targetBusyness, 1 - rampLocation), adjustedRate);
}
tagPresent = true;
if (autoThrottlingEnabled) {
auto result = clientRates[TransactionPriority::DEFAULT].try_emplace(
tagItr->first, adjustedRate, autoItr->second.limits.expiration);
if (!result.second && result.first->second.tpsRate > adjustedRate) {
result.first->second =
ClientTagThrottleLimits(adjustedRate, autoItr->second.limits.expiration);
} else {
TEST(true); // Auto throttle overridden by manual throttle
}
clientRates[TransactionPriority::BATCH][tagItr->first] =
ClientTagThrottleLimits(0, autoItr->second.limits.expiration);
}
} else {
ASSERT(autoItr->second.limits.expiration <= now());
TEST(true); // Auto throttle expired
if (BUGGIFY) { // Temporarily extend the window between expiration and cleanup
tagPresent = true;
} else {
autoThrottledTags.erase(autoItr);
}
}
}
if (!tagPresent) {
TEST(true); // All tag throttles expired
tagItr = tagData.erase(tagItr);
} else {
++tagItr;
}
}
return clientRates;
}
void addRequests(TransactionTag const& tag, int requests) {
if (requests > 0) {
TEST(true); // Requests reported for throttled tag
auto tagItr = tagData.try_emplace(tag);
tagItr.first->second.requestRate.addDelta(requests);
double requestRate = tagItr.first->second.requestRate.smoothRate();
auto autoItr = autoThrottledTags.find(tag);
if (autoItr != autoThrottledTags.end()) {
autoItr->second.updateAndGetClientRate(requestRate);
}
auto manualItr = manualThrottledTags.find(tag);
if (manualItr != manualThrottledTags.end()) {
for (auto priorityItr = manualItr->second.begin(); priorityItr != manualItr->second.end();
++priorityItr) {
priorityItr->second.updateAndGetClientRate(requestRate);
}
}
}
}
Optional<double> getRequestRate(TransactionTag const& tag) {
auto itr = tagData.find(tag);
if (itr != tagData.end()) {
return itr->second.requestRate.smoothRate();
}
return Optional<double>();
}
int64_t autoThrottleCount() const { return autoThrottledTags.size(); }
int64_t manualThrottleCount() const {
int64_t count = 0;
for (auto itr = manualThrottledTags.begin(); itr != manualThrottledTags.end(); ++itr) {
count += itr->second.size();
}
return count;
}
TransactionTagMap<RkTagThrottleData> autoThrottledTags;
TransactionTagMap<std::map<TransactionPriority, RkTagThrottleData>> manualThrottledTags;
TransactionTagMap<RkTagData> tagData;
uint32_t busyReadTagCount = 0, busyWriteTagCount = 0;
};
class TagThrottlerImpl {
Database db;
UID id;
RkTagThrottleCollection throttledTags;
uint64_t throttledTagChangeId{ 0 };
bool autoThrottlingEnabled{ false };
ACTOR static Future<Void> monitorThrottlingChanges(TagThrottlerImpl* self) {
state bool committed = false;
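// Informal note: the first successful commit publishes the manual-throttle limit key and, if unset, the
// auto-throttling default; once `committed` is true, later iterations only re-read the throttle keys
// (normalizing expirations where needed) and block on the tagThrottleSignalKey watch.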
loop {
state ReadYourWritesTransaction tr(self->db);
loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Future<RangeResult> throttledTagKeys = tr.getRange(tagThrottleKeys, CLIENT_KNOBS->TOO_MANY);
state Future<Optional<Value>> autoThrottlingEnabled = tr.get(tagThrottleAutoEnabledKey);
if (!committed) {
BinaryWriter limitWriter(Unversioned());
limitWriter << SERVER_KNOBS->MAX_MANUAL_THROTTLED_TRANSACTION_TAGS;
tr.set(tagThrottleLimitKey, limitWriter.toValue());
}
wait(success(throttledTagKeys) && success(autoThrottlingEnabled));
if (autoThrottlingEnabled.get().present() &&
autoThrottlingEnabled.get().get() == LiteralStringRef("0")) {
TEST(true); // Auto-throttling disabled
if (self->autoThrottlingEnabled) {
TraceEvent("AutoTagThrottlingDisabled", self->id).log();
}
self->autoThrottlingEnabled = false;
} else if (autoThrottlingEnabled.get().present() &&
autoThrottlingEnabled.get().get() == LiteralStringRef("1")) {
TEST(true); // Auto-throttling enabled
if (!self->autoThrottlingEnabled) {
TraceEvent("AutoTagThrottlingEnabled", self->id).log();
}
self->autoThrottlingEnabled = true;
} else {
TEST(true); // Auto-throttling unspecified
if (autoThrottlingEnabled.get().present()) {
TraceEvent(SevWarnAlways, "InvalidAutoTagThrottlingValue", self->id)
.detail("Value", autoThrottlingEnabled.get().get());
}
self->autoThrottlingEnabled = SERVER_KNOBS->AUTO_TAG_THROTTLING_ENABLED;
if (!committed)
tr.set(tagThrottleAutoEnabledKey,
LiteralStringRef(self->autoThrottlingEnabled ? "1" : "0"));
}
RkTagThrottleCollection updatedTagThrottles;
TraceEvent("RatekeeperReadThrottledTags", self->id)
.detail("NumThrottledTags", throttledTagKeys.get().size());
for (auto entry : throttledTagKeys.get()) {
TagThrottleKey tagKey = TagThrottleKey::fromKey(entry.key);
TagThrottleValue tagValue = TagThrottleValue::fromValue(entry.value);
ASSERT(tagKey.tags.size() == 1); // Currently, only 1 tag per throttle is supported
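// Informal reading of the check below: an expiration of 0, or one further out than now() + initialDuration,
// indicates the stored value still encodes a relative duration (or was written against a different clock),
// so it is rewritten here with an absolute expiration time.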
if (tagValue.expirationTime == 0 ||
tagValue.expirationTime > now() + tagValue.initialDuration) {
TEST(true); // Converting tag throttle duration to absolute time
tagValue.expirationTime = now() + tagValue.initialDuration;
BinaryWriter wr(IncludeVersion(ProtocolVersion::withTagThrottleValueReason()));
wr << tagValue;
state Value value = wr.toValue();
tr.set(entry.key, value);
}
if (tagValue.expirationTime > now()) {
TransactionTag tag = *tagKey.tags.begin();
Optional<ClientTagThrottleLimits> oldLimits =
self->throttledTags.getManualTagThrottleLimits(tag, tagKey.priority);
if (tagKey.throttleType == TagThrottleType::AUTO) {
updatedTagThrottles.autoThrottleTag(
self->id, tag, 0, tagValue.tpsRate, tagValue.expirationTime);
if (tagValue.reason == TagThrottledReason::BUSY_READ) {
updatedTagThrottles.busyReadTagCount++;
} else if (tagValue.reason == TagThrottledReason::BUSY_WRITE) {
updatedTagThrottles.busyWriteTagCount++;
}
} else {
updatedTagThrottles.manualThrottleTag(self->id,
tag,
tagKey.priority,
tagValue.tpsRate,
tagValue.expirationTime,
oldLimits);
}
}
}
self->throttledTags = std::move(updatedTagThrottles);
++self->throttledTagChangeId;
state Future<Void> watchFuture = tr.watch(tagThrottleSignalKey);
wait(tr.commit());
committed = true;
wait(watchFuture);
TraceEvent("RatekeeperThrottleSignaled", self->id).log();
TEST(true); // Tag throttle changes detected
break;
} catch (Error& e) {
TraceEvent("RatekeeperMonitorThrottlingChangesError", self->id).error(e);
wait(tr.onError(e));
}
}
}
}
Optional<double> autoThrottleTag(UID id, TransactionTag tag, double busyness) {
return throttledTags.autoThrottleTag(id, tag, busyness);
}
Future<Void> tryAutoThrottleTag(TransactionTag tag, double rate, double busyness, TagThrottledReason reason) {
// NOTE: before the comparison with MIN_TAG_COST, the busiest tag rate has also been compared against
// MIN_TAG_PAGES_RATE; currently MIN_TAG_PAGES_RATE > MIN_TAG_COST in our default knobs.
if (busyness > SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS && rate > SERVER_KNOBS->MIN_TAG_COST) {
TEST(true); // Transaction tag auto-throttled
Optional<double> clientRate = autoThrottleTag(id, tag, busyness);
if (clientRate.present()) {
TagSet tags;
tags.addTag(tag);
Reference<DatabaseContext> dbRef = Reference<DatabaseContext>::addRef(db.getPtr());
return ThrottleApi::throttleTags(dbRef,
tags,
clientRate.get(),
SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION,
TagThrottleType::AUTO,
TransactionPriority::DEFAULT,
now() + SERVER_KNOBS->AUTO_TAG_THROTTLE_DURATION,
reason);
}
}
return Void();
}
public:
TagThrottlerImpl(Database db, UID id) : db(db), id(id) {}
Future<Void> monitorThrottlingChanges() { return monitorThrottlingChanges(this); }
void addRequests(TransactionTag tag, int count) { throttledTags.addRequests(tag, count); }
uint64_t getThrottledTagChangeId() const { return throttledTagChangeId; }
PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates() {
return throttledTags.getClientRates(autoThrottlingEnabled);
}
int64_t autoThrottleCount() const { return throttledTags.autoThrottleCount(); }
uint32_t busyReadTagCount() const { return throttledTags.busyReadTagCount; }
uint32_t busyWriteTagCount() const { return throttledTags.busyWriteTagCount; }
int64_t manualThrottleCount() const { return throttledTags.manualThrottleCount(); }
bool isAutoThrottlingEnabled() const { return autoThrottlingEnabled; }
Future<Void> tryAutoThrottleTag(StorageQueueInfo& ss, int64_t storageQueue, int64_t storageDurabilityLag) {
// NOTE: we just keep it simple and don't differentiate write-saturation and read-saturation at the moment.
// In most situations this works. More indicators besides queue size and durability lag could be investigated
// in the future.
if (storageQueue > SERVER_KNOBS->AUTO_TAG_THROTTLE_STORAGE_QUEUE_BYTES ||
storageDurabilityLag > SERVER_KNOBS->AUTO_TAG_THROTTLE_DURABILITY_LAG_VERSIONS) {
if (ss.busiestWriteTag.present()) {
return tryAutoThrottleTag(ss.busiestWriteTag.get(),
ss.busiestWriteTagRate,
ss.busiestWriteTagFractionalBusyness,
TagThrottledReason::BUSY_WRITE);
}
if (ss.busiestReadTag.present()) {
return tryAutoThrottleTag(ss.busiestReadTag.get(),
ss.busiestReadTagRate,
ss.busiestReadTagFractionalBusyness,
TagThrottledReason::BUSY_READ);
}
}
return Void();
}
}; // class TagThrottlerImpl
TagThrottler::TagThrottler(Database db, UID id) : impl(PImpl<TagThrottlerImpl>::create(db, id)) {}
TagThrottler::~TagThrottler() = default;
Future<Void> TagThrottler::monitorThrottlingChanges() {
return impl->monitorThrottlingChanges();
}
void TagThrottler::addRequests(TransactionTag tag, int count) {
impl->addRequests(tag, count);
}
uint64_t TagThrottler::getThrottledTagChangeId() const {
return impl->getThrottledTagChangeId();
}
PrioritizedTransactionTagMap<ClientTagThrottleLimits> TagThrottler::getClientRates() {
return impl->getClientRates();
}
int64_t TagThrottler::autoThrottleCount() const {
return impl->autoThrottleCount();
}
uint32_t TagThrottler::busyReadTagCount() const {
return impl->busyReadTagCount();
}
uint32_t TagThrottler::busyWriteTagCount() const {
return impl->busyWriteTagCount();
}
int64_t TagThrottler::manualThrottleCount() const {
return impl->manualThrottleCount();
}
bool TagThrottler::isAutoThrottlingEnabled() const {
return impl->isAutoThrottlingEnabled();
}
Future<Void> TagThrottler::tryAutoThrottleTag(StorageQueueInfo& ss,
int64_t storageQueue,
int64_t storageDurabilityLag) {
return impl->tryAutoThrottleTag(ss, storageQueue, storageDurabilityLag);
}

fdbserver/TagThrottler.h
View File

@ -0,0 +1,42 @@
/*
* TagThrottler.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbclient/PImpl.h"
#include "fdbserver/Ratekeeper.h"
class TagThrottler {
PImpl<class TagThrottlerImpl> impl;
public:
TagThrottler(Database db, UID id);
~TagThrottler();
Future<Void> monitorThrottlingChanges();
void addRequests(TransactionTag tag, int count);
uint64_t getThrottledTagChangeId() const;
PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates();
int64_t autoThrottleCount() const;
uint32_t busyReadTagCount() const;
uint32_t busyWriteTagCount() const;
int64_t manualThrottleCount() const;
bool isAutoThrottlingEnabled() const;
Future<Void> tryAutoThrottleTag(StorageQueueInfo&, int64_t storageQueue, int64_t storageDurabilityLag);
};
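A hypothetical usage sketch of the interface above, roughly how a ratekeeper-style owner might drive it; the free-standing function and its wiring are illustrative, not taken from this change. The PImpl member keeps TagThrottlerImpl out of the header, so translation units that include it see only the declarations above.
#include "fdbserver/TagThrottler.h"
// Illustrative only: construct one throttler, keep its monitor running, feed it per-tag request counts, and
// periodically hand the computed limits back to clients.
void exampleTagThrottlerUsage(Database db, UID rkId, TransactionTag tag) {
    TagThrottler throttler(db, rkId);
    // Long-running: watches the tag-throttle system keys and refreshes the in-memory throttles.
    Future<Void> monitor = throttler.monitorThrottlingChanges();
    // Per-tag transaction counts (as reported to the ratekeeper) would be forwarded like this.
    throttler.addRequests(tag, 10);
    // Per-priority, per-tag limits to hand back in rate replies.
    PrioritizedTransactionTagMap<ClientTagThrottleLimits> rates = throttler.getClientRates();
    (void)rates;
    (void)monitor;
}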

Some files were not shown because too many files have changed in this diff.